From fe90500e441c91a76bfca98b48cce8e8af60df82 Mon Sep 17 00:00:00 2001 From: Leonid Nikitin Date: Sat, 20 Jun 2026 16:05:31 +0500 Subject: [PATCH] Refactor text splitting logic into dedicated commands and introduce `PartText` DTO for improved modularity and maintainability. --- src/DTO/PartText.php | 40 +++ src/DTO/ProcessTranslateDto.php | 4 +- src/Jobs/ProcessTranslate.php | 64 ++++- .../SplitTextIntoParts/SplitHTMLCommand.php | 270 ++++++++++++++++++ .../SplitTextIntoParts/SplitTextCommand.php | 50 ++++ src/Translate/SplitTextIntoPartsCommand.php | 118 ++------ 6 files changed, 441 insertions(+), 105 deletions(-) create mode 100644 src/DTO/PartText.php create mode 100644 src/Translate/SplitTextIntoParts/SplitHTMLCommand.php create mode 100644 src/Translate/SplitTextIntoParts/SplitTextCommand.php diff --git a/src/DTO/PartText.php b/src/DTO/PartText.php new file mode 100644 index 0000000..18f55fc --- /dev/null +++ b/src/DTO/PartText.php @@ -0,0 +1,40 @@ +texts[$this->part] = $text; + $this->beforeTexts[$this->part] = $beforeText; + $this->afterTexts[$this->part] = $afterText; + + $this->part++; + } + + public function getTextsForTranslation(): array + { + return $this->texts; + } + + public function getTextsAfterTranslation(array $texts): string + { + $result = ''; + foreach ($texts as $key => $text) { + if ($this->beforeTexts[$key] !== null) { + $result .= $this->beforeTexts[$key]; + } + $result .= $text; + if ($this->afterTexts[$key] !== null) { + $result .= $this->afterTexts[$key]; + } + } + return $result; + } +} \ No newline at end of file diff --git a/src/DTO/ProcessTranslateDto.php b/src/DTO/ProcessTranslateDto.php index 811b2da..6e9d0aa 100644 --- a/src/DTO/ProcessTranslateDto.php +++ b/src/DTO/ProcessTranslateDto.php @@ -10,7 +10,7 @@ final readonly class ProcessTranslateDto private string $groupName, private string $key, private int $part, - private string $text, + private PartText $text, private TextType $textType, private string $targetLanguageCode, private ?string $sourceLanguageCode = null, @@ -32,7 +32,7 @@ final readonly class ProcessTranslateDto return $this->part; } - public function getText(): string + public function getText(): PartText { return $this->text; } diff --git a/src/Jobs/ProcessTranslate.php b/src/Jobs/ProcessTranslate.php index d605868..0fd8473 100644 --- a/src/Jobs/ProcessTranslate.php +++ b/src/Jobs/ProcessTranslate.php @@ -59,28 +59,62 @@ final class ProcessTranslate implements ShouldQueue, ShouldBeEncrypted, ProcessT { $param = $this->param; $groupName = $param->getGroupName(); + $key = $param->getKey(); + $part = $param->getPart(); $translated = Cache::get($groupName, []); if (!isset($translated[$param->getKey()])) { $translated[$param->getKey()] = []; } - $translate = Translate::service($param->getDriver()); - $function = $param->getTextType()->functionName(); - $key = $param->getKey(); - $part = $param->getPart(); - - $translated[$key][$part] = $param->getText(); - if (\trim($param->getText()) !== '') { - - $translated[$key][$part] = $translate->{$function}( - $param->getText(), - $param->getTargetLanguageCode(), - $param->getSourceLanguageCode() - ); - - } + $translated[$key][$part] = $this->translate(); Cache::put($groupName, $translated, 86400); } + + private function translate(): string + { + $param = $this->param; + + $translate = Translate::service($param->getDriver()); + $function = $param->getTextType()->functionName(); + + $partText = $param->getText(); + $originalTexts = $partText->getTextsForTranslation(); + + $textsForTranslation = []; + $originalKeysByTranslationIndex = []; + + foreach ($originalTexts as $originalKey => $text) { + if (\trim($text) === '') { + continue; + } + + $translationIndex = count($textsForTranslation); + + $textsForTranslation[$translationIndex] = $text; + $originalKeysByTranslationIndex[$translationIndex] = $originalKey; + } + + if ($textsForTranslation === []) { + return $partText->getTextsAfterTranslation($originalTexts); + } + + $translatedTexts = $translate->{$function}( + $textsForTranslation, + $param->getTargetLanguageCode(), + $param->getSourceLanguageCode() + ); + + foreach ($translatedTexts as $translationIndex => $translatedText) { + if (!isset($originalKeysByTranslationIndex[$translationIndex])) { + continue; + } + + $originalKey = $originalKeysByTranslationIndex[$translationIndex]; + $originalTexts[$originalKey] = $translatedText; + } + + return $partText->getTextsAfterTranslation($originalTexts); + } } \ No newline at end of file diff --git a/src/Translate/SplitTextIntoParts/SplitHTMLCommand.php b/src/Translate/SplitTextIntoParts/SplitHTMLCommand.php new file mode 100644 index 0000000..2eb7841 --- /dev/null +++ b/src/Translate/SplitTextIntoParts/SplitHTMLCommand.php @@ -0,0 +1,270 @@ + + */ + public function execute(string $html): array + { + $this->currentPart = 0; + $this->currentLength = 0; + $this->currentText = ''; + $this->parts = [ + 0 => new PartText(), + ]; + + libxml_use_internal_errors(true); + + $dom = new DOMDocument(); + $dom->loadHTML( + mb_convert_encoding('' . $html . '', 'HTML-ENTITIES', 'UTF-8'), + LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD + ); + + $this->processNodes($dom, $dom->documentElement->childNodes); + if ($this->currentLength > 0) { + $this->addTextToPart(); + $this->currentText = ''; + $this->currentBeforeText = ''; + $this->currentAfterText = ''; + $this->currentLength = 0; + $this->currentPart++; + } + + libxml_clear_errors(); + + return $this->parts; + } + + private function processNodes(DOMDocument $dom, DOMNodeList $nodes): void + { + /** @var DOMNode $node */ + foreach ($nodes as $node) { + if ($node->nodeType === XML_TEXT_NODE || $node->nodeType === XML_CDATA_SECTION_NODE) { + $this->processText($node->textContent); + continue; + } + + if ($node->nodeType === XML_ELEMENT_NODE) { + $this->processHtml($dom, $node); + continue; + } + } + } + + private function processText(string $text): void + { + $currentLength = mb_strlen($text); + + if ($this->currentLength + $currentLength <= $this->maxLength) { + $this->currentText .= $text; + $this->currentLength += $currentLength; + return; + } + + if ($this->currentLength > 0) { + $this->addTextToPart(); + $this->newPart(); + } + + if ($currentLength <= $this->maxLength) { + $this->currentText .= $text; + $this->currentLength += $currentLength; + return; + } + + $this->splitLongText($text); + } + + private function splitLongText(string $text): void + { + // Common expression for searching for sentences. + $sentenceEndings = '/(?<=[.?!])[ \t]+(?=[A-ZА-Я])/u'; + + //Dividing the text into sentences + $sentences = preg_split($sentenceEndings, $text, -1, PREG_SPLIT_NO_EMPTY); + + foreach ($sentences as $sentence) { + $currentLength = mb_strlen($sentence); + if ($this->currentLength > 0) { + $currentLength += 1; + } + + // If adding a sentence does not exceed the limit, add it to the current part. + if ($this->currentLength + $currentLength <= $this->maxLength) { + $this->currentText .= (empty($this->currentText) ? '' : ' ') . $sentence; + $this->currentLength += $currentLength; + continue; + } + + $this->addTextToPart(); + $this->newPart(); + + $this->currentText = $sentence; + $this->currentLength = $currentLength; + } + } + + private function processHtml(DOMDocument $dom, DOMNode $node): void + { + $html = $dom->saveHTML($node); + $currentLength = mb_strlen($html); + + if ($this->currentLength + $currentLength <= $this->maxLength) { + $this->currentText .= $html; + $this->currentLength += $currentLength; + return; + } + + if ($this->currentLength > 0) { + $this->addTextToPart(); + $this->newPart(); + } + + if ($currentLength <= $this->maxLength) { + $this->currentText .= $html; + $this->currentLength += $currentLength; + return; + } + + if ($node->hasChildNodes()) { + $tag = strtolower($node->nodeName); + if ($tag === 'table') { + $this->splitTable($dom, $node); + return; + } + + $this->currentBeforeText .= '<' . $tag . $this->attributesToString($node) . '>'; + + $this->processNodes($dom, $node->childNodes); + + $this->currentAfterText .= ''; + $this->addTextToPart(); + $this->newPart(); + return; + } + } + + private function splitTable(DOMDocument $dom, DOMNode $node): void + { + if ($this->currentLength > 0) { + $this->addTextToPart(); + $this->newPart(); + } + $this->currentBeforeText .= 'attributesToString($node) . '>'; + if ($node->hasChildNodes()) { + foreach ($node->childNodes as $childNode) { + if ($childNode->nodeType === XML_TEXT_NODE || $childNode->nodeType === XML_CDATA_SECTION_NODE) { + continue; + } + + if ($childNode->nodeType !== XML_ELEMENT_NODE) { + continue; + } + + $this->processTableSection($dom, $childNode); + } + } + + $this->currentAfterText .= ''; + $this->addTextToPart(); + $this->newPart(); + } + + private function processTableSection(DOMDocument $dom, DOMNode $node): void + { + $tag = strtolower($node->nodeName); + if ($tag === 'tr') { + $this->processTableTr($dom, $node); + } + + if (in_array($tag, ['thead', 'tbody', 'tfoot'], true)) { + $this->currentBeforeText .= '<' . $tag . $this->attributesToString($node) . '>'; + foreach ($node->childNodes as $childNode) { + $tagNode = strtolower($childNode->nodeName); + if ($tagNode === 'tr') { + $this->processTableTr($dom, $childNode); + } + } + $this->currentAfterText .= ''; + } + } + + private function processTableTr(DOMDocument $dom, DOMNode $node): void + { + $this->currentBeforeText .= 'attributesToString($node) . '>'; + foreach ($node->childNodes as $childNode) { + $tagNode = strtolower($childNode->nodeName); + if ($tagNode === 'td') { + $this->processTableTd($dom, $childNode); + } + } + $this->currentAfterText .= ''; + $this->addTextToPart(); + } + + private function processTableTd(DOMDocument $dom, DOMNode $node): void + { + $this->currentBeforeText .= 'attributesToString($node) . '>'; + $this->processNodes($dom, $node->childNodes); + $this->currentAfterText .= ''; + $this->addTextToPart(); + } + + private function attributesToString(DOMNode $node): string + { + if (!$node->hasAttributes()) { + return ''; + } + + $parts = []; + foreach ($node->attributes as $attr) { + $parts[] = sprintf( + ' %s="%s"', + $attr->nodeName, + htmlspecialchars($attr->nodeValue ?? '', ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') + ); + } + + return implode('', $parts); + } + + private function addTextToPart(): void + { + $this->parts[$this->currentPart]->add($this->currentText, $this->currentBeforeText, $this->currentAfterText); + $this->currentText = ''; + $this->currentBeforeText = ''; + $this->currentAfterText = ''; + } + + private function newPart(): void + { + $this->currentText = ''; + $this->currentBeforeText = ''; + $this->currentAfterText = ''; + $this->currentLength = 0; + $this->currentPart++; + $this->parts[$this->currentPart] = new PartText(); + } +} \ No newline at end of file diff --git a/src/Translate/SplitTextIntoParts/SplitTextCommand.php b/src/Translate/SplitTextIntoParts/SplitTextCommand.php new file mode 100644 index 0000000..290d6aa --- /dev/null +++ b/src/Translate/SplitTextIntoParts/SplitTextCommand.php @@ -0,0 +1,50 @@ + + */ + public function execute(string $text, int $maxLength): array + { + // Common expression for searching for sentences. + $sentenceEndings = '/(?<=[.?!])[ \t]+(?=[A-ZА-Я])/u'; + + //Dividing the text into sentences + $sentences = preg_split($sentenceEndings, $text, -1, PREG_SPLIT_NO_EMPTY); + + $parts = []; + $currentPart = ''; + + foreach ($sentences as $sentence) { + $part = new PartText(); + + // If adding a sentence does not exceed the limit, add it to the current part. + if (mb_strlen($currentPart . ' ' . $sentence) <= $maxLength) { + $currentPart .= (empty($currentPart) ? '' : ' ') . $sentence; + continue; + } + + // Otherwise, save the current part and start a new one. + if (!empty($currentPart)) { + $part->add($currentPart); + $parts[] = $part; + } + $currentPart = $sentence; + } + + if (!empty($currentPart)) { + $part = new PartText(); + $part->add($currentPart); + $parts[] = $part; + } + + return $parts; + } +} \ No newline at end of file diff --git a/src/Translate/SplitTextIntoPartsCommand.php b/src/Translate/SplitTextIntoPartsCommand.php index ce985c4..4a58b7b 100644 --- a/src/Translate/SplitTextIntoPartsCommand.php +++ b/src/Translate/SplitTextIntoPartsCommand.php @@ -2,114 +2,56 @@ namespace KorElf\TranslateLaravel\Translate; +use KorElf\TranslateLaravel\DTO\PartText; use KorElf\TranslateLaravel\Enums\TextType; use KorElf\TranslateLaravel\Facades\Translate; -use DOMDocument; +use KorElf\TranslateLaravel\Translate\SplitTextIntoParts\SplitHTMLCommand; +use KorElf\TranslateLaravel\Translate\SplitTextIntoParts\SplitTextCommand; final readonly class SplitTextIntoPartsCommand { + /** + * @param string $text + * @param TextType $textType + * @param string|null $driver + * @return array + */ public function execute(string $text, TextType $textType, ?string $driver): array { $maxLength = Translate::getLimit($driver); $maxLength = $maxLength['max_symbols'] ?? null; if ($maxLength === null || $maxLength > mb_strlen($text)) { - return [$text]; + $part = new PartText(); + $part->add($text); + return [$part]; } if ($textType === TextType::Html) { - return $this->splitHtmlText($text, $maxLength); + return $this->splitHtml($text, $maxLength); } - return $this->splitTextBySentences($text, $maxLength); + return $this->splitText($text, $maxLength); } - private function splitTextBySentences(string $text, int $maxLength): array + /** + * @param string $text + * @param int $maxLength + * @return array + */ + private function splitText(string $text, int $maxLength): array { - // Common expression for searching for sentences. - $sentenceEndings = '/(?<=[.?!])\s+(?=[A-ZА-Я])/u'; - - //Dividing the text into sentences - $sentences = preg_split($sentenceEndings, $text, -1, PREG_SPLIT_NO_EMPTY); - - $parts = []; - $currentPart = ''; - - foreach ($sentences as $sentence) { - // If adding a sentence does not exceed the limit, add it to the current part. - if (mb_strlen($currentPart . ' ' . $sentence) <= $maxLength) { - $currentPart .= (empty($currentPart) ? '' : ' ') . $sentence; - continue; - } - - // Otherwise, save the current part and start a new one. - if (!empty($currentPart)) { - $parts[] = $currentPart; - $currentPart = ''; - } - $currentPart = $sentence; - } - - if (!empty($currentPart)) { - $parts[] = $currentPart; - } - - return $parts; + $splitTextCommand = new SplitTextCommand(); + return $splitTextCommand->execute($text, $maxLength); } - private function splitHtmlText(string $html, int $maxLength): array + /** + * @param string $html + * @param int $maxLength + * @return array + */ + private function splitHtml(string $html, int $maxLength): array { - libxml_use_internal_errors(true); - - $dom = new DOMDocument(); - $dom->loadHTML(mb_convert_encoding('' . $html . '', 'HTML-ENTITIES', 'UTF-8'), LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD); - - $parts = []; - $currentPart = new DOMDocument(); - $currentLength = 0; - - // Iterate through all child nodes of the root element - foreach ($dom->documentElement->childNodes as $node) { - if ($node->nodeType === XML_TEXT_NODE || $node->nodeType === XML_CDATA_SECTION_NODE) { - // Dividing text nodes into sentences - $sentences = preg_split('/(?<=[.?!])\s+(?=[A-ZА-Я])/', $node->textContent, -1, PREG_SPLIT_NO_EMPTY); - - foreach ($sentences as $sentence) { - $sentenceHtml = htmlspecialchars($sentence); - if ($currentLength + mb_strlen($sentenceHtml) > $maxLength) { - $parts[] = $currentPart->saveHTML(); - - // Start new part - $currentPart = new DOMDocument(); - $currentLength = 0; - } - - $textNode = $currentPart->createTextNode($sentence . ' '); - $currentPart->appendChild($textNode); - $currentLength += mb_strlen($sentenceHtml); - } - continue; - } - - $nodeHtml = $dom->saveHTML($node); - - if ($currentLength + mb_strlen($nodeHtml) > $maxLength) { - $parts[] = $currentPart->saveHTML(); - - // Start new part - $currentPart = new DOMDocument(); - $currentLength = mb_strlen($nodeHtml); - } - - $currentPart->appendChild($currentPart->importNode($node, true)); - $currentLength += mb_strlen($nodeHtml); - } - - if ($currentPart->hasChildNodes()) { - $parts[] = $currentPart->saveHTML(); - } - - libxml_clear_errors(); - - return $parts; + $splitHTMLCommand = new SplitHTMLCommand($maxLength); + return $splitHTMLCommand->execute($html); } } \ No newline at end of file