Introduced new Data Transfer Objects (DTOs), exceptions, and jobs to enhance the translation service functionality. Updated namespaces for consistency and added rate limiting to the translation provider. Expanded the README with detailed usage instructions.
115 lines
3.8 KiB
PHP
115 lines
3.8 KiB
PHP
<?php declare(strict_types=1);
|
||
|
||
namespace KorElf\TranslateLaravel\Translate;
|
||
|
||
use KorElf\TranslateLaravel\Enums\TextType;
|
||
use KorElf\TranslateLaravel\Facades\Translate;
|
||
use DOMDocument;
|
||
|
||
/**
 * Splits a text into parts that fit the per-request symbol limit of a translation driver.
 *
 * Plain text is split on sentence boundaries; HTML is split on the top-level nodes of the
 * fragment (top-level text nodes are additionally split into sentences).
 *
 * A single sentence or HTML node that is longer than the limit on its own is never cut:
 * it is emitted as its own, oversized, part.
 */
final readonly class SplitTextIntoPartsCommand
{
    /**
     * Sentence boundary: whitespace preceded by ".", "?" or "!" and followed by an uppercase
     * Latin (A-Z) or Cyrillic (А-Я) letter. The "u" modifier is required so the Cyrillic
     * range is compared as code points instead of raw bytes.
     */
    private const SENTENCE_BOUNDARY = '/(?<=[.?!])\s+(?=[A-ZА-Я])/u';

    /**
     * Splits $text according to the "max_symbols" limit reported for $driver.
     *
     * @param string      $text     Text to split.
     * @param TextType    $textType TextType::Html selects the HTML splitter; any other value
     *                              selects the sentence splitter.
     * @param string|null $driver   Driver name passed through to Translate::getLimit().
     *
     * @return array The parts in their original order; [$text] when no limit is configured
     *               or the text already fits into the limit.
     */
    public function execute(string $text, TextType $textType, ?string $driver): array
    {
        $limits = Translate::getLimit($driver);
        $maxLength = $limits['max_symbols'] ?? null;

        // The limit is inclusive: a text of exactly $maxLength symbols needs no splitting.
        if ($maxLength === null || $maxLength >= mb_strlen($text)) {
            return [$text];
        }

        if ($textType === TextType::Html) {
            return $this->splitHtmlText($text, $maxLength);
        }

        return $this->splitTextBySentences($text, $maxLength);
    }

    /**
     * Greedily packs sentences into parts of at most $maxLength symbols.
     *
     * Sentences inside one part are joined with a single space; the whitespace that
     * originally separated them is not preserved.
     *
     * @param string $text      Plain text to split.
     * @param int    $maxLength Maximum number of symbols per part (inclusive).
     *
     * @return array Non-empty string parts in their original order.
     */
    private function splitTextBySentences(string $text, int $maxLength): array
    {
        $sentences = preg_split(self::SENTENCE_BOUNDARY, $text, -1, PREG_SPLIT_NO_EMPTY);

        if ($sentences === false) {
            // preg_split() fails on e.g. malformed UTF-8; keep the text instead of dropping it.
            return [$text];
        }

        $parts = [];
        $currentPart = '';

        foreach ($sentences as $sentence) {
            $candidate = $currentPart === '' ? $sentence : $currentPart . ' ' . $sentence;

            // The sentence still fits: extend the current part.
            if (mb_strlen($candidate) <= $maxLength) {
                $currentPart = $candidate;

                continue;
            }

            // It does not fit: close the current part and start a new one with this sentence.
            // Strict comparison on purpose: empty() would treat the string "0" as empty.
            if ($currentPart !== '') {
                $parts[] = $currentPart;
            }

            $currentPart = $sentence;
        }

        if ($currentPart !== '') {
            $parts[] = $currentPart;
        }

        return $parts;
    }

    /**
     * Splits an HTML fragment into parts along its top-level nodes.
     *
     * The fragment is wrapped in <body> and parsed with DOMDocument. Element nodes are copied
     * whole into the current part; top-level text/CDATA nodes are split into sentences first.
     * Lengths are measured on the serialized node HTML (escaped text for text nodes).
     *
     * @param string $html      HTML fragment to split.
     * @param int    $maxLength Maximum measured length per part.
     *
     * @return array Parts as returned by DOMDocument::saveHTML(), in their original order.
     */
    private function splitHtmlText(string $html, int $maxLength): array
    {
        // libxml error handling is process-wide state; remember it so it can be restored.
        $previousErrorMode = libxml_use_internal_errors(true);

        try {
            $dom = new DOMDocument();
            // Non-ASCII characters are turned into numeric entities so that libxml does not
            // misread the UTF-8 input. This replaces the 'HTML-ENTITIES' target encoding of
            // mb_convert_encoding(), which is deprecated since PHP 8.2.
            $dom->loadHTML(
                mb_encode_numericentity('<body>' . $html . '</body>', [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'),
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD,
            );

            $parts = [];
            $currentPart = new DOMDocument();
            $currentLength = 0;

            // Iterate through all child nodes of the root element (nothing if parsing failed).
            foreach ($dom->documentElement?->childNodes ?? [] as $node) {
                if ($node->nodeType === XML_TEXT_NODE || $node->nodeType === XML_CDATA_SECTION_NODE) {
                    $sentences = preg_split(self::SENTENCE_BOUNDARY, $node->textContent, -1, PREG_SPLIT_NO_EMPTY);

                    if ($sentences === false) {
                        // Keep the text as one piece rather than losing it on a regex failure.
                        $sentences = [$node->textContent];
                    }

                    foreach ($sentences as $sentence) {
                        // The separator space is part of what gets appended, so it is counted too.
                        $chunk = $sentence . ' ';
                        $chunkLength = mb_strlen(htmlspecialchars($chunk));

                        // Never flush an empty part: an oversized first sentence simply starts one.
                        if ($currentLength + $chunkLength > $maxLength && $currentPart->hasChildNodes()) {
                            $parts[] = $currentPart->saveHTML();
                            $currentPart = new DOMDocument();
                            $currentLength = 0;
                        }

                        $currentPart->appendChild($currentPart->createTextNode($chunk));
                        $currentLength += $chunkLength;
                    }

                    continue;
                }

                $nodeLength = mb_strlen($dom->saveHTML($node));

                if ($currentLength + $nodeLength > $maxLength && $currentPart->hasChildNodes()) {
                    $parts[] = $currentPart->saveHTML();
                    $currentPart = new DOMDocument();
                    // Reset to zero: the node's own length is added exactly once below.
                    $currentLength = 0;
                }

                $currentPart->appendChild($currentPart->importNode($node, true));
                $currentLength += $nodeLength;
            }

            if ($currentPart->hasChildNodes()) {
                $parts[] = $currentPart->saveHTML();
            }

            return $parts;
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }
    }
}