Files
translate-laravel/src/Translate/SplitTextIntoPartsCommand.php
Leonid Nikitin 55b04f0eab Add DTOs, exceptions, and jobs for translation service.
Introduced new Data Transfer Objects (DTOs), exceptions, and jobs to enhance the translation service functionality. Updated namespaces for consistency and added rate limiting to the translation provider. Expanded the README with detailed usage instructions.
2024-10-11 00:22:46 +05:00

115 lines
3.8 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php declare(strict_types=1);
namespace KorElf\TranslateLaravel\Translate;
use KorElf\TranslateLaravel\Enums\TextType;
use KorElf\TranslateLaravel\Facades\Translate;
use DOMDocument;
/**
 * Splits a text into parts that fit the symbol limit of a translation driver.
 *
 * Plain text is cut on sentence boundaries. HTML is cut between the top-level
 * nodes of the fragment (and between the sentences of top-level text nodes),
 * so an element is never torn apart. A single sentence or element that is
 * longer than the limit on its own is still emitted as one oversized part.
 */
final readonly class SplitTextIntoPartsCommand
{
    /**
     * Sentence boundary: whitespace preceded by ".", "?" or "!" and followed by
     * an uppercase Latin or Cyrillic letter ("Ё" is listed separately because it
     * lies outside the "А-Я" range). The "u" modifier is mandatory: without it
     * the Cyrillic range is interpreted as raw UTF-8 bytes.
     */
    private const SENTENCE_BOUNDARY = '/(?<=[.?!])\s+(?=[A-ZА-ЯЁ])/u';

    /**
     * Splits $text according to the "max_symbols" limit reported for $driver.
     *
     * @param string      $text     Text to split.
     * @param TextType    $textType TextType::Html selects node-aware splitting; anything else is treated as plain text.
     * @param string|null $driver   Driver name handed to Translate::getLimit().
     *
     * @return list<string> Parts in their original order. The untouched text is
     *                      returned as the only part when no limit is configured
     *                      or the text already fits.
     */
    public function execute(string $text, TextType $textType, ?string $driver): array
    {
        $maxLength = Translate::getLimit($driver)['max_symbols'] ?? null;

        if ($maxLength === null) {
            return [$text];
        }

        // Limits read from configuration may arrive as numeric strings.
        $maxLength = (int) $maxLength;

        // The limit is inclusive, exactly as in the packing loops below.
        if ($maxLength >= mb_strlen($text)) {
            return [$text];
        }

        return $textType === TextType::Html
            ? $this->splitHtmlText($text, $maxLength)
            : $this->splitTextBySentences($text, $maxLength);
    }

    /**
     * Greedily packs whole sentences, joined by a single space, into parts of
     * at most $maxLength characters.
     *
     * @return list<string>
     */
    private function splitTextBySentences(string $text, int $maxLength): array
    {
        $sentences = preg_split(self::SENTENCE_BOUNDARY, $text, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on a PCRE error (e.g. malformed UTF-8);
        // keep the text in one piece rather than losing it.
        if ($sentences === false) {
            return [$text];
        }

        $parts = [];
        $currentPart = '';

        foreach ($sentences as $sentence) {
            // Compare against '' explicitly: empty('0') is true and would drop a "0" part.
            if ($currentPart === '') {
                $currentPart = $sentence;
                continue;
            }

            $candidate = $currentPart . ' ' . $sentence;

            if (mb_strlen($candidate) <= $maxLength) {
                $currentPart = $candidate;
                continue;
            }

            // The sentence does not fit any more: close the part and start a new one.
            $parts[] = $currentPart;
            $currentPart = $sentence;
        }

        if ($currentPart !== '') {
            $parts[] = $currentPart;
        }

        return $parts;
    }

    /**
     * Greedily packs the top-level fragments of an HTML snippet into parts of
     * at most $maxLength characters.
     *
     * Parts are assembled as strings from the very fragments that are measured,
     * so the counted length always equals the length of the emitted part.
     *
     * @return list<string>
     */
    private function splitHtmlText(string $html, int $maxLength): array
    {
        // Remember the caller's libxml error mode so it can be restored afterwards.
        $previousErrorMode = libxml_use_internal_errors(true);

        try {
            $dom = new DOMDocument();

            // Non-ASCII characters are passed as numeric entities so libxml does not
            // misread the UTF-8 input; this replaces the 'HTML-ENTITIES' conversion
            // that is deprecated since PHP 8.2.
            $encodedHtml = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');

            $loaded = $dom->loadHTML(
                '<body>' . $encodedHtml . '</body>',
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD,
            );

            if ($loaded === false || $dom->documentElement === null) {
                return [$html];
            }

            $parts = [];
            $currentPart = '';
            $currentLength = 0;

            foreach ($this->htmlFragments($dom) as [$glue, $fragment]) {
                // The glue is only needed (and counted) when something precedes the fragment.
                $glueLength = $currentPart === '' ? 0 : mb_strlen($glue);
                $fragmentLength = mb_strlen($fragment);

                // Flush only a non-empty part, so an oversized first fragment
                // does not produce an empty leading part.
                if ($currentPart !== '' && $currentLength + $glueLength + $fragmentLength > $maxLength) {
                    $parts[] = $currentPart;
                    $currentPart = '';
                    $currentLength = 0;
                    $glueLength = 0;
                }

                if ($glueLength > 0) {
                    $currentPart .= $glue;
                }

                $currentPart .= $fragment;
                $currentLength += $glueLength + $fragmentLength;
            }

            if ($currentPart !== '') {
                $parts[] = $currentPart;
            }

            // Never return nothing for a non-empty input.
            return $parts === [] ? [$html] : $parts;
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }
    }

    /**
     * Yields the indivisible fragments of the parsed snippet in document order.
     *
     * Top-level elements (and any other non-text node) are serialized whole;
     * top-level text nodes are escaped and yielded sentence by sentence.
     *
     * @return iterable<int, array{0: string, 1: string}> Pairs of [glue, fragment];
     *         the glue is the separator to insert when the fragment directly
     *         follows the previous one inside the same part.
     */
    private function htmlFragments(DOMDocument $dom): iterable
    {
        foreach ($dom->documentElement->childNodes as $node) {
            if ($node->nodeType !== XML_TEXT_NODE && $node->nodeType !== XML_CDATA_SECTION_NODE) {
                $nodeHtml = $dom->saveHTML($node);

                if ($nodeHtml !== false && $nodeHtml !== '') {
                    yield ['', $nodeHtml];
                }

                continue;
            }

            $textContent = $node->textContent;
            $sentences = preg_split(self::SENTENCE_BOUNDARY, $textContent, -1, PREG_SPLIT_NO_EMPTY);

            if ($sentences === false) {
                $sentences = $textContent === '' ? [] : [$textContent];
            }

            foreach ($sentences as $index => $sentence) {
                // Escape the same characters DOM serialization escapes in text content,
                // and restore the space the split removed between two sentences.
                yield [
                    $index === 0 ? '' : ' ',
                    htmlspecialchars($sentence, ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8'),
                ];
            }
        }
    }
}