Skip to content

Commit

Permalink
Improve txt converter
Browse files Browse the repository at this point in the history
  • Loading branch information
mantas-done committed Apr 5, 2024
1 parent d05dfea commit 38403de
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 3 deletions.
8 changes: 8 additions & 0 deletions src/Code/Converters/CsvConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@ public function canParseFileContent($file_content)
}
$last_row = $csv[$count - 1];

// check if each row has the same column count
$last_row_count = count($last_row);
foreach ($csv as $row) {
if (count($row) !== $last_row_count) {
return false; // this is not a csv file
}
}

$has_timestamp = false;
$has_text = false;
foreach ($last_row as $cell) {
Expand Down
3 changes: 2 additions & 1 deletion src/Code/Converters/SrtConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace Done\Subtitles\Code\Converters;

use Done\Subtitles\Code\Helpers;
use Done\Subtitles\Code\UserException;

class SrtConverter implements ConverterContract
Expand Down Expand Up @@ -52,7 +53,7 @@ public function fileContentToInternalFormat($file_content, $original_file_conten
throw new UserException("Arrow should looks like this --> for srt format on line: " . $line . ' (SrtConverter)');
*/
} elseif ($parts['text'] !== null) {
$internal_format[$i]['lines'][] = strip_tags($line);
$internal_format[$i]['lines'][] = Helpers::removeOnlyHtmlTags($line);
}

if (!$saw_start) {
Expand Down
27 changes: 25 additions & 2 deletions src/Code/Converters/TxtConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace Done\Subtitles\Code\Converters;

use Done\Subtitles\Code\Helpers;
use Done\Subtitles\Code\UserException;

class TxtConverter implements ConverterContract
Expand Down Expand Up @@ -122,7 +123,10 @@ public function fileContentToInternalFormat($file_content, $original_file_conten

// strip html
foreach ($internal_format as &$row) {
$row['lines'] = array_map('strip_tags', $row['lines']);
foreach ($row['lines'] as &$line) {
$line = Helpers::removeOnlyHtmlTags($line);
}
unset($line);
}
unset($row);

Expand Down Expand Up @@ -369,7 +373,7 @@ public static function doesFileUseTimestamps(array $lines)
if (isset($timestamps[0][0])) {
$start = $timestamps[0][0];
$before = self::strBefore($line, $start);
if (self::hasText($before) || self::hasDigit($before)) {
if (self::hasText($before)) {
continue;
}
$lines_with_timestamp_count++;
Expand Down Expand Up @@ -422,6 +426,16 @@ public static function withoutTimestampsInternalFormat(array $lines)
$internal_format[] = ['lines' => [$line]];
}
$internal_format = self::fillStartAndEndTimes($internal_format);

// strip html
foreach ($internal_format as &$row) {
foreach ($row['lines'] as &$line) {
$line = Helpers::removeOnlyHtmlTags($line);
}
unset($line);
}
unset($row);

return $internal_format;
}

Expand Down Expand Up @@ -492,6 +506,15 @@ private static function twoLinesSeparatedByEmptyLine(string $file_content)
}
}

// strip html
foreach ($internal_format as &$row) {
foreach ($row['lines'] as &$line) {
$line = Helpers::removeOnlyHtmlTags($line);
}
unset($line);
}
unset($row);

return self::fillStartAndEndTimes($internal_format);
}

Expand Down
79 changes: 79 additions & 0 deletions src/Code/Helpers.php
Original file line number Diff line number Diff line change
Expand Up @@ -253,4 +253,83 @@ public static function strAfterLast($subject, $search)

return substr($subject, $position + strlen($search));
}

/**
 * Return the portion of $subject that precedes the first occurrence of $search.
 *
 * The needle is normalised to a string BEFORE the empty check. Previously the
 * guard compared the raw value with '', so a null/false needle slipped past it
 * and reached strstr() as ''. Since PHP 8.0 strstr() accepts an empty needle
 * and, with $before_needle = true, returns '' rather than false, which made
 * the result depend on the PHP version.
 *
 * @param string $subject text to search in
 * @param mixed  $search  needle; cast to string before use
 * @return string text before the first match, or $subject unchanged when the
 *                needle is empty or does not occur
 */
public static function strBefore($subject, $search)
{
    $needle = (string) $search;
    if ($needle === '') {
        return $subject;
    }

    $before = strstr($subject, $needle, true);

    // strstr() yields false when the needle is absent: keep the whole subject.
    return $before === false ? $subject : $before;
}

/**
 * Remove recognised HTML tags from a string while leaving other
 * angle-bracketed text (e.g. "<a sentence is here>") untouched.
 *
 * The input is cut into segments: every '<' starts a new segment and every
 * '>' closes the current one. Each segment is then kept or dropped depending
 * on isRealHtmlTag(). Finally every run of whitespace is collapsed into a
 * single space.
 *
 * The scan is done byte by byte instead of via preg_split('//u', ...):
 *  - '<' and '>' are single-byte ASCII and can never appear inside a
 *    multi-byte UTF-8 sequence, so a byte scan yields the same segments;
 *  - preg_split() with the /u modifier returns false on malformed UTF-8,
 *    which used to make this method return an empty string for such lines;
 *  - the former call passed null as the integer $limit argument, which is
 *    deprecated since PHP 8.1.
 *
 * @param string $string raw subtitle line
 * @return string the line without recognised HTML tags, whitespace collapsed
 */
public static function removeOnlyHtmlTags($string)
{
    $string = (string) $string;

    $parts = [];
    $current = '';
    $length = strlen($string);
    for ($i = 0; $i < $length; $i++) {
        $char = $string[$i];

        if ($char === '<') {
            // A new candidate tag begins: flush whatever was collected so far.
            if ($current !== '') {
                $parts[] = $current;
            }
            $current = '<';
        } elseif ($char === '>') {
            // The closing bracket belongs to the segment it terminates.
            $parts[] = $current . '>';
            $current = '';
        } else {
            $current .= $char;
        }
    }
    if ($current !== '') {
        $parts[] = $current;
    }

    $text = '';
    foreach ($parts as $part) {
        if (!self::isRealHtmlTag($part)) {
            $text .= $part;
        }
    }

    // preg_replace() returns null on a PCRE error; fall back to the
    // un-collapsed text rather than returning null to the caller.
    $collapsed = preg_replace('/\s+/', ' ', $text);

    return $collapsed === null ? $text : $collapsed;
}

/**
 * Decide whether a single "<...>" segment is an actual HTML tag.
 *
 * A segment counts as a tag when either
 *  - it is a bare opening/closing/self-closing tag of a known element
 *    (optional whitespace and slashes allowed, e.g. "</ I>", "<br />"), or
 *  - it starts with "<name " for a known element, contains a '>' and carries
 *    at least one known attribute followed by '='.
 *
 * Element names are compared case-insensitively in BOTH branches. Previously
 * only the bare-tag regex ignored case, so "<FONT>" was stripped while
 * "<FONT color="red">" was left in the text.
 *
 * @param string $tag one segment produced by removeOnlyHtmlTags()
 * @return bool true when the segment should be treated as HTML and removed
 */
private static function isRealHtmlTag($tag)
{
    $starts = ['div', 'p', 'a', 'b', 'i', 'u', 'strong', 'img', 'ul', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'input', 'br', 'font'];
    $attributes = ['id', 'class', 'href', 'src', 'alt', 'title', 'style', 'target', 'rel', 'type', 'color', 'size'];

    // Everything up to the first space, e.g. "<font" for '<font color="red">'.
    // It does not depend on the loop variable, so compute it once.
    $tag_start = Helpers::strBefore($tag, ' ');

    $found_start = false;
    foreach ($starts as $start) {
        // Bare tag without attributes: <b>, </b>, <br/>, </ font> ...
        if (preg_match("/^<\s*\/?\s*$start\s*\/?\s*>/i", $tag)) {
            return true;
        }

        // Tag name followed by a space: candidate for a tag with attributes.
        if (strcasecmp($tag_start, "<$start") === 0) {
            $found_start = true;
            break;
        }
    }
    if (!$found_start) {
        return false;
    }

    if (strpos($tag, '>') === false) {
        return false; // no closing tag
    }

    // Require at least one well-known attribute assignment so that free text
    // such as "<a sentence is here>" is not mistaken for markup.
    foreach ($attributes as $attribute) {
        if (preg_match("/ $attribute\s*=/i", $tag)) {
            return true;
        }
    }

    return false;
}
}
27 changes: 27 additions & 0 deletions tests/Helpers/RemoveHtmlTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?php

namespace Tests\Helpers;

use Done\Subtitles\Code\Helpers;
use PHPUnit\Framework\TestCase;

/**
 * Covers Helpers::removeOnlyHtmlTags(): known HTML tags are removed, text that
 * merely looks like a tag is preserved, and whitespace runs are collapsed.
 */
class RemoveHtmlTest extends TestCase
{
    /**
     * Runs every input through removeOnlyHtmlTags() and compares the result
     * strictly (assertSame) with the expected string. The failure message
     * names the offending input, replacing the former integer "messages" that
     * were only present on some assertions.
     */
    public function testRemoveHtml()
    {
        // input => expected output
        $cases = [
            '<b>a</b>' => 'a',
            '<b style="something">a</b>' => 'a',
            'a<br>' => 'a',
            'a<br >' => 'a',
            'a<br />' => 'a',
            'a<br/>' => 'a',
            'a <a href="something">b</a>' => 'a b',
            // not a real tag: no known attribute, so it must survive
            'a <a sentence is here>' => 'a <a sentence is here>',
            'a <i> <i> b' => 'a b',
            'a </ I>' => 'a ',
            '<font color = "# 00ffff"> www.url.net </ font> </ font>' => ' www.url.net ',
            '<font COLOR="WHITE">word' => 'word',
            // unterminated tags are kept verbatim
            '<font' => '<font',
            '<font color = ' => '<font color = ',
        ];

        foreach ($cases as $input => $expected) {
            $this->assertSame(
                $expected,
                Helpers::removeOnlyHtmlTags($input),
                'Unexpected result for input: ' . $input
            );
        }
    }
}
14 changes: 14 additions & 0 deletions tests/formats/TxtTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,20 @@ public function testWhenCantGetLinesReturnsUserException()
')->getInternalFormat();
}

/**
 * A line that starts with a number followed by two timestamps and text must
 * load into the same internal format as a subtitle built with add(1, 2, 'a').
 */
public function testNumberBeforeTimestamp()
{
    $parsed = Subtitles::loadFromString('1 00:00:01:00 00:00:02:00 a')->getInternalFormat();
    $wanted = (new Subtitles())->add(1, 2, 'a')->getInternalFormat();

    $this->assertInternalFormatsEqual($wanted, $parsed);
}

/**
 * Angle-bracketed text that is not a known HTML tag ("<a sentence>") has to
 * stay in the loaded subtitle, while a real tag ("<div>") is dropped.
 */
public function testDoesNotRemoveNotHtmlTag()
{
    $parsed = Subtitles::loadFromString('text <a sentence> <div>')->getInternalFormat();
    $wanted = (new Subtitles())->add(0, 1, 'text <a sentence>')->getInternalFormat();

    $this->assertInternalFormatsEqual($wanted, $parsed);
}

// ---------------------------------- private ----------------------------------------------------------------------

private static function generatedSubtitles()
Expand Down

0 comments on commit 38403de

Please sign in to comment.