Skip to content

Commit

Permalink
Improve srt and docx file recognition
Browse files Browse the repository at this point in the history
  • Loading branch information
mantas-done committed Apr 19, 2024
1 parent d4062b9 commit dac4845
Show file tree
Hide file tree
Showing 38 changed files with 86 additions and 48 deletions.
2 changes: 1 addition & 1 deletion src/Code/Converters/AssConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

class AssConverter implements ConverterContract
{
public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
return preg_match('/\[Script Info\]\R/m', $file_content) === 1;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/ConverterContract.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ interface ConverterContract
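* @param string $file_content file content after the library's pre-processing (e.g. normalized new lines)
* @param string $original_file_content file content exactly as it was loaded, for formats that must inspect the raw bytes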
* @return bool
*/
public function canParseFileContent($file_content);
public function canParseFileContent($file_content, $original_file_content);

/**
* Converts file content (.srt, .stl... file content) to library's "internal format"
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/CsvConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ private static function timeRegex()
return rtrim(TxtConverter::$time_regexp, '/') . '|(\d+)/';
}

public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
$csv = self::csvToArray(trim($file_content));

Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/DfxpConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

class DfxpConverter implements ConverterContract
{
public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
return
(preg_match('/xmlns="http:\/\/www\.w3\.org\/ns\/ttml"/m', $file_content) === 1 && preg_match('/xml:id="d1"/m', $file_content) === 1) // old netflix format;
Expand Down
20 changes: 18 additions & 2 deletions src/Code/Converters/DocxReader.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,25 @@

class DocxReader implements ConverterContract
{
public function canParseFileContent($file_content)
/**
 * Checks whether the raw input looks like a zip container with a "[Content_Types].xml" marker
 * and can really be opened by ZipArchive.
 *
 * Only $original_file_content is inspected; the pre-processed $file_content is ignored here.
 *
 * @param string $file_content pre-processed content (unused by this reader)
 * @param string $original_file_content content exactly as it was loaded
 * @return bool true only when the signature checks pass and ZipArchive opens the data
 * @throws \Exception when the temporary file needed for the check can't be created or written
 */
public function canParseFileContent($file_content, $original_file_content)
{
    // cheap checks first, so inputs without the zip signature never touch the disk
    if (strpos($original_file_content, 'PK') !== 0 || strpos($original_file_content, '[Content_Types].xml') === false) {
        return false;
    }

    $tmp_file = tempnam(sys_get_temp_dir(), 'prefix_');
    if ($tmp_file === false) {
        throw new \Exception("Can't create temporary file");
    }

    try {
        if (file_put_contents($tmp_file, $original_file_content) === false) {
            throw new \Exception("Can't write temporary file");
        }

        $zip = new \ZipArchive();
        $opened = $zip->open($tmp_file, \ZipArchive::RDONLY); // zip archive can only open real file
        if ($opened !== true) {
            return false; // signature matched, but the archive is corrupted or not a zip at all
        }
        $zip->close();

        return true;
    } finally {
        // runs on every exit path (return or exception), so the temporary file never leaks
        unlink($tmp_file);
    }
}

public function fileContentToInternalFormat($file_content, $original_file_content)
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/EbuStlReader.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

class EbuStlReader implements ConverterContract
{
public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
return substr($file_content, 3, 3) === 'STL' && is_numeric(substr($file_content, 6, 2));
}
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/LrcConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class LrcConverter implements ConverterContract
protected static $regexp = '/\[\s*(\d{2}:\d{2}(?:[:.]\d{1,3})?)\s*]/';
protected static $time_offset_regexp = '/\[offset:\s*\+?(-?\d+)\s*]/s';

public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
// only select when there is text after the timestamp
// do not select files that have timestamp and text somewhere on the other line
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/RtfReader.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class RtfReader implements ConverterContract
{
public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
return strpos($file_content, '{\rtf1') === 0;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/SbvConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

class SbvConverter implements ConverterContract
{
public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
return preg_match('/^\d{1,2}:\d{2}:\d{2}\.\d{3},\d{1,2}:\d{2}:\d{2}\.\d{3}\R(.*)/m', $file_content) === 1;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/SccConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class SccConverter implements ConverterContract
23.976,
];

public function canParseFileContent($file_content)
/**
 * Recognizes the format by the "Scenarist_SCC V1.0" text appearing anywhere in the content.
 *
 * @param string $file_content pre-processed content that is searched for the header text
 * @param string $original_file_content content exactly as it was loaded (unused here)
 * @return bool
 */
public function canParseFileContent($file_content, $original_file_content)
{
    // literal search: as a regex, the unescaped "." in "V1.0" matched any character (e.g. "V1x0")
    return strpos($file_content, 'Scenarist_SCC V1.0') !== false;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/SmiConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

class SmiConverter implements ConverterContract
{
public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
return preg_match('/<SAMI>/m', $file_content) === 1;
}
Expand Down
4 changes: 2 additions & 2 deletions src/Code/Converters/SrtConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@

class SrtConverter implements ConverterContract
{
public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
return preg_match('/^0*\d?\R(\d{1,2}:\d{2}:\d{2},\d{1,3}\s*-->\s*\d{1,2}:\d{2}:\d{2},\d{1,3})\R(.+)$/m', $file_content) === 1;
return preg_match('/^0*\d?\R(\d{1,2}:\d{2}:\d{2}[,\.]\d{1,3}\s*-->\s*\d{1,2}:\d{2}:\d{2}[,\.]\d{1,3})\R(.+)$/m', $file_content) === 1;
}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/StlConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

class StlConverter implements ConverterContract
{
public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
return preg_match('/^\d{2}:\d{2}:\d{2}:\d{2}\s,\s\d{2}:\d{2}:\d{2}:\d{2}\s,.+/m', $file_content) === 1;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/SubMicroDvdConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class SubMicroDvdConverter implements ConverterContract


static $pattern = '/(?:\{|\[)(?<start>\d+)(?:\}|\])(?:\{|\[)(?<end>\d+)(?:\}|\])(?<text>.+)/';
public function canParseFileContent($file_content)
/**
 * Tells whether the content contains at least one cue matching self::$pattern:
 * two numbers, each wrapped in {} or [], followed by text.
 *
 * @param string $file_content pre-processed content that is matched against the pattern
 * @param string $original_file_content content exactly as it was loaded (unused here)
 * @return bool
 */
public function canParseFileContent($file_content, $original_file_content)
{
    // compare with 1 so callers get a real bool, as the contract documents,
    // instead of preg_match()'s raw int|false result
    return preg_match(self::$pattern, $file_content) === 1;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/SubViewerConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

class SubViewerConverter implements ConverterContract
{
public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
return preg_match('/^(\d{2}:\d{2}:\d{2}\.\d{2}),(\d{2}:\d{2}:\d{2}\.\d{2})\R(.*)$/m', $file_content) === 1;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/TtmlConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

class TtmlConverter implements ConverterContract
{
public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
$first_line = explode("\n", $file_content)[0];

Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/TxtConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class TxtConverter implements ConverterContract
public static $time_regexp = '/(?:\d{2}[:;])(?:\d{1,2}[:;])(?:\d{1,2}[:;])\d{1,3}|(?:\d{1,2}[:;])?(?:\d{1,2}[:;])\d{1,3}(?:[.,]\d+)?(?!\d)|\d{1,5}[.,]\d{1,3}/';
private static $any_letter_regex = '/\p{L}/u';

public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
return self::hasText($file_content) && !Helpers::strContains($file_content, "\x00"); // not a binary file
}
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/TxtQuickTimeConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
// qt.txt
class TxtQuickTimeConverter implements ConverterContract
{
public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
return preg_match('/{QTtext}/m', $file_content) === 1;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Code/Converters/VttConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class VttConverter implements ConverterContract
{
protected static $time_regexp = '((?:\d{2}:){1,2}\d{2}\.\d{3})\s+-->\s+((?:\d{2}:){1,2}\d{2}\.\d{3})';

public function canParseFileContent($file_content)
public function canParseFileContent($file_content, $original_file_content)
{
$lines = explode("\n", $file_content);

Expand Down
4 changes: 2 additions & 2 deletions src/Code/Helpers.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,14 @@ public static function getConverterByFormat($format)
throw new \Exception("Can't find suitable converter, for format: $format");
}

public static function getConverterByFileContent($file_content)
public static function getConverterByFileContent($file_content, $original_file_content)
{
foreach (Subtitles::$formats as $row) {
$class_name = $row['class'];
$full_class_name = $class_name;
/** @var ConverterContract $converter */
$converter = new $full_class_name();
if ($converter->canParseFileContent($file_content)) {
if ($converter->canParseFileContent($file_content, $original_file_content)) {
return $converter;
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/Code/Other/DocxToText.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ private function __construct($file_content)
$zip = new \ZipArchive();
$opened = $zip->open($tmp_file, \ZipArchive::RDONLY); // zip archive can only open real file
if ($opened !== true) {
throw new \Exception();
unlink($tmp_file);
throw new \Exception("Can't open zip");
}
$this->zip = $zip;
$this->tmp_path = $tmp_file;
Expand Down
2 changes: 1 addition & 1 deletion src/Subtitles.php
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ public static function loadFromString($string, $strict = true)
$modified_string = Helpers::normalizeNewLines($modified_string);
$converter->input = $modified_string;

$input_converter = Helpers::getConverterByFileContent($converter->input);
$input_converter = Helpers::getConverterByFileContent($converter->input, $string);
$internal_format = $input_converter->fileContentToInternalFormat($converter->input, $string);

// remove empty lines
Expand Down
2 changes: 1 addition & 1 deletion tests/PublicInterfaceTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public function testConvertUsingThirdParameter()
@unlink($temporary_srt_path);

Subtitles::convert($srt_path, $temporary_srt_path, ['output_format' => 'vtt']);
$converter = Helpers::getConverterByFileContent(file_get_contents($temporary_srt_path));
$converter = Helpers::getConverterByFileContent(file_get_contents($temporary_srt_path), file_get_contents($temporary_srt_path));
unlink($temporary_srt_path);

$this->assertEquals(VttConverter::class, get_class($converter));
Expand Down
4 changes: 2 additions & 2 deletions tests/formats/AssTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ class AssTest extends TestCase {
public function testAss()
{
$content = file_get_contents('./tests/files/ass.ass');
$converter = Helpers::getConverterByFileContent($content);
$converter = Helpers::getConverterByFileContent($content, $content);
$this->assertTrue(get_class($converter) === AssConverter::class);
}

public function testThisIsNotAssFormat()
{
$content = '[Script Info]';
$converter = Helpers::getConverterByFileContent($content);
$converter = Helpers::getConverterByFileContent($content, $content);
$this->assertTrue(get_class($converter) !== AssConverter::class);
}

Expand Down
8 changes: 4 additions & 4 deletions tests/formats/CsvTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ public function testRecognizesCsvFormat()
$csv = 'Start,End,Text
137.44,140.375,"Senator, we\'re making our final approach into Coruscant."
3740.476,3742.501,"Very good, Lieutenant."';
$converter = Helpers::getConverterByFileContent($csv);
$converter = Helpers::getConverterByFileContent($csv, $csv);
$this->assertTrue(get_class($converter) === CsvConverter::class, get_class($converter));
}

public function testDoesntSelectNonCsvFormat()
{
$csv = '0:00:15.1,0:00:17.4 Herkese merhaba.
0:00:17.4,0:00:20.7 Bu videoda Microsoft office ürünlerinin';
$converter = Helpers::getConverterByFileContent($csv);
$converter = Helpers::getConverterByFileContent($csv, $csv);
$this->assertTrue(get_class($converter) !== CsvConverter::class, get_class($converter));
}

Expand Down Expand Up @@ -214,15 +214,15 @@ public function testDifferentElementCountShouldntBeInterpretedAsCsv()
$csv = 'Start,End,Text
hi
3740.476,3742.501,"Very good, Lieutenant."';
$converter = Helpers::getConverterByFileContent($csv);
$converter = Helpers::getConverterByFileContent($csv, $csv);
$this->assertTrue(get_class($converter) !== CsvConverter::class, get_class($converter));
}
public function testShouldntThrowException()
{
$csv = '1,a
' . ' ' . '
2,b';
$converter = Helpers::getConverterByFileContent($csv);
$converter = Helpers::getConverterByFileContent($csv, $csv);
$this->assertTrue(get_class($converter) !== CsvConverter::class, get_class($converter));
}
}
2 changes: 1 addition & 1 deletion tests/formats/DfxpTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class DfxpTest extends TestCase {
public function testRecognizesDfxp()
{
$content = file_get_contents('./tests/files/dfxp.dfxp');
$converter = Helpers::getConverterByFileContent($content);
$converter = Helpers::getConverterByFileContent($content, $content);
$this->assertTrue(get_class($converter) === DfxpConverter::class);
}

Expand Down
13 changes: 11 additions & 2 deletions tests/formats/DocxTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@

namespace Formats;

use Done\Subtitles\Code\Converters\DocxReader;
use Done\Subtitles\Code\Helpers;
use Done\Subtitles\Code\UserException;
use Done\Subtitles\Subtitles;
use PHPUnit\Framework\TestCase;
use Helpers\AdditionalAssertionsTrait;

class DocxTest extends TestCase
{

use AdditionalAssertionsTrait;

public function testParsesDocxFile()
{
$content = file_get_contents('./tests/files/docx.docx');
Expand All @@ -21,4 +22,12 @@ public function testParsesDocxFile()
->getInternalFormat();
$this->assertInternalFormatsEqual($expected, $actual);
}

public function testCorruptedZip()
{
$this->expectExceptionMessage("Can't find suitable converter for the file");

$content = file_get_contents('./tests/files/corrupted.zip');
Helpers::getConverterByFileContent($content, $content);
}
}
4 changes: 2 additions & 2 deletions tests/formats/LrcTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class LrcTest extends TestCase
public function testRecognizeLrc()
{
$content = file_get_contents('./tests/files/lrc.lrc');
$converter = Helpers::getConverterByFileContent($content);
$converter = Helpers::getConverterByFileContent($content, $content);
$this->assertTrue(get_class($converter) === LrcConverter::class);
}

Expand All @@ -33,7 +33,7 @@ public function testNotLrc() // let other converter handle invalid lrc
Main Chaand Launga
Solah Satrah Sitaare
Sang Baandh Launga';
$converter = Helpers::getConverterByFileContent($content);
$converter = Helpers::getConverterByFileContent($content, $content);
$this->assertTrue(get_class($converter) !== LrcConverter::class);
}

Expand Down
2 changes: 1 addition & 1 deletion tests/formats/SbvTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public function testRecognizesSbv()
Don’t think that you can just ignore them
because they’re not your children or relatives.
TEXT;
$converter = Helpers::getConverterByFileContent($content);
$converter = Helpers::getConverterByFileContent($content, $content);
$this->assertTrue(get_class($converter) === SbvConverter::class);
$this->assertTrue(true);
}
Expand Down
2 changes: 1 addition & 1 deletion tests/formats/SccTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class SccTest extends TestCase {
public function testRecognizesScc()
{
$content = file_get_contents('./tests/files/scc.scc');
$converter = Helpers::getConverterByFileContent($content);
$converter = Helpers::getConverterByFileContent($content, $content);
$this->assertTrue(get_class($converter) === SccConverter::class);
}

Expand Down
2 changes: 1 addition & 1 deletion tests/formats/SmiTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class SmiTest extends TestCase {
public function testRecognizesSmi()
{
$content = file_get_contents('./tests/files/smi.smi');
$converter = Helpers::getConverterByFileContent($content);
$converter = Helpers::getConverterByFileContent($content, $content);
$this->assertTrue(get_class($converter) === SmiConverter::class);
}

Expand Down
Loading

0 comments on commit dac4845

Please sign in to comment.