Skip to content

Commit

Permalink
Added font info to dataTm (#516)
Browse files Browse the repository at this point in the history
* Added font info to dataTm

* Usage.md: small text highlighting

* Added description of Tf command

Co-authored-by: Konrad Abicht <hi@inspirito.de>

Co-authored-by: Konrad Abicht <hi@inspirito.de>
  • Loading branch information
shtayerc and k00ni authored Mar 14, 2022
1 parent 9e5bc21 commit a1c1652
Show file tree
Hide file tree
Showing 4 changed files with 206 additions and 2 deletions.
103 changes: 103 additions & 0 deletions doc/Usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,109 @@ $text = $pdf->getText();
$text = $pdf->getPages()[0]->getText();
```

## Extract text positions

You can extract the transformation matrix (indexes 0-3) and the x,y position (indexes 4,5) of each text object.

```php
$data = $pdf->getPages()[0]->getDataTm();

Array
(
[0] => Array
(
[0] => Array
(
[0] => 0.999429
[1] => 0
[2] => 0
[3] => 1
[4] => 201.96
[5] => 720.68
)

[1] => Document title
)

[1] => Array
(
[0] => Array
(
[0] => 0.999402
[1] => 0
[2] => 0
[3] => 1
[4] => 70.8
[5] => 673.64
)

[1] => Calibri : Lorem ipsum dolor sit amet, consectetur a
)
)
```

When activated via the Config setting (`Config::setDataTmFontInfoHasToBeIncluded(true)`), the font identifier (index 2) and the font size (index 3) are added to each dataTm entry.

```php
// create config
$config = new Smalot\PdfParser\Config();
$config->setDataTmFontInfoHasToBeIncluded(true);

// use config and parse file
$parser = new Smalot\PdfParser\Parser([], $config);
$pdf = $parser->parseFile('document.pdf');

$data = $pdf->getPages()[0]->getDataTm();

Array
(
[0] => Array
(
[0] => Array
(
[0] => 0.999429
[1] => 0
[2] => 0
[3] => 1
[4] => 201.96
[5] => 720.68
)

[1] => Document title
[2] => R7
[3] => 27.96
)

[1] => Array
(
[0] => Array
(
[0] => 0.999402
[1] => 0
[2] => 0
[3] => 1
[4] => 70.8
[5] => 673.64
)

[1] => Calibri : Lorem ipsum dolor sit amet, consectetur a
[2] => R9
[3] => 11.04
)
)
```

Text width should be calculated on the text from dataTm to make sure all character widths are available.
The next example uses the data from above.

```php
$fonts = $pdf->getFonts();
$font_id = $data[0][2]; //R7
$font = $fonts[$font_id];
$text = $data[0][1];
$width = $font->calculateTextWidth($text, $missing);
```

## Extract metadata

You can also extract metadata. The available data varies from PDF to PDF.
Expand Down
17 changes: 17 additions & 0 deletions src/Smalot/PdfParser/Config.php
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@ class Config
*/
private $decodeMemoryLimit = 0;

/**
* Whether to include font id and size in dataTm array
*
* @var bool
*/
private $dataTmFontInfoHasToBeIncluded = false;

public function getFontSpaceLimit()
{
return $this->fontSpaceLimit;
Expand Down Expand Up @@ -132,4 +139,14 @@ public function setDecodeMemoryLimit(int $decodeMemoryLimit): void
{
$this->decodeMemoryLimit = $decodeMemoryLimit;
}

/**
 * Tells whether font id and font size are included in the dataTm array.
 *
 * @return bool current value of the flag; it is false unless changed through the setter
 */
public function getDataTmFontInfoHasToBeIncluded(): bool
{
return $this->dataTmFontInfoHasToBeIncluded;
}

/**
 * Sets whether font id and font size have to be included in the dataTm array.
 *
 * @param bool $dataTmFontInfoHasToBeIncluded true to include the font info, false to leave it out
 */
public function setDataTmFontInfoHasToBeIncluded(bool $dataTmFontInfoHasToBeIncluded): void
{
$this->dataTmFontInfoHasToBeIncluded = $dataTmFontInfoHasToBeIncluded;
}
}
47 changes: 45 additions & 2 deletions src/Smalot/PdfParser/Page.php
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,13 @@ public function getDataCommands(array $extractedDecodedRawData = null): array
$extractedData[] = $command;
break;

case 'Tf':
case 'TF':
if ($this->config->getDataTmFontInfoHasToBeIncluded()) {
$extractedData[] = $command;
}
break;

/*
* array TJ
* Show one or more text strings allow individual glyph positioning.
Expand Down Expand Up @@ -658,6 +665,12 @@ public function getDataTm(array $dataCommands = null): array
*/
$defaultTl = 0;

/*
* Set default values for font data
*/
$defaultFontId = -1;
$defaultFontSize = 0;

/*
* Setting where are the X and Y coordinates in the matrix (Tm)
*/
Expand All @@ -668,6 +681,8 @@ public function getDataTm(array $dataCommands = null): array

$Tm = $defaultTm;
$Tl = $defaultTl;
$fontId = $defaultFontId;
$fontSize = $defaultFontSize;

$extractedTexts = $this->getTextArray();
$extractedData = [];
Expand All @@ -683,6 +698,8 @@ public function getDataTm(array $dataCommands = null): array
$Tl = $defaultTl; //review this.
$Tx = 0;
$Ty = 0;
$fontId = $defaultFontId;
$fontSize = $defaultFontSize;
break;

/*
Expand All @@ -694,6 +711,8 @@ public function getDataTm(array $dataCommands = null): array
$Tl = $defaultTl; //review this
$Tx = 0;
$Ty = 0;
$fontId = $defaultFontId;
$fontSize = $defaultFontSize;
break;

/*
Expand Down Expand Up @@ -765,7 +784,12 @@ public function getDataTm(array $dataCommands = null): array
* Show a Text String
*/
case 'Tj':
$extractedData[] = [$Tm, $currentText];
$data = [$Tm, $currentText];
if ($this->config->getDataTmFontInfoHasToBeIncluded()) {
$data[] = $fontId;
$data[] = $fontSize;
}
$extractedData[] = $data;
break;

/*
Expand Down Expand Up @@ -799,6 +823,20 @@ public function getDataTm(array $dataCommands = null): array
$extractedData[] = [$Tm, $data[2]]; //Verify
break;

case 'Tf':
/*
* From PDF 1.0 specification, page 106:
* fontname size Tf Set font and size
* Sets the text font and text size in the graphics state. There is no default value for
* either fontname or size; they must be selected using Tf before drawing any text.
* fontname is a resource name. size is a number expressed in text space units.
*
* Source: https://ia902503.us.archive.org/10/items/pdfy-0vt8s-egqFwDl7L2/PDF%20Reference%201.0.pdf
* Introduced with https://github.com/smalot/pdfparser/pull/516
*/
list($fontId, $fontSize) = explode(' ', $command['c'], 2);
break;

/*
* array TJ
* Show one or more text strings allow individual glyph positioning.
Expand All @@ -812,7 +850,12 @@ public function getDataTm(array $dataCommands = null): array
* amount.
*/
case 'TJ':
$extractedData[] = [$Tm, $currentText];
$data = [$Tm, $currentText];
if ($this->config->getDataTmFontInfoHasToBeIncluded()) {
$data[] = $fontId;
$data[] = $fontSize;
}
$extractedData[] = $data;
break;
default:
}
Expand Down
41 changes: 41 additions & 0 deletions tests/Integration/PageTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

namespace Tests\Smalot\PdfParser\Integration;

use Smalot\PdfParser\Config;
use Smalot\PdfParser\Document;
use Smalot\PdfParser\Element\ElementMissing;
use Smalot\PdfParser\Font;
Expand Down Expand Up @@ -441,6 +442,46 @@ public function testGetDataTm(): void
$this->assertStringContainsString('Purchase 2', $item[1]);
}

/**
 * Checks that getDataTm() appends font id (index 2) and font size (index 3)
 * to each entry when Config::setDataTmFontInfoHasToBeIncluded(true) is used,
 * and that the reported font id is a key of the page's font list.
 */
public function testDataTmFontInfoHasToBeIncluded(): void
{
    $config = new Config();
    $config->setDataTmFontInfoHasToBeIncluded(true);

    $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
    $parser = $this->getParserInstance($config);
    $document = $parser->parseFile($filename);
    $pages = $document->getPages();
    $page = $pages[0];
    $dataTm = $page->getDataTm();
    $fonts = $page->getFonts();

    // assertEquals takes the expected value first, so failure messages
    // label "expected" and "actual" correctly.
    $item = $dataTm[0];
    $this->assertCount(4, $item);
    $this->assertEquals('R7', $item[2]);
    $this->assertEquals('27.96', $item[3]);
    $this->assertArrayHasKey('R7', $fonts);

    // This entry uses font R14, so that is the key which has to be resolvable.
    $item = $dataTm[80];
    $this->assertCount(4, $item);
    $this->assertEquals('R14', $item[2]);
    $this->assertEquals('11.04', $item[3]);
    $this->assertArrayHasKey('R14', $fonts);

    // The same parser (and therefore the same config) is reused for a second document.
    $filename = $this->rootDir.'/samples/InternationalChars.pdf';
    $document = $parser->parseFile($filename);
    $pages = $document->getPages();
    $page = $pages[0];
    $dataTm = $page->getDataTm();
    $fonts = $page->getFonts();

    $item = $dataTm[88];
    $this->assertEquals('C2_0', $item[2]);
    $this->assertEquals('1', $item[3]);
    $this->assertArrayHasKey('C2_0', $fonts);

    // Every entry must carry the two additional font fields.
    foreach ($dataTm as $item) {
        $this->assertCount(4, $item);
    }
}

/**
* Tests getDataTm with hexadecimal encoded document text.
*
Expand Down

0 comments on commit a1c1652

Please sign in to comment.