Skip to content

Commit

Permalink
Extract images with VisitorImage #2
Browse files Browse the repository at this point in the history
- output to file based on format data.{format}
- more documentation
  • Loading branch information
vane committed Jul 23, 2019
1 parent 1b6bfbb commit cc20020
Show file tree
Hide file tree
Showing 15 changed files with 465 additions and 60 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ and see results in ``out`` directory
- specify output directory
- output to xml format
- ~~output to json format~~
- extract images to files
- ~~extract images to files~~
- extract font
- extract tables
- advanced font information
Expand Down
4 changes: 2 additions & 2 deletions demo.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
mkdir -p out
wget -O out/test.pdf -nc https://github.com/mozilla/pdf.js/raw/master/web/compressed.tracemonkey-pldi-09.pdf
node gd.js -i './out/test.pdf' -f json > out/test.json
node gd.js -i './out/test.pdf' -f text > out/test.txt
node gd.js -i './out/test.pdf' -f json
node gd.js -i './out/test.pdf' -f text
28 changes: 20 additions & 8 deletions gd.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
const GoldDigger = require('./src/GoldDigger');
const fs = require('fs');
const minimist = require('minimist');
const GoldDigger = require('./src/GoldDigger');


const supportedFormat = ['text', 'json'];
Expand All @@ -10,7 +11,8 @@ Please specify one of those values : "${supportedFormat}"

const helpText = `
--input or -i pdf file location (required)
--debug or -d show debug information (optional - default false)
--output or -o pdf file location (optional default "out")
--debug or -d show debug information (optional - default "false")
--format or -f format (optional - default "text") - ("${supportedFormat}"):
--help or -h display this help message
`
Expand All @@ -26,27 +28,37 @@ if(help) {
console.log(helpText);
return;
}
const fpath = argv['input'] || argv['i'];

const input = argv['input'] || argv['i'];
const output = argv['output'] || argv['o'] || 'out';
let debug = argv['debug'] || argv['d'];
let format = argv['format'] || argv['f'] || 'text';
debug = toBool(debug);
if(format && supportedFormat.indexOf(format) < 0) {
console.error(ERR_INVALID_FORMAT);
return;
}
if(!fpath) {
if(!input) {
console.log(helpText);
console.log(argv);
return;
}
if(debug) console.log(fpath);
if(debug) console.log(input);

// configuration
const config = {};
config.paintFormXObject = false;
config.paintImageMaskXObject = false;
config.paintJpegXObject = false;
config.format = format;
config.outputDir = output;
config.input = input;
config.debug = debug;

const gd = new GoldDigger(config);
gd.dig(fpath, debug)
gd.dig().then(() => {
console.log("-----------------------------------------------");
console.log("Results : ");
fs.readdirSync(output).forEach(file => console.log(file));
console.log("-----------------------------------------------");
});


43 changes: 16 additions & 27 deletions src/GoldDigger.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
const fs = require('fs');
const pdf = require('pdfjs-dist');
const util = require('pdfjs-dist/lib/shared/util');
const Extract = require('./pdf/Extract');
const Visitor = require('./pdf/Visitor');
const Formatter = require('./pdf/Formatter');
const FileManager = require('./pdf/FileManager');

/**
* Generic error
Expand All @@ -20,37 +22,33 @@ class GoldDigger {
constructor(config) {
this.config = config;
this.visitor = new Visitor(config);
this.formatter = new Formatter()
this.formatter = new Formatter(config);
}

/**
* Checks that the file exists, loads it into memory and returns the PDFDocument
* @param fpath - pdf file path
* @param debug - debug bool flag
* @returns {Promise<PDFDocument>}
*/
async getDocument(fpath, debug) {
if (!fs.existsSync(fpath)) {
throw new GoldDiggerError(`File not exists ${fpath}`);
async getDocument() {
if (!fs.existsSync(this.config.input)) {
throw new GoldDiggerError(`File not exists ${this.config.input}`);
}
if(debug) console.log('Reading pdf');
if(this.config.debug) console.log('Reading pdf');
// read file
const data = fs.readFileSync(fpath);
if(debug) console.log(data.length);
const data = fs.readFileSync(this.config.input);
if(this.config.debug) console.log(data.length);
const doc = await pdf.getDocument({
data:data,
data: data,
}).promise;
return doc
}

/**
* Main method for pdf-gold-digger
* @param fpath - pdf file path
* @param debug - debug bool flag
* @returns {Promise<void>}
*/
async dig(fpath, debug) {
const doc = await this.getDocument(fpath, debug);
async dig() {
const doc = await this.getDocument();
const debug = this.config.debug;
if(debug) console.log(`Pages : ${doc.numPages}`);
// prepare formatting
const format = this.config.format;
Expand All @@ -67,6 +65,9 @@ class GoldDigger {
if(debug) console.log(`--- END Page ${pageNum} objects : ${output.length}`)
}
this.formatter.end(format);
// save to file
const fpath = `${this.config.outputDir}/data.${format}`;
await FileManager.saveFileAsync(fpath, this.formatter.data);
}

/**
Expand Down Expand Up @@ -298,18 +299,6 @@ class GoldDigger {
if(debug) console.log('paintSolidColorImageMask');
//this.paintSolidColorImageMask();
break;
case pdf.OPS.paintJpegXObject:
if(debug) console.log('paintJpegXObject');
//this.paintJpegXObject(args[0], args[1], args[2]);
break;
case pdf.OPS.paintImageXObject:
if(debug) console.log('paintImageXObject');
//this.paintImageXObject(args[0]);
break;
case pdf.OPS.paintInlineImageXObject:
if(debug) console.log('paintInlineImageXObject');
//this.paintInlineImageXObject(args[0]);
break;
case pdf.OPS.paintImageMaskXObject:
if(debug) console.log('paintImageMaskXObject');
//this.paintImageMaskXObject(args[0]);
Expand Down
13 changes: 2 additions & 11 deletions src/pdf/Extract.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
const FontObject = require('./model/FontObject');
const Constraints = require('./Constraints');
const util = require('pdfjs-dist/lib/shared/util');

/**
* Extracts text information from glyphs
Expand All @@ -20,7 +21,7 @@ class ExtractText {
// Word break
x += font.direction * line.wordSpacing;
continue;
} else if (this.isNum(glyph)) {
} else if (util.isNum(glyph)) {
x += -glyph * font.size * 0.001;
if (glyph < 0) {
partial += " ";
Expand Down Expand Up @@ -79,16 +80,6 @@ class ExtractText {
}
return font;
}

/**
* Checks if v is number
* see pdf.js/shared/util.js isNum
* @param v number value
* @returns {boolean} true if it's number
*/
isNum(v) {
return typeof v === 'number';
}
}

module.exports = {
Expand Down
28 changes: 28 additions & 0 deletions src/pdf/FileManager.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
const fs = require('fs');

/**
 * Save file data to the given path.
 *
 * The returned promise settles only after the write stream reports the
 * outcome: it resolves on 'finish' (all data handed to the file) and rejects
 * on 'error' (e.g. the target directory does not exist). `write()` and
 * `end()` do not return promises, so awaiting them directly would not wait
 * for the data to be flushed.
 *
 * @param fpath - path to file with extension
 * @param data - file content
 * @returns {Promise<void>}
 */
const saveFileAsync = (fpath, data) => new Promise((resolve, reject) => {
  const stream = fs.createWriteStream(fpath);
  // Surface stream failures as a rejection instead of an unhandled 'error' event.
  stream.on('error', reject);
  stream.on('finish', () => resolve());
  // end(data) writes the final chunk and closes the stream in one step.
  stream.end(data);
});

/**
 * Make directory if not exists in given path.
 *
 * Missing parent directories are created as well, so nested output paths
 * such as "out/images" work. If something already exists at the path,
 * nothing is done.
 *
 * @param path - directory path
 */
const mkdirNotExists = (path) => {
  if (fs.existsSync(path)) {
    return;
  }
  // recursive: create intermediate directories instead of failing with ENOENT.
  fs.mkdirSync(path, { recursive: true });
}

module.exports = {
saveFileAsync,
mkdirNotExists,
}
13 changes: 9 additions & 4 deletions src/pdf/Formatter.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ const FormatterText = require('./formatter/FormatterText');
*/
class Formatter {

constructor() {
constructor(config) {
this.debug = config.debug;
this.formatters = {
json: new FormatterJSON(),
xml: new FormatterXML(),
text: new FormatterText(),
}
this.data = "";
}

/**
Expand All @@ -23,7 +25,8 @@ class Formatter {
*/
start(format, doc, metadata) {
const o = this.formatters[format].start(doc, metadata);
console.log(o);
this.data += o;
if (this.debug) console.log(o);
}

/**
Expand All @@ -35,7 +38,8 @@ class Formatter {
*/
format(format, page, data, last) {
const o = this.formatters[format].format(page, data, last);
console.log(o);
this.data += o;
if (this.debug) console.log(o);
}

/**
Expand All @@ -44,7 +48,8 @@ class Formatter {
*/
end(format) {
const o = this.formatters[format].end();
console.log(o);
this.data += o;
if (this.debug) console.log(o);
}
}

Expand Down
6 changes: 6 additions & 0 deletions src/pdf/Visitor.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
const VisitorText = require('./visitor/VisitorText');
const VisitorXObject = require('./visitor/VisitorXObject');
const VisitorImage = require('./visitor/VisitorImage');

const FN_TEXT = ['beginText', 'setFont', 'showText', 'showSpacedText', 'endText', 'moveText'];
const FN_XOBJECT = ['setTextMatrix', 'paintFormXObjectBegin', 'paintFormXObjectEnd'];
const FN_IMAGE = ['paintJpegXObject', 'paintImageXObject', 'paintInlineImageXObject', 'paintImageMaskXObject'];

/**
* Visits pdf.OPS.* methods using pdf page data
Expand All @@ -14,6 +17,7 @@ class Visitor {
this.config.skip = false;
this.txt = new VisitorText(config, debug, this.objectList);
this.xobject = new VisitorXObject(config, debug, this.objectList);
this.image = new VisitorImage(config, debug, this.objectList);
this.debug = config.debug;
}

Expand All @@ -29,6 +33,8 @@ class Visitor {
this.txt[fname](args, page, dependencies);
} else if(FN_XOBJECT.indexOf(fname) > -1) {
this.xobject[fname](args, page, dependencies);
} else if (FN_IMAGE.indexOf(fname) > -1) {
this.image[fname](args, page, dependencies);
} else {
console.warn(`Unimplemented operator ${fn}`);
}
Expand Down
6 changes: 3 additions & 3 deletions src/pdf/formatter/FormatterJSON.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class FormatterJSON {
/**
* Formats {TextObject} to JSON serializable object
* @param textObject {TextObject}
* @returns {{lines: Array, x: *, y: *}}
* @returns {object}
*/
formatTextObject(textObject) {
const txtObjOut = {lines: [], x: textObject.x, y: textObject.y};
Expand All @@ -28,7 +28,7 @@ class FormatterJSON {
/**
* Formats {TextLine} to JSON serializable object
* @param textLine {TextLine}
* @returns {{text: Array, x: *, y: *, w: *, h: *, textMatrix: *}}
* @returns {object}
*/
formatTextLine(textLine) {
const txtLineOut = {
Expand All @@ -49,7 +49,7 @@ class FormatterJSON {
/**
* Formats {TextFont} to JSON serializable object
* @param textFont {TextFont}
* @returns {{font: {size: (number|*), direction: (number|*), family: null, style: null, weight: null}, text: (string|TextFont|*), charSpacing: (number|*), wordSpacing: *}}
* @returns {object}
*/
formatTextFont(textFont) {
const font = textFont.getFont();
Expand Down
Loading

0 comments on commit cc20020

Please sign in to comment.