From c08dac2343228c7173be9a3fb99d895fed21e925 Mon Sep 17 00:00:00 2001 From: Matthijs Wesseling Date: Mon, 30 Oct 2023 10:25:38 +0100 Subject: [PATCH 01/10] It no work yet --- src/xpydf/PdfLoader.cc | 286 ++++++++++++++++++++++++++++++++++ src/xpydf/PdfLoader.h | 6 + src/xpydf/PdfLoaderWrapper.cc | 17 ++ src/xpydf/cXpdfPython.pyi | 1 + src/xpydf/pdf_loader.py | 14 ++ src/xpydf/pdf_loader.pyi | 1 + 6 files changed, 325 insertions(+) diff --git a/src/xpydf/PdfLoader.cc b/src/xpydf/PdfLoader.cc index e3a248e..4066bfa 100644 --- a/src/xpydf/PdfLoader.cc +++ b/src/xpydf/PdfLoader.cc @@ -28,6 +28,8 @@ #include "config.h" #include "SplashOutputDev.h" #include "SplashBitmap.h" +#include "Annot.h" +#include "AcroForm.h" #include "PdfLoader.h" #include "ImageDataDev.h" @@ -178,6 +180,279 @@ std::vector PdfLoader::extractPageInfo() { return pagesInfo; } +static Ref *fonts; +static int fontsLen; +static int fontsSize; + +static char *seenObjs; +static int numObjects; + +void PdfLoader::scanFonts(Object *obj) { + Object obj2; + + if (checkFontObject(obj, &obj2) && obj2.isDict()) { + scanFonts(obj2.getDict()); + } + obj2.free(); +} + +void PdfLoader::scanFonts(Dict *resDict) { + Object fontDict1, fontDict2, xObjDict1, xObjDict2, xObj1, xObj2; + Object patternDict1, patternDict2, pattern1, pattern2; + Object gsDict1, gsDict2, gs1, gs2, smask1, smask2, smaskGroup1, smaskGroup2; + Object resObj; + Ref r; + GfxFontDict *gfxFontDict; + GfxFont *font; + int i; + + // scan the fonts in this resource dictionary + gfxFontDict = NULL; + resDict->lookupNF("Font", &fontDict1); + if (checkFontObject(&fontDict1, &fontDict2) && fontDict2.isDict()) { + fprintf(stderr, "checkFontObject success\n"); + if (fontDict1.isRef()) { + fprintf(stderr, "isRef\n"); + r = fontDict1.getRef(); + gfxFontDict = new GfxFontDict(doc->getXRef(), &r, fontDict2.getDict()); + } else { + fprintf(stderr, "noRef\n"); + gfxFontDict = new GfxFontDict(doc->getXRef(), NULL, fontDict2.getDict()); + } + if (gfxFontDict) { + 
fprintf(stderr, "gfxFontDict\n"); + for (i = 0; i < gfxFontDict->getNumFonts(); ++i) { + if ((font = gfxFontDict->getFont(i))) { + fprintf(stderr, "Scanning font:\n"); + scanFont(font); + } + } + delete gfxFontDict; + } + } else { + fprintf(stderr, "Sad :(\n"); + } + fontDict2.free(); + fontDict1.free(); + + // recursively scan any resource dictionaries in XObjects in this + // resource dictionary + resDict->lookupNF("XObject", &xObjDict1); + if (checkFontObject(&xObjDict1, &xObjDict2) && xObjDict2.isDict()) { + for (i = 0; i < xObjDict2.dictGetLength(); ++i) { + xObjDict2.dictGetValNF(i, &xObj1); + if (checkFontObject(&xObj1, &xObj2) && xObj2.isStream()) { + xObj2.streamGetDict()->lookupNF("Resources", &resObj); + scanFonts(&resObj); + resObj.free(); + } + xObj2.free(); + xObj1.free(); + } + } + xObjDict2.free(); + xObjDict1.free(); + + // recursively scan any resource dictionaries in Patterns in this + // resource dictionary + resDict->lookupNF("Pattern", &patternDict1); + if (checkFontObject(&patternDict1, &patternDict2) && patternDict2.isDict()) { + for (i = 0; i < patternDict2.dictGetLength(); ++i) { + patternDict2.dictGetValNF(i, &pattern1); + if (checkFontObject(&pattern1, &pattern2) && pattern2.isStream()) { + pattern2.streamGetDict()->lookupNF("Resources", &resObj); + scanFonts(&resObj); + resObj.free(); + } + pattern2.free(); + pattern1.free(); + } + } + patternDict2.free(); + patternDict1.free(); + + // recursively scan any resource dictionaries in ExtGStates in this + // resource dictionary + resDict->lookupNF("ExtGState", &gsDict1); + if (checkFontObject(&gsDict1, &gsDict2) && gsDict2.isDict()) { + for (i = 0; i < gsDict2.dictGetLength(); ++i) { + gsDict2.dictGetValNF(i, &gs1); + if (checkFontObject(&gs1, &gs2) && gs2.isDict()) { + gs2.dictLookupNF("SMask", &smask1); + if (checkFontObject(&smask1, &smask2) && smask2.isDict()) { + smask2.dictLookupNF("G", &smaskGroup1); + if (checkFontObject(&smaskGroup1, &smaskGroup2) && + smaskGroup2.isStream()) { + 
smaskGroup2.streamGetDict()->lookupNF("Resources", &resObj); + scanFonts(&resObj); + resObj.free(); + } + smaskGroup2.free(); + smaskGroup1.free(); + } + smask2.free(); + smask1.free(); + } + gs2.free(); + gs1.free(); + } + } + gsDict2.free(); + gsDict1.free(); +} + +void PdfLoader::scanFont(GfxFont *font) { + Ref fontRef, embRef; + Object fontObj, toUnicodeObj; + GString *name; + GBool emb, subset, hasToUnicode; + GfxFontLoc *loc; + int i; + + fontRef = *font->getID(); + + // check for an already-seen font + for (i = 0; i < fontsLen; ++i) { + if (fontRef.num == fonts[i].num && fontRef.gen == fonts[i].gen) { + return; + } + } + + // font name + name = font->getName(); + + // check for an embedded font + if (font->getType() == fontType3) { + emb = gTrue; + } else { + emb = font->getEmbeddedFontID(&embRef); + } + + // look for a ToUnicode map + hasToUnicode = gFalse; + if (doc->getXRef()->fetch(fontRef.num, fontRef.gen, &fontObj)->isDict()) { + hasToUnicode = fontObj.dictLookup("ToUnicode", &toUnicodeObj)->isStream(); + toUnicodeObj.free(); + } + fontObj.free(); + + // check for a font subset name: capital letters followed by a '+' + // sign + subset = gFalse; + if (name) { + for (i = 0; i < name->getLength(); ++i) { + if (name->getChar(i) < 'A' || name->getChar(i) > 'Z') { + break; + } + } + subset = i > 0 && i < name->getLength() && name->getChar(i) == '+'; + } + + // print the font info + printf("%-46s", + name ? 
name->getCString() : "[none]"); + if (fontRef.gen >= 100000) { + printf(" [none]"); + } else { + printf(" %6d %2d", fontRef.num, fontRef.gen); + } + printf("\n"); + + // add this font to the list + if (fontsLen == fontsSize) { + if (fontsSize <= INT_MAX - 32) { + fontsSize += 32; + } else { + // let greallocn throw an exception + fontsSize = -1; + } + fonts = (Ref *)greallocn(fonts, fontsSize, sizeof(Ref)); + } + fonts[fontsLen++] = *font->getID(); +} + +GBool PdfLoader::checkFontObject(Object *in, Object *out) { + int objNum; + + if (!in->isRef()) { + in->copy(out); + return gTrue; + } + objNum = in->getRefNum(); + if (objNum < 0 || objNum >= numObjects) { + out->initNull(); + return gTrue; + } + if (seenObjs[objNum]) { + out->initNull(); + return gFalse; + } + seenObjs[objNum] = (char)1; + in->fetch(doc->getXRef(), out); + return gTrue; +} + +std::vector PdfLoader::extractFonts() { + int firstPage, lastPage; + std::vector fontInfo; + char dummyFile[1] = ""; + + Dict *resDict; + Annots *annots; + AcroForm *form; + Object obj1, obj2; + + if (!doc->isOk()) { + goto err; + } + + firstPage = 1; + lastPage = doc->getNumPages(); + + for (int page = firstPage; page <= lastPage; page++) { + Page *pdfPage = doc->getCatalog()->getPage(page); + + if ((resDict = pdfPage->getResourceDict())) { + fprintf(stderr, "scanFonts\n"); + scanFonts(resDict); + } + + annots = new Annots(doc, pdfPage->getAnnots(&obj1)); + obj1.free(); + + for (int i = 0; i < annots->getNumAnnots(); i++) { + if (annots->getAnnot(i)->getAppearance(&obj1)->isStream()) { + obj1.streamGetDict()->lookupNF("Resources", &obj2); + scanFonts(&obj2); + obj2.free(); + } + obj1.free(); + } + delete annots; + } + if ((form = doc->getCatalog()->getForm())) { + for (int i = 0; i < form->getNumFields(); ++i) { + form->getField(i)->getResources(&obj1); + if (obj1.isArray()) { + for (int j = 0; j < obj1.arrayGetLength(); ++j) { + obj1.arrayGetNF(j, &obj2); + scanFonts(&obj2); + obj2.free(); + } + } else if (obj1.isDict()) { 
+ scanFonts(obj1.getDict()); + } + obj1.free(); + } + } + +err: + Object::memCheck(stderr); + gMemReport(stderr); + + return fontInfo; +} + std::vector PdfLoader::extractImages(int pageNum) { ImageDataDev *imageOut; std::vector images; @@ -255,3 +530,14 @@ bool PdfLoader::isOk() { int PdfLoader::getErrorCode() { return (int)doc->getErrorCode(); } + +#include + +using namespace std; + +int main() { + LoaderConfig config; + PdfLoader *l = new PdfLoader(config, "skf.pdf"); + cout << "Extracting fonts:" << endl; + l->extractFonts(); +} diff --git a/src/xpydf/PdfLoader.h b/src/xpydf/PdfLoader.h index ac583e6..1386b9e 100644 --- a/src/xpydf/PdfLoader.h +++ b/src/xpydf/PdfLoader.h @@ -37,6 +37,7 @@ class PdfLoader { ~PdfLoader(); std::vector extractText(); std::vector extractPageInfo(); + std::vector extractFonts(); std::vector extractImages(int pageNum); Image pageToImage(int pageNum, int dpi); bool isOk(); @@ -45,6 +46,11 @@ class PdfLoader { TextOutputControl textOutControl; PDFDoc *doc; GString *textFileName; + + GBool checkFontObject(Object *in, Object *out); + void scanFont(GfxFont *font); + void scanFonts(Object *obj); + void scanFonts(Dict *resDict); }; #endif diff --git a/src/xpydf/PdfLoaderWrapper.cc b/src/xpydf/PdfLoaderWrapper.cc index c1f3bf2..38d40cb 100644 --- a/src/xpydf/PdfLoaderWrapper.cc +++ b/src/xpydf/PdfLoaderWrapper.cc @@ -125,6 +125,19 @@ PyObject *extractPageInfo(PyObject *self, PyObject *args) { return Py_BuildValue("O", converted); } +PyObject *extractFonts(PyObject *self, PyObject *args) { + vector res; + + PyObject *loaderCapsule; + PyArg_ParseTuple(args, "O", &loaderCapsule); + + PdfLoader *loader = (PdfLoader *)PyCapsule_GetPointer(loaderCapsule, "loaderPtr"); + vector result = loader->extractFonts(); + + PyObject *converted = vectorStringToList(result); + return Py_BuildValue("O", converted); +} + PyObject *extractImages(PyObject *self, PyObject *args) { vector res; @@ -214,6 +227,10 @@ PyMethodDef cXpdfPythonFunctions[] = { 
extractPageInfo, METH_VARARGS, "Extract image metadata"}, + {"extractFonts", + extractFonts, METH_VARARGS, + "Extract font metadata"}, + {"extractImages", extractImages, METH_VARARGS, "Extract images"}, diff --git a/src/xpydf/cXpdfPython.pyi b/src/xpydf/cXpdfPython.pyi index a2766d7..5414d03 100644 --- a/src/xpydf/cXpdfPython.pyi +++ b/src/xpydf/cXpdfPython.pyi @@ -21,6 +21,7 @@ def construct( ) -> XpdfPythonCapsule: ... def extractText(capsule: XpdfPythonCapsule) -> List[bytes]: ... def extractPageInfo(capsule: XpdfPythonCapsule) -> List[PageInfo]: ... +def extractFonts(capsule: XpdfPythonCapsule) -> List[str]: ... def extractImages(capsule: XpdfPythonCapsule, page_number: int) -> List[npt.NDArray[Any]]: ... def pageToImage(capsule: XpdfPythonCapsule, page_number: int, dpi: int) -> npt.NDArray[Any]: ... def deleteObject(capsule: XpdfPythonCapsule) -> None: ... diff --git a/src/xpydf/pdf_loader.py b/src/xpydf/pdf_loader.py index 76c9748..c848976 100644 --- a/src/xpydf/pdf_loader.py +++ b/src/xpydf/pdf_loader.py @@ -147,6 +147,20 @@ def extract_page_info(self) -> List[PageInfo]: return images + def extract_fonts(self) -> List[str]: + """Return image related metadata from the pdf + + Returns + ------- + List[PageInfo] + A PageInfo object for each page + """ + images: List[str] = [] + if self.capsule is not None: + images = cXpdfPython.extractFonts(self.capsule) + + return images + def extract_images(self, page_number: int) -> List[npt.NDArray[Any]]: """Extract raw image data from a page, as a numpy array. diff --git a/src/xpydf/pdf_loader.pyi b/src/xpydf/pdf_loader.pyi index ec3c7c4..272eb2b 100644 --- a/src/xpydf/pdf_loader.pyi +++ b/src/xpydf/pdf_loader.pyi @@ -37,6 +37,7 @@ class PdfLoader: def extract_bytes(self) -> List[bytes]: ... def extract_strings(self) -> List[str]: ... def extract_page_info(self) -> List[PageInfo]: ... + def extract_fonts(self) -> List[str]: ... def extract_images(self, page_number: int) -> List[npt.NDArray[Any]]: ... 
def page_to_image(self, page_number: int, dpi: int = 150) -> npt.NDArray[Any]: ... def __del__(self) -> None: ... From ecd50a6f7bd422e3d5c7072271ab72b714b4f490 Mon Sep 17 00:00:00 2001 From: Matthijs Wesseling Date: Mon, 30 Oct 2023 11:41:04 +0100 Subject: [PATCH 02/10] Move class def into header --- src/xpdf-4.04/xpdf/TextOutputDev.cc | 35 +++--------------------- src/xpdf-4.04/xpdf/TextOutputDev.h | 42 +++++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 34 deletions(-) diff --git a/src/xpdf-4.04/xpdf/TextOutputDev.cc b/src/xpdf-4.04/xpdf/TextOutputDev.cc index 0411db8..c957493 100644 --- a/src/xpdf-4.04/xpdf/TextOutputDev.cc +++ b/src/xpdf-4.04/xpdf/TextOutputDev.cc @@ -203,38 +203,6 @@ static inline double dmax(double x, double y) { // TextChar //------------------------------------------------------------------------ -class TextChar { -public: - - TextChar(Unicode cA, int charPosA, int charLenA, - double xMinA, double yMinA, double xMaxA, double yMaxA, - int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, - TextFontInfo *fontA, double fontSizeA, - double colorRA, double colorGA, double colorBA); - - static int cmpX(const void *p1, const void *p2); - static int cmpY(const void *p1, const void *p2); - static int cmpCharPos(const void *p1, const void *p2); - - Unicode c; - int charPos; - int charLen; - double xMin, yMin, xMax, yMax; - TextFontInfo *font; - double fontSize; - double colorR, - colorG, - colorB; - - // group the byte-size fields to minimize object size - Guchar rot; - char rotated; - char clipped; - char invisible; - char spaceAfter; - char overlap; -}; - TextChar::TextChar(Unicode cA, int charPosA, int charLenA, double xMinA, double yMinA, double xMaxA, double yMaxA, int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, @@ -1484,6 +1452,9 @@ void TextPage::addChar(GfxState *state, double x, double y, // add the characters if (uBufLen > 0) { + Ref *ref = state->getFont()->getID(); + fprintf(stderr, "Adding %d characters of 
font (%d, %d) at size %.1f\n", uBufLen, ref->gen, ref->num, state->getFontSize()); + // handle right-to-left ligatures: if there are multiple Unicode // characters, and they're all right-to-left, insert them in // right-to-left order diff --git a/src/xpdf-4.04/xpdf/TextOutputDev.h b/src/xpdf-4.04/xpdf/TextOutputDev.h index 302975c..8017cf6 100644 --- a/src/xpdf-4.04/xpdf/TextOutputDev.h +++ b/src/xpdf-4.04/xpdf/TextOutputDev.h @@ -34,6 +34,42 @@ class TextPage; typedef void (*TextOutputFunc)(void *stream, const char *text, int len); +//------------------------------------------------------------------------ +// TextChar +//------------------------------------------------------------------------ + +class TextChar { +public: + + TextChar(Unicode cA, int charPosA, int charLenA, + double xMinA, double yMinA, double xMaxA, double yMaxA, + int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, + TextFontInfo *fontA, double fontSizeA, + double colorRA, double colorGA, double colorBA); + + static int cmpX(const void *p1, const void *p2); + static int cmpY(const void *p1, const void *p2); + static int cmpCharPos(const void *p1, const void *p2); + + Unicode c; + int charPos; + int charLen; + double xMin, yMin, xMax, yMax; + TextFontInfo *font; + double fontSize; + double colorR, + colorG, + colorB; + + // group the byte-size fields to minimize object size + Guchar rot; + char rotated; + char clipped; + char invisible; + char spaceAfter; + char overlap; +}; + //------------------------------------------------------------------------ // TextOutputControl //------------------------------------------------------------------------ @@ -784,6 +820,10 @@ class TextOutputDev: public OutputDev { // Turn extra processing for HTML conversion on or off. 
void enableHTMLExtras(GBool html) { control.html = html; } +protected: + TextPage *text; // text for the current page + TextOutputControl control; // formatting parameters + private: void generateBOM(); @@ -792,8 +832,6 @@ class TextOutputDev: public OutputDev { void *outputStream; // output stream GBool needClose; // need to close the output file? // (only if outputStream is a FILE*) - TextPage *text; // text for the current page - TextOutputControl control; // formatting parameters GBool ok; // set up ok? }; From 6cf2a6689468906e935875b10ac6d03c46b58ba8 Mon Sep 17 00:00:00 2001 From: Matthijs Wesseling Date: Mon, 30 Oct 2023 13:38:43 +0100 Subject: [PATCH 03/10] Add fontoutputdev --- src/xpydf/FontOutputDev.cc | 10 ++++++++++ src/xpydf/FontOutputDev.h | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 src/xpydf/FontOutputDev.cc create mode 100644 src/xpydf/FontOutputDev.h diff --git a/src/xpydf/FontOutputDev.cc b/src/xpydf/FontOutputDev.cc new file mode 100644 index 0000000..18d5b2a --- /dev/null +++ b/src/xpydf/FontOutputDev.cc @@ -0,0 +1,10 @@ +#include + +#include "GList.h" +#include "GlobalParams.h" +#include "UnicodeRemapping.h" +#include "UnicodeTypeTable.h" +#include "GfxState.h" + +#include "FontOutputDev.h" + diff --git a/src/xpydf/FontOutputDev.h b/src/xpydf/FontOutputDev.h new file mode 100644 index 0000000..b0ac755 --- /dev/null +++ b/src/xpydf/FontOutputDev.h @@ -0,0 +1,40 @@ +#ifndef FONT_OUTPUT_DEV_H +#define FONT_OUTPUT_DEV_H + +#include "TextOutputDev.h" + +typedef TextChar* (*CharConstructor)(Unicode cA, int charPosA, int charLenA, + double xMinA, double yMinA, double xMaxA, double yMaxA, + int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, + TextFontInfo *fontA, double fontSizeA, + double colorRA, double colorGA, double colorBA); + +class TextPageFont: public TextPage { +public: + TextPageFont(TextOutputControl *controlA) : TextPage(controlA) {}; + + void TextPageFont::addChar(GfxState 
*state, double x, double y, + double dx, double dy, + CharCode c, int nBytes, Unicode *u, int uLen); + + TextChar *TextCharType(Unicode cA, int charPosA, int charLenA, + double xMinA, double yMinA, double xMaxA, double yMaxA, + int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, + TextFontInfo *fontA, double fontSizeA, + double colorRA, double colorGA, double colorBA) { + return new TextChar(cA, charPosA, charLenA, xMinA, yMinA, xMaxA, yMaxA, + rotA, rotatedA, clippedA, invisibleA, fontA, fontSizeA, + colorRA, colorGA, colorBA); + } +}; + +class FontOutputDev: public TextOutputDev { +public: + FontOutputDev(TextOutputFunc func, void *stream, + TextOutputControl *controlA) : TextOutputDev(func, stream, controlA) { + delete text; + text = new TextPageFont(&control); + } +}; + +#endif From 8ed1b8759dac375fb1f9f8f92f90fbce51c6cb0b Mon Sep 17 00:00:00 2001 From: Matthijs Wesseling Date: Mon, 30 Oct 2023 15:00:18 +0100 Subject: [PATCH 04/10] Move TextChar to header, allow for overloading TextChar construction --- src/xpdf-4.04/xpdf/TextOutputDev.cc | 17 +++--- src/xpdf-4.04/xpdf/TextOutputDev.h | 84 ++++++++++++++++------------- 2 files changed, 55 insertions(+), 46 deletions(-) diff --git a/src/xpdf-4.04/xpdf/TextOutputDev.cc b/src/xpdf-4.04/xpdf/TextOutputDev.cc index c957493..9a59da5 100644 --- a/src/xpdf-4.04/xpdf/TextOutputDev.cc +++ b/src/xpdf-4.04/xpdf/TextOutputDev.cc @@ -1452,9 +1452,6 @@ void TextPage::addChar(GfxState *state, double x, double y, // add the characters if (uBufLen > 0) { - Ref *ref = state->getFont()->getID(); - fprintf(stderr, "Adding %d characters of font (%d, %d) at size %.1f\n", uBufLen, ref->gen, ref->num, state->getFontSize()); - // handle right-to-left ligatures: if there are multiple Unicode // characters, and they're all right-to-left, insert them in // right-to-left order @@ -1530,13 +1527,13 @@ void TextPage::addChar(GfxState *state, double x, double y, } else { j = i; } - chars->append(new TextChar(uBuf[j], charPos, nBytes, - 
xMin, yMin, xMax, yMax, - curRot, rotated, clipped, - state->getRender() == 3 || alpha < 0.001, - curFont, curFontSize, - colToDbl(rgb.r), colToDbl(rgb.g), - colToDbl(rgb.b))); + chars->append(textCharType(uBuf[j], charPos, nBytes, + xMin, yMin, xMax, yMax, + curRot, rotated, clipped, + state->getRender() == 3 || alpha < 0.001, + curFont, curFontSize, + colToDbl(rgb.r), colToDbl(rgb.g), + colToDbl(rgb.b))); } } diff --git a/src/xpdf-4.04/xpdf/TextOutputDev.h b/src/xpdf-4.04/xpdf/TextOutputDev.h index 8017cf6..2f8021b 100644 --- a/src/xpdf-4.04/xpdf/TextOutputDev.h +++ b/src/xpdf-4.04/xpdf/TextOutputDev.h @@ -34,42 +34,6 @@ class TextPage; typedef void (*TextOutputFunc)(void *stream, const char *text, int len); -//------------------------------------------------------------------------ -// TextChar -//------------------------------------------------------------------------ - -class TextChar { -public: - - TextChar(Unicode cA, int charPosA, int charLenA, - double xMinA, double yMinA, double xMaxA, double yMaxA, - int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, - TextFontInfo *fontA, double fontSizeA, - double colorRA, double colorGA, double colorBA); - - static int cmpX(const void *p1, const void *p2); - static int cmpY(const void *p1, const void *p2); - static int cmpCharPos(const void *p1, const void *p2); - - Unicode c; - int charPos; - int charLen; - double xMin, yMin, xMax, yMax; - TextFontInfo *font; - double fontSize; - double colorR, - colorG, - colorB; - - // group the byte-size fields to minimize object size - Guchar rot; - char rotated; - char clipped; - char invisible; - char spaceAfter; - char overlap; -}; - //------------------------------------------------------------------------ // TextOutputControl //------------------------------------------------------------------------ @@ -173,6 +137,42 @@ class TextFontInfo { friend class TextWord; }; +//------------------------------------------------------------------------ +// TextChar 
+//------------------------------------------------------------------------ + +class TextChar { +public: + + TextChar(Unicode cA, int charPosA, int charLenA, + double xMinA, double yMinA, double xMaxA, double yMaxA, + int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, + TextFontInfo *fontA, double fontSizeA, + double colorRA, double colorGA, double colorBA); + + static int cmpX(const void *p1, const void *p2); + static int cmpY(const void *p1, const void *p2); + static int cmpCharPos(const void *p1, const void *p2); + + Unicode c; + int charPos; + int charLen; + double xMin, yMin, xMax, yMax; + TextFontInfo *font; + double fontSize; + double colorR, + colorG, + colorB; + + // group the byte-size fields to minimize object size + Guchar rot; + char rotated; + char clipped; + char invisible; + char spaceAfter; + char overlap; +}; + //------------------------------------------------------------------------ // TextWord //------------------------------------------------------------------------ @@ -528,6 +528,18 @@ class TextPage { void removeChars(double xMin, double yMin, double xMax, double yMax, double xOverlapThresh, double yOverlapThresh); +protected: + TextChar *textCharType(Unicode cA, int charPosA, int charLenA, + double xMinA, double yMinA, double xMaxA, double yMaxA, + int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, + TextFontInfo *fontA, double fontSizeA, + double colorRA, double colorGA, double colorBA) + { + return new TextChar(cA, charPosA, charLenA, xMinA, yMinA, xMaxA, yMaxA, + rotA, rotatedA, clippedA, invisibleA, fontA, fontSizeA, + colorRA, colorGA, colorBA); + } + private: void startPage(GfxState *state); From 232c8c9a7026b28304cb6d7d90425b8b279f3e30 Mon Sep 17 00:00:00 2001 From: Matthijs Wesseling Date: Mon, 30 Oct 2023 15:00:44 +0100 Subject: [PATCH 05/10] Implement fontoutputdev --- src/xpydf/FontOutputDev.cc | 36 ++++++++++++++++++++++++++++++++++++ src/xpydf/FontOutputDev.h | 33 ++++++++++++++++----------------- 2 files 
changed, 52 insertions(+), 17 deletions(-) diff --git a/src/xpydf/FontOutputDev.cc b/src/xpydf/FontOutputDev.cc index 18d5b2a..c624f89 100644 --- a/src/xpydf/FontOutputDev.cc +++ b/src/xpydf/FontOutputDev.cc @@ -8,3 +8,39 @@ #include "FontOutputDev.h" + +TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA, + double xMinA, double yMinA, double xMaxA, double yMaxA, + int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, + TextFontInfo *fontA, double fontSizeA, + double colorRA, double colorGA, double colorBA) +{ + GString *name = fontA->getFontName(); + Unicode fontId = 0; + + if (name) { + char fontCode[1000], fontName[1000], fontType[1000]; + + if (sscanf(name->getCString(), "%[^+]+%[^-]-%s", fontCode, fontName, fontType) != EOF) { + if (fontNameIds.find(fontName) == fontNameIds.end()) { + fontNameIds[fontName] = fontNameIds.size(); + } + + if (fontTypeIds.find(fontType) == fontTypeIds.end()) { + fontTypeIds[fontType] = fontTypeIds.size(); + } + + FontSpec spec = {fontNameIds[fontName], fontTypeIds[fontType], fontSizeA}; + + if (fontIds.find(spec) == fontIds.end()) { + fontIds[spec] = fontIds.size() + 1; + } + + fontId = fontIds[spec]; + } + } + + return new TextChar(fontId, charPosA, charLenA, xMinA, yMinA, xMaxA, yMaxA, + rotA, rotatedA, clippedA, invisibleA, fontA, fontSizeA, + colorRA, colorGA, colorBA); +} diff --git a/src/xpydf/FontOutputDev.h b/src/xpydf/FontOutputDev.h index b0ac755..edf0c12 100644 --- a/src/xpydf/FontOutputDev.h +++ b/src/xpydf/FontOutputDev.h @@ -1,33 +1,32 @@ #ifndef FONT_OUTPUT_DEV_H #define FONT_OUTPUT_DEV_H -#include "TextOutputDev.h" +#include +#include -typedef TextChar* (*CharConstructor)(Unicode cA, int charPosA, int charLenA, - double xMinA, double yMinA, double xMaxA, double yMaxA, - int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, - TextFontInfo *fontA, double fontSizeA, - double colorRA, double colorGA, double colorBA); +#include "TextOutputDev.h" class TextPageFont: public TextPage { 
public: - TextPageFont(TextOutputControl *controlA) : TextPage(controlA) {}; - - void TextPageFont::addChar(GfxState *state, double x, double y, - double dx, double dy, - CharCode c, int nBytes, Unicode *u, int uLen); + TextPageFont(TextOutputControl *controlA) : TextPage(controlA) { }; - TextChar *TextCharType(Unicode cA, int charPosA, int charLenA, +protected: + TextChar *textCharType(Unicode cA, int charPosA, int charLenA, double xMinA, double yMinA, double xMaxA, double yMaxA, int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, TextFontInfo *fontA, double fontSizeA, - double colorRA, double colorGA, double colorBA) { - return new TextChar(cA, charPosA, charLenA, xMinA, yMinA, xMaxA, yMaxA, - rotA, rotatedA, clippedA, invisibleA, fontA, fontSizeA, - colorRA, colorGA, colorBA); - } + double colorRA, double colorGA, double colorBA); +private: + std::map fontNameIds; + std::map fontTypeIds; + std::map fontIds; }; +typedef struct FontSpec { + unsigned int fontNameId, fontTypeId; + int fontSize; +} FontSpec; + class FontOutputDev: public TextOutputDev { public: FontOutputDev(TextOutputFunc func, void *stream, From 3a6a5192c753788338a682b7ff8bbc52180b16a1 Mon Sep 17 00:00:00 2001 From: Matthijs Wesseling Date: Mon, 30 Oct 2023 15:01:01 +0100 Subject: [PATCH 06/10] Add fontmap functionality --- src/xpydf/PdfLoader.cc | 115 ++++++++++++++++++++++++----------------- src/xpydf/PdfLoader.h | 1 + 2 files changed, 70 insertions(+), 46 deletions(-) diff --git a/src/xpydf/PdfLoader.cc b/src/xpydf/PdfLoader.cc index 4066bfa..01070ed 100644 --- a/src/xpydf/PdfLoader.cc +++ b/src/xpydf/PdfLoader.cc @@ -1,4 +1,6 @@ #include +#include +#include #include #include #include @@ -32,6 +34,7 @@ #include "AcroForm.h" #include "PdfLoader.h" +#include "FontOutputDev.h" #include "ImageDataDev.h" #include "ImageInfoDev.h" @@ -136,6 +139,39 @@ std::vector PdfLoader::extractText() { return pages; } +std::vector PdfLoader::extractFontMap() { + FontOutputDev *fontOut; + std::stringstream 
*stream = new std::stringstream(); + std::vector pages; + int firstPage, lastPage; + + if (!doc->isOk()) { + goto err; + } + + firstPage = 1; + lastPage = doc->getNumPages(); + + fontOut = new FontOutputDev(&outputToStringStream, stream, &textOutControl); + + if (fontOut->isOk()) { + for (int page = firstPage; page <= lastPage; page++) { + stream->str(""); + doc->displayPages(fontOut, page, page, 72, 72, 0, gFalse, gTrue, gFalse); + pages.push_back(stream->str()); + } + } + + delete fontOut; +err: + delete stream; + + Object::memCheck(stderr); + gMemReport(stderr); + + return pages; +} + std::vector PdfLoader::extractPageInfo() { ImageInfoDev *imageOut; int firstPage, lastPage; @@ -172,7 +208,7 @@ std::vector PdfLoader::extractPageInfo() { } delete imageOut; - err: +err: Object::memCheck(stderr); gMemReport(stderr); @@ -210,28 +246,22 @@ void PdfLoader::scanFonts(Dict *resDict) { gfxFontDict = NULL; resDict->lookupNF("Font", &fontDict1); if (checkFontObject(&fontDict1, &fontDict2) && fontDict2.isDict()) { - fprintf(stderr, "checkFontObject success\n"); if (fontDict1.isRef()) { - fprintf(stderr, "isRef\n"); r = fontDict1.getRef(); gfxFontDict = new GfxFontDict(doc->getXRef(), &r, fontDict2.getDict()); } else { - fprintf(stderr, "noRef\n"); gfxFontDict = new GfxFontDict(doc->getXRef(), NULL, fontDict2.getDict()); } if (gfxFontDict) { - fprintf(stderr, "gfxFontDict\n"); for (i = 0; i < gfxFontDict->getNumFonts(); ++i) { if ((font = gfxFontDict->getFont(i))) { - fprintf(stderr, "Scanning font:\n"); scanFont(font); } } delete gfxFontDict; } - } else { - fprintf(stderr, "Sad :(\n"); } + fontDict2.free(); fontDict1.free(); @@ -301,12 +331,12 @@ void PdfLoader::scanFonts(Dict *resDict) { gsDict1.free(); } +std::map> fontDict; + void PdfLoader::scanFont(GfxFont *font) { - Ref fontRef, embRef; + Ref fontRef; Object fontObj, toUnicodeObj; GString *name; - GBool emb, subset, hasToUnicode; - GfxFontLoc *loc; int i; fontRef = *font->getID(); @@ -321,42 +351,18 @@ void 
PdfLoader::scanFont(GfxFont *font) { // font name name = font->getName(); - // check for an embedded font - if (font->getType() == fontType3) { - emb = gTrue; - } else { - emb = font->getEmbeddedFontID(&embRef); - } - - // look for a ToUnicode map - hasToUnicode = gFalse; - if (doc->getXRef()->fetch(fontRef.num, fontRef.gen, &fontObj)->isDict()) { - hasToUnicode = fontObj.dictLookup("ToUnicode", &toUnicodeObj)->isStream(); - toUnicodeObj.free(); - } - fontObj.free(); - - // check for a font subset name: capital letters followed by a '+' - // sign - subset = gFalse; + // print the font info if (name) { - for (i = 0; i < name->getLength(); ++i) { - if (name->getChar(i) < 'A' || name->getChar(i) > 'Z') { - break; + char fontCode[1000], fontName[1000], fontType[1000]; + + if (sscanf(name->getCString(), "%[^+]+%[^-]-%s", fontCode, fontName, fontType) != EOF) { + if (fontDict.find(fontName) == fontDict.end()) { + fontDict[fontName] = std::set(); } - } - subset = i > 0 && i < name->getLength() && name->getChar(i) == '+'; - } - // print the font info - printf("%-46s", - name ? 
name->getCString() : "[none]"); - if (fontRef.gen >= 100000) { - printf(" [none]"); - } else { - printf(" %6d %2d", fontRef.num, fontRef.gen); + fontDict[fontName].insert(fontType); + } } - printf("\n"); // add this font to the list if (fontsLen == fontsSize) { @@ -395,7 +401,6 @@ GBool PdfLoader::checkFontObject(Object *in, Object *out) { std::vector PdfLoader::extractFonts() { int firstPage, lastPage; std::vector fontInfo; - char dummyFile[1] = ""; Dict *resDict; Annots *annots; @@ -409,11 +414,17 @@ std::vector PdfLoader::extractFonts() { firstPage = 1; lastPage = doc->getNumPages(); + fonts = NULL; + fontsLen = fontsSize = 0; + numObjects = doc->getXRef()->getNumObjects(); + seenObjs = (char *)gmalloc(numObjects); + memset(seenObjs, 0, numObjects); + for (int page = firstPage; page <= lastPage; page++) { Page *pdfPage = doc->getCatalog()->getPage(page); if ((resDict = pdfPage->getResourceDict())) { - fprintf(stderr, "scanFonts\n"); + // fprintf(stderr, "scanFonts\n"); scanFonts(resDict); } @@ -446,6 +457,18 @@ std::vector PdfLoader::extractFonts() { } } + for (auto pair : fontDict) { + fprintf(stderr, "%s has types:\n", pair.first.c_str()); + for (auto ft : pair.second) { + fprintf(stderr, " - %s\n", ft.c_str()); + } + } + + fprintf(stderr, "Found %d fonts\n", fontsSize); + + gfree(fonts); + gfree(seenObjs); + err: Object::memCheck(stderr); gMemReport(stderr); @@ -512,7 +535,7 @@ Image PdfLoader::pageToImage(int pageNum, int dpi) { memcpy(pageImage.data, bitmap->getDataPtr(), pageImage.size); delete splashOut; - err: +err: Object::memCheck(stderr); gMemReport(stderr); diff --git a/src/xpydf/PdfLoader.h b/src/xpydf/PdfLoader.h index 1386b9e..5115047 100644 --- a/src/xpydf/PdfLoader.h +++ b/src/xpydf/PdfLoader.h @@ -36,6 +36,7 @@ class PdfLoader { PdfLoader(LoaderConfig config, char *fileName, char *ownerPw = NULL, char *userPw = NULL); ~PdfLoader(); std::vector extractText(); + std::vector extractFontMap(); std::vector extractPageInfo(); std::vector 
extractFonts(); std::vector extractImages(int pageNum); From 60977d8140f502dd98205cbad1e4094052e4808a Mon Sep 17 00:00:00 2001 From: Matthijs Wesseling Date: Mon, 30 Oct 2023 15:27:27 +0100 Subject: [PATCH 07/10] Seems to work --- Makefile | 2 +- src/xpdf-4.04/xpdf/TextOutputDev.h | 15 ++++++++------- src/xpydf/FontOutputDev.cc | 11 ++++++++++- src/xpydf/FontOutputDev.h | 11 +++++------ 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index c28dc5c..1071091 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ FOFI_DIR = $(XDIR)fofi/ GOO_DIR = $(XDIR)goo/ XPDF_DIR = $(XDIR)xpdf/ -PY_SRC = src/xpydf/PdfLoader.cc src/xpydf/ImageInfoDev.cc src/xpydf/ImageDataDev.cc +PY_SRC = src/xpydf/PdfLoader.cc src/xpydf/ImageInfoDev.cc src/xpydf/ImageDataDev.cc src/xpydf/FontOutputDev.cc SPLASH_SRC = $(wildcard $(SPLASH_DIR)*.cc) FOFI_SRC = $(wildcard $(FOFI_DIR)*.cc) diff --git a/src/xpdf-4.04/xpdf/TextOutputDev.h b/src/xpdf-4.04/xpdf/TextOutputDev.h index 2f8021b..7ba70cf 100644 --- a/src/xpdf-4.04/xpdf/TextOutputDev.h +++ b/src/xpdf-4.04/xpdf/TextOutputDev.h @@ -422,7 +422,7 @@ class TextPage { public: TextPage(TextOutputControl *controlA); - ~TextPage(); + virtual ~TextPage(); // Write contents of page to a stream. 
void write(void *outputStream, TextOutputFunc outputFunc); @@ -529,25 +529,26 @@ class TextPage { double xOverlapThresh, double yOverlapThresh); protected: - TextChar *textCharType(Unicode cA, int charPosA, int charLenA, + virtual TextChar *textCharType(Unicode cA, int charPosA, int charLenA, double xMinA, double yMinA, double xMaxA, double yMaxA, int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, TextFontInfo *fontA, double fontSizeA, - double colorRA, double colorGA, double colorBA) - { + double colorRA, double colorGA, double colorBA + ) { return new TextChar(cA, charPosA, charLenA, xMinA, yMinA, xMaxA, yMaxA, rotA, rotatedA, clippedA, invisibleA, fontA, fontSizeA, colorRA, colorGA, colorBA); } + void addChar(GfxState *state, double x, double y, + double dx, double dy, + CharCode c, int nBytes, Unicode *u, int uLen); + private: void startPage(GfxState *state); void clear(); void updateFont(GfxState *state); - void addChar(GfxState *state, double x, double y, - double dx, double dy, - CharCode c, int nBytes, Unicode *u, int uLen); void incCharCount(int nChars); void beginActualText(GfxState *state, Unicode *u, int uLen); void endActualText(GfxState *state); diff --git a/src/xpydf/FontOutputDev.cc b/src/xpydf/FontOutputDev.cc index c624f89..adbaf2e 100644 --- a/src/xpydf/FontOutputDev.cc +++ b/src/xpydf/FontOutputDev.cc @@ -9,6 +9,15 @@ #include "FontOutputDev.h" +bool operator<(const FontSpec& l, const FontSpec& r) { + return ( + l.fontNameId < r.fontNameId + || (l.fontNameId == r.fontNameId && l.fontTypeId < r.fontTypeId) + || (l.fontNameId == r.fontNameId && l.fontTypeId == r.fontTypeId && l.fontSize < r.fontSize) + ); +} + + TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA, double xMinA, double yMinA, double xMaxA, double yMaxA, int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, @@ -30,7 +39,7 @@ TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA, fontTypeIds[fontType] = fontTypeIds.size(); 
} - FontSpec spec = {fontNameIds[fontName], fontTypeIds[fontType], fontSizeA}; + FontSpec spec = {fontNameIds[fontName], fontTypeIds[fontType], (unsigned int)fontSizeA}; if (fontIds.find(spec) == fontIds.end()) { fontIds[spec] = fontIds.size() + 1; diff --git a/src/xpydf/FontOutputDev.h b/src/xpydf/FontOutputDev.h index edf0c12..176ef38 100644 --- a/src/xpydf/FontOutputDev.h +++ b/src/xpydf/FontOutputDev.h @@ -6,9 +6,13 @@ #include "TextOutputDev.h" +typedef struct FontSpec { + unsigned int fontNameId, fontTypeId, fontSize; +} FontSpec; + class TextPageFont: public TextPage { public: - TextPageFont(TextOutputControl *controlA) : TextPage(controlA) { }; + TextPageFont(TextOutputControl *controlA) : TextPage(controlA) {}; protected: TextChar *textCharType(Unicode cA, int charPosA, int charLenA, @@ -22,11 +26,6 @@ class TextPageFont: public TextPage { std::map fontIds; }; -typedef struct FontSpec { - unsigned int fontNameId, fontTypeId; - int fontSize; -} FontSpec; - class FontOutputDev: public TextOutputDev { public: FontOutputDev(TextOutputFunc func, void *stream, From 89a43ff98dd3c5eca0627e1b49c5af5f35e6f41d Mon Sep 17 00:00:00 2001 From: Matthijs Wesseling Date: Mon, 30 Oct 2023 16:49:13 +0100 Subject: [PATCH 08/10] Replace encoder --- src/xpdf-4.04/xpdf/TextOutputDev.cc | 2 ++ src/xpdf-4.04/xpdf/TextOutputDev.h | 32 +++++++++++++++-------------- src/xpydf/FontOutputDev.cc | 25 +++++++++++++++++++++- src/xpydf/FontOutputDev.h | 25 +++++++++++++++++++--- 4 files changed, 65 insertions(+), 19 deletions(-) diff --git a/src/xpdf-4.04/xpdf/TextOutputDev.cc b/src/xpdf-4.04/xpdf/TextOutputDev.cc index 9a59da5..aa06156 100644 --- a/src/xpdf-4.04/xpdf/TextOutputDev.cc +++ b/src/xpdf-4.04/xpdf/TextOutputDev.cc @@ -2249,6 +2249,8 @@ void TextPage::encodeFragment(Unicode *text, int len, UnicodeMap *uMap, int lreLen, rleLen, popdfLen, n; int i, j, k; + fprintf(stderr, "Old...\n"); + if (uMap->isUnicode()) { lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); diff --git 
a/src/xpdf-4.04/xpdf/TextOutputDev.h b/src/xpdf-4.04/xpdf/TextOutputDev.h index 7ba70cf..cdc2fd0 100644 --- a/src/xpdf-4.04/xpdf/TextOutputDev.h +++ b/src/xpdf-4.04/xpdf/TextOutputDev.h @@ -528,34 +528,26 @@ class TextPage { void removeChars(double xMin, double yMin, double xMax, double yMax, double xOverlapThresh, double yOverlapThresh); -protected: +public: virtual TextChar *textCharType(Unicode cA, int charPosA, int charLenA, double xMinA, double yMinA, double xMaxA, double yMaxA, int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, TextFontInfo *fontA, double fontSizeA, double colorRA, double colorGA, double colorBA ) { + fprintf(stderr, "Old textCharType\n"); return new TextChar(cA, charPosA, charLenA, xMinA, yMinA, xMaxA, yMaxA, rotA, rotatedA, clippedA, invisibleA, fontA, fontSizeA, colorRA, colorGA, colorBA); } + virtual void encodeFragment(Unicode *text, int len, UnicodeMap *uMap, + GBool primaryLR, GString *s); + void addChar(GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, Unicode *u, int uLen); -private: - - void startPage(GfxState *state); - void clear(); - void updateFont(GfxState *state); - void incCharCount(int nChars); - void beginActualText(GfxState *state, Unicode *u, int uLen); - void endActualText(GfxState *state); - void addUnderline(double x0, double y0, double x1, double y1); - void addLink(double xMin, double yMin, double xMax, double yMax, - Link *link); - // output void writeReadingOrder(void *outputStream, TextOutputFunc outputFunc, @@ -587,8 +579,18 @@ class TextPage { UnicodeMap *uMap, char *space, int spaceLen, char *eol, int eolLen); - void encodeFragment(Unicode *text, int len, UnicodeMap *uMap, - GBool primaryLR, GString *s); + +private: + + void startPage(GfxState *state); + void clear(); + void updateFont(GfxState *state); + void incCharCount(int nChars); + void beginActualText(GfxState *state, Unicode *u, int uLen); + void endActualText(GfxState *state); + void addUnderline(double x0, 
double y0, double x1, double y1); + void addLink(double xMin, double yMin, double xMax, double yMax, + Link *link); GBool unicodeEffectiveTypeLOrNum(Unicode u, Unicode left, Unicode right); GBool unicodeEffectiveTypeR(Unicode u, Unicode left, Unicode right); diff --git a/src/xpydf/FontOutputDev.cc b/src/xpydf/FontOutputDev.cc index adbaf2e..ec002b3 100644 --- a/src/xpydf/FontOutputDev.cc +++ b/src/xpydf/FontOutputDev.cc @@ -2,6 +2,7 @@ #include "GList.h" #include "GlobalParams.h" +#include "UnicodeMap.h" #include "UnicodeRemapping.h" #include "UnicodeTypeTable.h" #include "GfxState.h" @@ -17,6 +18,14 @@ bool operator<(const FontSpec& l, const FontSpec& r) { ); } +bool operator==(const FontSpec& l, const FontSpec& r) { + return (l.fontNameId == r.fontNameId && l.fontTypeId == r.fontTypeId && l.fontSize == r.fontSize); +} + +bool operator!=(const FontSpec& l, const FontSpec& r) { + return (l.fontNameId != r.fontNameId || l.fontTypeId != r.fontTypeId || l.fontSize != r.fontSize); +} + TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA, double xMinA, double yMinA, double xMaxA, double yMaxA, @@ -24,6 +33,7 @@ TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA, TextFontInfo *fontA, double fontSizeA, double colorRA, double colorGA, double colorBA) { + fprintf(stderr, "New textCharType\n"); GString *name = fontA->getFontName(); Unicode fontId = 0; @@ -42,7 +52,7 @@ TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA, FontSpec spec = {fontNameIds[fontName], fontTypeIds[fontType], (unsigned int)fontSizeA}; if (fontIds.find(spec) == fontIds.end()) { - fontIds[spec] = fontIds.size() + 1; + fontIds[spec] = fontIds.size() + 256; } fontId = fontIds[spec]; @@ -53,3 +63,16 @@ TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA, rotA, rotatedA, clippedA, invisibleA, fontA, fontSizeA, colorRA, colorGA, colorBA); } + +void TextPageFont::encodeFragment(Unicode *text, int len, UnicodeMap 
*uMap, GBool primaryLR, GString *s) { + char buf[8]; + + fprintf(stderr, "Yay\n"); + + for (int i = 0; i < len; ++i) { + for (int j = 0; j < sizeof(Unicode); j++) { + buf[j] = text[i] >> (j * 8); + } + s->append(buf, sizeof(Unicode)); + } +} diff --git a/src/xpydf/FontOutputDev.h b/src/xpydf/FontOutputDev.h index 176ef38..5cdd230 100644 --- a/src/xpydf/FontOutputDev.h +++ b/src/xpydf/FontOutputDev.h @@ -4,22 +4,34 @@ #include #include +#include "UnicodeMap.h" #include "TextOutputDev.h" typedef struct FontSpec { unsigned int fontNameId, fontTypeId, fontSize; } FontSpec; +bool operator<(const FontSpec& l, const FontSpec& r); +bool operator==(const FontSpec& l, const FontSpec& r); +bool operator!=(const FontSpec& l, const FontSpec& r); + class TextPageFont: public TextPage { public: - TextPageFont(TextOutputControl *controlA) : TextPage(controlA) {}; + TextPageFont(TextOutputControl *controlA) : TextPage(controlA) {fprintf(stderr, "Making TextPageFont\n");}; -protected: + std::map getFontIds() { + return fontIds; + } + +// protected: TextChar *textCharType(Unicode cA, int charPosA, int charLenA, double xMinA, double yMinA, double xMaxA, double yMaxA, int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, TextFontInfo *fontA, double fontSizeA, - double colorRA, double colorGA, double colorBA); + double colorRA, double colorGA, double colorBA) override; + + void encodeFragment(Unicode *text, int len, UnicodeMap *uMap, GBool primaryLR, GString *s) override; + private: std::map fontNameIds; std::map fontTypeIds; @@ -33,6 +45,13 @@ class FontOutputDev: public TextOutputDev { delete text; text = new TextPageFont(&control); } + + std::map getFontIds() { + return text->getFontIds(); + } + +protected: + TextPageFont *text; }; #endif From 8b225d45bb88be24927edb487fb30772f4b90bcf Mon Sep 17 00:00:00 2001 From: Matthijs Wesseling Date: Thu, 2 Nov 2023 14:44:39 +0100 Subject: [PATCH 09/10] Merge --- src/xpdf-4.04/xpdf/GfxFont.cc | 10 ++- src/xpdf-4.04/xpdf/GlobalParams.cc | 
17 +++++ src/xpdf-4.04/xpdf/GlobalParams.h | 3 + src/xpdf-4.04/xpdf/TextOutputDev.cc | 2 - src/xpdf-4.04/xpdf/TextOutputDev.h | 7 +- src/xpydf/FontOutputDev.cc | 64 +++++++++++++--- src/xpydf/FontOutputDev.h | 111 +++++++++++++++++++++++++--- src/xpydf/PdfLoader.cc | 77 ++++++++++++++++--- src/xpydf/PdfLoader.h | 4 +- src/xpydf/PdfLoaderWrapper.cc | 25 ++++++- src/xpydf/PyCppConversion.cc | 22 ++++++ src/xpydf/PyCppConversion.h | 3 + src/xpydf/cXpdfPython.pyi | 11 ++- src/xpydf/pdf_loader.py | 28 ++++++- src/xpydf/pdf_loader.pyi | 11 ++- 15 files changed, 341 insertions(+), 54 deletions(-) diff --git a/src/xpdf-4.04/xpdf/GfxFont.cc b/src/xpdf-4.04/xpdf/GfxFont.cc index 30e8bd1..d68aaec 100644 --- a/src/xpdf-4.04/xpdf/GfxFont.cc +++ b/src/xpdf-4.04/xpdf/GfxFont.cc @@ -1283,15 +1283,17 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GString *nameA, } } } - - // construct the char code -> Unicode mapping object + + // construct the char code -> Unicode mapping object ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode); // merge in a ToUnicode CMap, if there is one -- this overwrites // existing entries in ctu, i.e., the ToUnicode CMap takes // precedence, but the other encoding info is allowed to fill in any // holes - readToUnicodeCMap(fontDict, 8, ctu); + if (globalParams->getReadUnicodeCMap()) { + readToUnicodeCMap(fontDict, 8, ctu); + } // look for a Unicode-to-Unicode mapping if (name && (utu = globalParams->getUnicodeToUnicode(name))) { @@ -1423,7 +1425,7 @@ int Gfx8BitFont::getNextChar(char *s, int len, CharCode *code, *code = c = (CharCode)(*s & 0xff); *uLen = ctu->mapToUnicode(c, u, uSize); - *dx = widths[c]; + *dx = widths[c]; *dy = *ox = *oy = 0; return 1; } diff --git a/src/xpdf-4.04/xpdf/GlobalParams.cc b/src/xpdf-4.04/xpdf/GlobalParams.cc index 5fd4393..64deb76 100644 --- a/src/xpdf-4.04/xpdf/GlobalParams.cc +++ b/src/xpdf-4.04/xpdf/GlobalParams.cc @@ -749,6 +749,7 @@ GlobalParams::GlobalParams(const char *cfgFileName) { 
defaultPrinter = NULL; mapNumericCharNames = gTrue; mapUnknownCharNames = gFalse; + readUnicodeCMap = gTrue; mapExtTrueTypeFontsViaUnicode = gTrue; useTrueTypeUnicodeMapping = gFalse; droppedFonts = new GHash(gTrue); @@ -3268,6 +3269,16 @@ GBool GlobalParams::getMapUnknownCharNames() { return map; } +GBool GlobalParams::getReadUnicodeCMap() { + GBool read; + + lockGlobalParams; + read = readUnicodeCMap; + unlockGlobalParams; + + return read; +} + GBool GlobalParams::getMapExtTrueTypeFontsViaUnicode() { GBool map; @@ -3793,6 +3804,12 @@ void GlobalParams::setMapUnknownCharNames(GBool map) { unlockGlobalParams; } +void GlobalParams::setReadUnicodeCMap(GBool read) { + lockGlobalParams; + readUnicodeCMap = read; + unlockGlobalParams; +} + void GlobalParams::setMapExtTrueTypeFontsViaUnicode(GBool map) { lockGlobalParams; mapExtTrueTypeFontsViaUnicode = map; diff --git a/src/xpdf-4.04/xpdf/GlobalParams.h b/src/xpdf-4.04/xpdf/GlobalParams.h index b8299fc..fadac85 100644 --- a/src/xpdf-4.04/xpdf/GlobalParams.h +++ b/src/xpdf-4.04/xpdf/GlobalParams.h @@ -338,6 +338,7 @@ class GlobalParams { GString *getDefaultPrinter(); GBool getMapNumericCharNames(); GBool getMapUnknownCharNames(); + GBool getReadUnicodeCMap(); GBool getMapExtTrueTypeFontsViaUnicode(); GBool getUseTrueTypeUnicodeMapping(); GBool isDroppedFont(const char *fontName); @@ -400,6 +401,7 @@ class GlobalParams { void setOverprintPreview(GBool preview); void setMapNumericCharNames(GBool map); void setMapUnknownCharNames(GBool map); + void setReadUnicodeCMap(GBool map); void setMapExtTrueTypeFontsViaUnicode(GBool map); void setTabStateFile(char *tabStateFileA); void setPrintCommands(GBool printCommandsA); @@ -589,6 +591,7 @@ class GlobalParams { // from the viewer) GBool mapNumericCharNames; // map numeric char names (from font subsets)? GBool mapUnknownCharNames; // map unknown char names? + GBool readUnicodeCMap; // Read the unicode c map? 
GBool mapExtTrueTypeFontsViaUnicode; // map char codes to GID via Unicode // for external TrueType fonts? GBool useTrueTypeUnicodeMapping; // use the Unicode cmaps in TrueType diff --git a/src/xpdf-4.04/xpdf/TextOutputDev.cc b/src/xpdf-4.04/xpdf/TextOutputDev.cc index aa06156..9a59da5 100644 --- a/src/xpdf-4.04/xpdf/TextOutputDev.cc +++ b/src/xpdf-4.04/xpdf/TextOutputDev.cc @@ -2249,8 +2249,6 @@ void TextPage::encodeFragment(Unicode *text, int len, UnicodeMap *uMap, int lreLen, rleLen, popdfLen, n; int i, j, k; - fprintf(stderr, "Old...\n"); - if (uMap->isUnicode()) { lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); diff --git a/src/xpdf-4.04/xpdf/TextOutputDev.h b/src/xpdf-4.04/xpdf/TextOutputDev.h index cdc2fd0..5413b1c 100644 --- a/src/xpdf-4.04/xpdf/TextOutputDev.h +++ b/src/xpdf-4.04/xpdf/TextOutputDev.h @@ -271,7 +271,7 @@ class TextLine { double getEdge(int idx) { return edge[idx]; } GBool getHyphenated() { return hyphenated; } -private: +public: static int cmpX(const void *p1, const void *p2); @@ -535,15 +535,15 @@ class TextPage { TextFontInfo *fontA, double fontSizeA, double colorRA, double colorGA, double colorBA ) { - fprintf(stderr, "Old textCharType\n"); return new TextChar(cA, charPosA, charLenA, xMinA, yMinA, xMaxA, yMaxA, rotA, rotatedA, clippedA, invisibleA, fontA, fontSizeA, colorRA, colorGA, colorBA); } - virtual void encodeFragment(Unicode *text, int len, UnicodeMap *uMap, GBool primaryLR, GString *s); + virtual void computeLinePhysWidth(TextLine *line, UnicodeMap *uMap); + void addChar(GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, Unicode *u, int uLen); @@ -644,7 +644,6 @@ class TextPage { int getCharDirection(TextChar *ch, TextChar *left, TextChar *right); int assignPhysLayoutPositions(GList *columns); void assignLinePhysPositions(GList *columns); - void computeLinePhysWidth(TextLine *line, UnicodeMap *uMap); int assignColumnPhysPositions(GList *columns); void buildSuperLines(TextBlock *blk, GList 
*superLines); void assignSimpleLayoutPositions(GList *superLines, UnicodeMap *uMap); diff --git a/src/xpydf/FontOutputDev.cc b/src/xpydf/FontOutputDev.cc index ec002b3..a7afc98 100644 --- a/src/xpydf/FontOutputDev.cc +++ b/src/xpydf/FontOutputDev.cc @@ -1,5 +1,6 @@ #include +#include "Error.h" #include "GList.h" #include "GlobalParams.h" #include "UnicodeMap.h" @@ -33,11 +34,18 @@ TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA, TextFontInfo *fontA, double fontSizeA, double colorRA, double colorGA, double colorBA) { - fprintf(stderr, "New textCharType\n"); GString *name = fontA->getFontName(); - Unicode fontId = 0; + Unicode fontId = FONT_UNKNOWN; - if (name) { + char buf[8]; + int n = uMap->mapUnicode(cA, buf, sizeof(buf)); + + // fprintf(stderr, "Char [%c] font [%s] size [%.0f] color [%.1f, %.1f, %.1f]\n", buf[0], name->getCString(), fontSizeA, colorRA, colorGA, colorBA); + + if (n == 0) { + fontId = FONT_INVALID; + } + else if (name) { char fontCode[1000], fontName[1000], fontType[1000]; if (sscanf(name->getCString(), "%[^+]+%[^-]-%s", fontCode, fontName, fontType) != EOF) { @@ -51,14 +59,25 @@ TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA, FontSpec spec = {fontNameIds[fontName], fontTypeIds[fontType], (unsigned int)fontSizeA}; - if (fontIds.find(spec) == fontIds.end()) { - fontIds[spec] = fontIds.size() + 256; + if (fontSpecIds.find(spec) == fontSpecIds.end()) { + if (availableIds.size() == 0) { + fprintf(stderr, "ERROR: Font id overflow\n"); + error(errInternal, -1, "ERROR: Font id overflow"); + } + fontSpecIds[spec] = availableIds.front(); + availableIds.pop(); } - fontId = fontIds[spec]; + fontId = fontSpecIds[spec]; } } + // fprintf(stderr, "fontId %d >> ", fontId); + + fontId = fontId ^ (n << 16); + + // fprintf(stderr, "%d (%d , %d)\n", fontId, (char)fontId, (fontId >> 16)); + return new TextChar(fontId, charPosA, charLenA, xMinA, yMinA, xMaxA, yMaxA, rotA, rotatedA, clippedA, invisibleA, fontA, 
fontSizeA, colorRA, colorGA, colorBA); @@ -66,13 +85,36 @@ TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA, void TextPageFont::encodeFragment(Unicode *text, int len, UnicodeMap *uMap, GBool primaryLR, GString *s) { char buf[8]; - - fprintf(stderr, "Yay\n"); + int n; for (int i = 0; i < len; ++i) { - for (int j = 0; j < sizeof(Unicode); j++) { - buf[j] = text[i] >> (j * 8); + if (text[i] != FONT_INVALID) { + n = (text[i] >> 16); + if (n == 0) n = 1; + buf[0] = (char)text[i]; + for (int j = 0; j < n; j++) { + s->append(buf, 1); + } } - s->append(buf, sizeof(Unicode)); } + + // fprintf(stderr, "Encoded %d\n", s->getLength()); +} + +void TextPageFont::computeLinePhysWidth(TextLine *line, UnicodeMap *uMap) { + int n; + if (uMap->isUnicode()) { + line->pw = line->len; + } else { + line->pw = 0; + for (int i = 0; i < line->len; ++i) { + if (line->text[i] != FONT_INVALID) { + n = (line->text[i] >> 16); + if (n == 0) n = 1; + line->pw += n; + } + } + } + + // fprintf(stderr, "Line length %d // %s\n", line->pw, line->text); } diff --git a/src/xpydf/FontOutputDev.h b/src/xpydf/FontOutputDev.h index 5cdd230..183a061 100644 --- a/src/xpydf/FontOutputDev.h +++ b/src/xpydf/FontOutputDev.h @@ -2,25 +2,107 @@ #define FONT_OUTPUT_DEV_H #include +#include #include #include "UnicodeMap.h" #include "TextOutputDev.h" +#define FONT_INVALID 256 +#define FONT_UNKNOWN 255 + typedef struct FontSpec { unsigned int fontNameId, fontTypeId, fontSize; } FontSpec; +typedef struct NamedFontSpec { + std::string fontName, fontType; + unsigned int fontSize; +} NamedFontSpec; + bool operator<(const FontSpec& l, const FontSpec& r); bool operator==(const FontSpec& l, const FontSpec& r); bool operator!=(const FontSpec& l, const FontSpec& r); class TextPageFont: public TextPage { public: - TextPageFont(TextOutputControl *controlA) : TextPage(controlA) {fprintf(stderr, "Making TextPageFont\n");}; + TextPageFont(TextOutputControl *controlA) : TextPage(controlA) { + if (!(uMap = 
globalParams->getTextEncoding())) { + fprintf(stderr, "WARNING: Encoding not found"); + return; + } + + spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); + eolLen = 0; // make gcc happy + + switch (globalParams->getTextEOL()) { + case eolUnix: + eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol)); + break; + case eolDOS: + eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); + eolLen += uMap->mapUnicode(0x0a, eol + eolLen, (int)sizeof(eol) - eolLen); + break; + case eolMac: + eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); + break; + } + + if (eolLen != 1 || spaceLen != 1) { + fprintf(stderr, "WARNING: Unexpected delimiter lengths: [spacelen] = %d, [eollen] = %d\n", spaceLen, eolLen); + } + + fontNameIds["__space__"] = -1U; + fontTypeIds["__space__"] = -1U; + FontSpec dummy = {-1U, -1U, 0}; + fontSpecIds[dummy] = space[0]; + + fontNameIds["__eol__"] = -2U; + fontTypeIds["__eol__"] = -2U; + dummy = (FontSpec){-2U, -2U, 0}; + fontSpecIds[dummy] = eol[0]; + + fontNameIds["__invalid__"] = -3U; + fontTypeIds["__invalid__"] = -3U; + dummy = (FontSpec){-3U, -3U, 0}; + fontSpecIds[dummy] = FONT_INVALID; + + fontNameIds["__unknown__"] = -4U; + fontTypeIds["__unknown__"] = -4U; + dummy = (FontSpec){-4U, -4U, 0}; + fontSpecIds[dummy] = FONT_UNKNOWN; + + + for(int i = 0; i <= 253; i++) { + if (i != space[0] && i != eol[0] && i != FONT_UNKNOWN) { + availableIds.push(i); + } + }; + }; - std::map getFontIds() { - return fontIds; + std::map getFontSpecs() { + std::map result; + + std::map fontNames; + std::map fontTypes; + + for (auto pair : fontNameIds) { + fontNames[pair.second] = pair.first; + } + + for (auto pair : fontTypeIds) { + fontTypes[pair.second] = pair.first; + } + + for (auto pair : fontSpecIds) { + result[pair.second] = (NamedFontSpec) { + fontNames[pair.first.fontNameId], + fontTypes[pair.first.fontTypeId], + pair.first.fontSize + }; + } + + return result; } // protected: @@ -29,13 +111,21 @@ class TextPageFont: public TextPage { int rotA, GBool rotatedA, 
GBool clippedA, GBool invisibleA, TextFontInfo *fontA, double fontSizeA, double colorRA, double colorGA, double colorBA) override; - - void encodeFragment(Unicode *text, int len, UnicodeMap *uMap, GBool primaryLR, GString *s) override; - + + void encodeFragment(Unicode *text, int len, UnicodeMap *uMap, + GBool primaryLR, GString *s) override; + + void computeLinePhysWidth(TextLine *line, UnicodeMap *uMap) override; private: std::map fontNameIds; std::map fontTypeIds; - std::map fontIds; + std::map fontSpecIds; + + std::queue availableIds; + + char space[8], eol[16]; + int spaceLen, eolLen; + UnicodeMap *uMap; }; class FontOutputDev: public TextOutputDev { @@ -46,12 +136,9 @@ class FontOutputDev: public TextOutputDev { text = new TextPageFont(&control); } - std::map getFontIds() { - return text->getFontIds(); + std::map getFontSpecs() { + return ((TextPageFont *)text)->getFontSpecs(); } - -protected: - TextPageFont *text; }; #endif diff --git a/src/xpydf/PdfLoader.cc b/src/xpydf/PdfLoader.cc index 01070ed..b8afd40 100644 --- a/src/xpydf/PdfLoader.cc +++ b/src/xpydf/PdfLoader.cc @@ -8,6 +8,8 @@ #include #include +#include + #include "gmem.h" #include "gmempp.h" #include "parseargs.h" @@ -38,8 +40,31 @@ #include "ImageDataDev.h" #include "ImageInfoDev.h" +#define PAGES 1062 +#define PAGEE 1062 + static void outputToStringStream(void *stream, const char *text, int len) { + // fprintf(stderr, "Adding %d characters\n", len); + // if (len > 1) + // fprintf(stderr, "(%.3d) %s\n", len, text); + ((std::stringstream *)stream)->write(text, len); +} + +static void outputToStringStream2(void *stream, const char *text, int len) { + // fprintf(stderr, "Adding %d characters\n", len); + // if (len > 1) { + // fprintf(stderr, "(%.3d) ", len); + // for (int i = 0; i < len; i++) { + // if (text[i] == ' ') { + // fprintf(stderr, " "); + // } else { + // fprintf(stderr, "x"); + // } + // } + + // fprintf(stderr, "\n"); + // } ((std::stringstream *)stream)->write(text, len); } @@ -53,6 
+78,7 @@ PdfLoader::PdfLoader(LoaderConfig config, char *fileName, char *ownerPw, char *u globalParams->setErrQuiet(config.quiet); globalParams->setMapNumericCharNames(config.mapNumericCharNames); globalParams->setMapUnknownCharNames(config.mapUnknownCharNames); + globalParams->setReadUnicodeCMap(config.readUnicodeCMap); globalParams->setupBaseFonts(NULL); switch (config.mode) { @@ -116,6 +142,8 @@ std::vector PdfLoader::extractText() { goto err; } + firstPage = PAGES; + lastPage = PAGEE; firstPage = 1; lastPage = doc->getNumPages(); @@ -123,6 +151,7 @@ std::vector PdfLoader::extractText() { if (textOut->isOk()) { for (int page = firstPage; page <= lastPage; page++) { + // fprintf(stderr, "Processing page %d\n", page); stream->str(""); doc->displayPages(textOut, page, page, 72, 72, 0, gFalse, gTrue, gFalse); pages.push_back(stream->str()); @@ -139,7 +168,7 @@ std::vector PdfLoader::extractText() { return pages; } -std::vector PdfLoader::extractFontMap() { +std::vector PdfLoader::extractFontMap(std::map &fontSpecs) { FontOutputDev *fontOut; std::stringstream *stream = new std::stringstream(); std::vector pages; @@ -149,18 +178,23 @@ std::vector PdfLoader::extractFontMap() { goto err; } + firstPage = PAGES; + lastPage = PAGEE; firstPage = 1; lastPage = doc->getNumPages(); - fontOut = new FontOutputDev(&outputToStringStream, stream, &textOutControl); + fontOut = new FontOutputDev(&outputToStringStream2, stream, &textOutControl); if (fontOut->isOk()) { for (int page = firstPage; page <= lastPage; page++) { + // fprintf(stderr, "Processing page %d\n", page); stream->str(""); doc->displayPages(fontOut, page, page, 72, 72, 0, gFalse, gTrue, gFalse); pages.push_back(stream->str()); } - } + } + + fontSpecs = fontOut->getFontSpecs(); delete fontOut; err: @@ -554,13 +588,34 @@ int PdfLoader::getErrorCode() { return (int)doc->getErrorCode(); } -#include +// #include -using namespace std; +// using namespace std; -int main() { - LoaderConfig config; - PdfLoader *l = new 
PdfLoader(config, "skf.pdf"); - cout << "Extracting fonts:" << endl; - l->extractFonts(); -} +// int main() { +// LoaderConfig config; +// map fontSpecs; + +// PdfLoader *l = new PdfLoader(config, "skf.pdf"); +// vector pageText = l->extractText(); +// vector fontMap = l->extractFontMap(fontSpecs); + +// // fprintf(stderr, "Read %lu pages text, %lu pages fontmap\n", pageText.size(), fontMap.size()); + +// int diff = 0; + +// for (int i = 0; i < pageText.size(); i++) { +// diff += fabs((int)pageText[i].length() - (int)fontMap[i].length()); +// if (pageText[i].length() != fontMap[i].length()) { +// fprintf(stderr, "Page %d mismatch: %lu text, %lu font\n", i, pageText[i].length(), fontMap[i].length()); +// } +// } + +// fprintf(stderr, "Total diff %d\n", diff); + +// // cerr << pageText[0] << endl; + +// // for (auto pair : fontSpecs) { +// // fprintf(stderr, "Font id %d had name '%s', type '%s', size %d\n", pair.first, pair.second.fontName.c_str(), pair.second.fontType.c_str(), pair.second.fontSize); +// // } +// } diff --git a/src/xpydf/PdfLoader.h b/src/xpydf/PdfLoader.h index 5115047..e898dd3 100644 --- a/src/xpydf/PdfLoader.h +++ b/src/xpydf/PdfLoader.h @@ -9,6 +9,7 @@ #include "PDFDoc.h" #include "TextOutputDev.h" +#include "FontOutputDev.h" #include "ImageDataDev.h" #include "ImageInfoDev.h" @@ -22,6 +23,7 @@ typedef struct LoaderConfig { GBool quiet = gTrue; GBool mapNumericCharNames = gFalse; GBool mapUnknownCharNames = gTrue; + GBool readUnicodeCMap = gTrue; unsigned int mode = 0; } LoaderConfig; @@ -36,7 +38,7 @@ class PdfLoader { PdfLoader(LoaderConfig config, char *fileName, char *ownerPw = NULL, char *userPw = NULL); ~PdfLoader(); std::vector extractText(); - std::vector extractFontMap(); + std::vector extractFontMap(std::map &fontSpecs); std::vector extractPageInfo(); std::vector extractFonts(); std::vector extractImages(int pageNum); diff --git a/src/xpydf/PdfLoaderWrapper.cc b/src/xpydf/PdfLoaderWrapper.cc index 38d40cb..06b8ceb 100644 --- 
a/src/xpydf/PdfLoaderWrapper.cc +++ b/src/xpydf/PdfLoaderWrapper.cc @@ -23,15 +23,16 @@ PyObject *construct(PyObject *self, PyObject *args) { char *ownerPw = NULL; char *userPw = NULL; - PyArg_ParseTuple(args, "Opppppppbzz", &pobj0, + PyArg_ParseTuple(args, "OppppppppIzz", &pobj0, &(config.clipText), &(config.discardDiag), &(config.discardRotatedText), &(config.verbose), &(config.quiet), - &(config.mode), &(config.mapNumericCharNames), &(config.mapUnknownCharNames), + &(config.readUnicodeCMap), + &(config.mode), &ownerPw, &userPw ); @@ -112,6 +113,22 @@ PyObject *extractText(PyObject *self, PyObject *args) { return Py_BuildValue("O", converted); } +PyObject *extractFontMap(PyObject *self, PyObject *args) { + vector res; + + PyObject *loaderCapsule; + PyArg_ParseTuple(args, "O", &loaderCapsule); + + PdfLoader *loader = (PdfLoader *)PyCapsule_GetPointer(loaderCapsule, "loaderPtr"); + map fontSpecs; + vector result = loader->extractFontMap(fontSpecs); + + PyObject *fontMap = vectorStringToList(result); + PyObject *fontDict = mapFontSpecsToDict(fontSpecs); + + return Py_BuildValue("OO", fontMap, fontDict); +} + PyObject *extractPageInfo(PyObject *self, PyObject *args) { vector res; @@ -223,6 +240,10 @@ PyMethodDef cXpdfPythonFunctions[] = { extractText, METH_VARARGS, "Extract text as bytes"}, + {"extractFontMap", + extractFontMap, METH_VARARGS, + "Extract font map as bytes"}, + {"extractPageInfo", extractPageInfo, METH_VARARGS, "Extract image metadata"}, diff --git a/src/xpydf/PyCppConversion.cc b/src/xpydf/PyCppConversion.cc index 540d6b4..3e824a7 100644 --- a/src/xpydf/PyCppConversion.cc +++ b/src/xpydf/PyCppConversion.cc @@ -64,3 +64,25 @@ PyObject *vectorImagesToList(const std::vector &data) { return listObj; } + +PyObject *mapFontSpecsToDict(const std::map &data) { + PyObject *dict = PyDict_New(); + if (!dict) throw logic_error("Unable to allocate memory for Python dict"); + + for (auto pair : data) { + PyObject *item = PyDict_New(); + if (!item) throw 
logic_error("Unable to allocate memory for Python dict"); + + PyObject *id = PyLong_FromLong(pair.first); + PyObject *name = PyUnicode_FromString(pair.second.fontName.c_str()); + PyObject *type = PyUnicode_FromString(pair.second.fontType.c_str()); + + PyDict_SetItemString(item, "name", name); + PyDict_SetItemString(item, "type", type); + PyDict_SetItemString(item, "size", PyLong_FromLong(pair.second.fontSize)); + + PyDict_SetItem(dict, id, item); + } + + return dict; +} diff --git a/src/xpydf/PyCppConversion.h b/src/xpydf/PyCppConversion.h index 169523c..6e6f48c 100644 --- a/src/xpydf/PyCppConversion.h +++ b/src/xpydf/PyCppConversion.h @@ -1,6 +1,7 @@ #ifndef PY_CPP_CONVERSION_H #define PY_CPP_CONVERSION_H +#include #include #include @@ -16,4 +17,6 @@ PyObject *vectorPagesToList(const std::vector &data); PyObject *vectorImagesToList(const std::vector &data); +PyObject *mapFontSpecsToDict(const std::map &data); + #endif \ No newline at end of file diff --git a/src/xpydf/cXpdfPython.pyi b/src/xpydf/cXpdfPython.pyi index 5414d03..0eccc99 100644 --- a/src/xpydf/cXpdfPython.pyi +++ b/src/xpydf/cXpdfPython.pyi @@ -1,11 +1,16 @@ -from typing import Any, List, Optional +from typing import Any, Dict, List, Optional, Tuple, TypedDict import numpy.typing as npt from xpydf.pdf_loader import PageInfo class XpdfPythonCapsule: ... +class Font(TypedDict): + name: str + type: str + size: str + def construct( filename: str, cliptext: bool, @@ -13,13 +18,15 @@ def construct( discard_rotated_text: bool, verbose: bool, quiet: bool, - mode: int, mapNumericCharNames: bool = False, mapUnknownCharNames: bool = True, + readUnicodeCMap: bool = True, + mode: int = 0, ownerPw: Optional[str] = None, userPw: Optional[str] = None, ) -> XpdfPythonCapsule: ... def extractText(capsule: XpdfPythonCapsule) -> List[bytes]: ... +def extractFontMap(capsule: XpdfPythonCapsule) -> Tuple[List[bytes], Dict[int, Font]]: ... def extractPageInfo(capsule: XpdfPythonCapsule) -> List[PageInfo]: ... 
def extractFonts(capsule: XpdfPythonCapsule) -> List[str]: ...
 def extractImages(capsule: XpdfPythonCapsule, page_number: int) -> List[npt.NDArray[Any]]: ...
diff --git a/src/xpydf/pdf_loader.py b/src/xpydf/pdf_loader.py
index c848976..f6a8a2e 100644
--- a/src/xpydf/pdf_loader.py
+++ b/src/xpydf/pdf_loader.py
@@ -1,8 +1,12 @@
-from typing import Any, List, Optional, TypedDict
+from typing import Any, Dict, List, Optional, Tuple, TypedDict
 
 import cXpdfPython
 import numpy.typing as npt
 
+class Font(TypedDict):
+    name: str
+    type: str
+    size: int  # the C extension emits the font size as a Python int
 
 class ImageInfo(TypedDict):
     """Container for image metadata
@@ -60,9 +64,10 @@ def __init__(
         discard_rotated_text: bool = True,
         verbose: bool = False,
         quiet: bool = True,
-        mode: str = "table",
         map_numeric_char_names: bool = False,
         map_unknown_char_names: bool = True,
+        read_unicode_cmap: bool = True,
+        mode: str = "table",
         owner_password: Optional[str] = None,
         user_password: Optional[str] = None,
     ):
@@ -105,7 +110,18 @@ def __init__(
 
         self.filename = filename
         self.capsule = cXpdfPython.construct(
-            filename, cliptext, discard_diag, discard_rotated_text, verbose, quiet, xpdf_mode, map_numeric_char_names, map_unknown_char_names, owner_password, user_password
+            filename,
+            cliptext,
+            discard_diag,
+            discard_rotated_text,
+            verbose,
+            quiet,
+            map_numeric_char_names,
+            map_unknown_char_names,
+            read_unicode_cmap,
+            xpdf_mode,
+            owner_password,
+            user_password
         )
 
     def extract_bytes(self) -> List[bytes]:
@@ -121,6 +137,22 @@ def extract_bytes(self) -> List[bytes]:
             pages = cXpdfPython.extractText(self.capsule)
 
         return pages
+
+    def extract_font_map(self) -> Tuple[List[bytes], Dict[int, Font]]:
+        """Extract the font map of each page and the fonts it refers to
+
+        Returns:
+            Tuple[List[bytes], Dict[int, Font]]: the per-page font maps and
+            the font specifications keyed by font id, as returned by
+            cXpdfPython.extractFontMap. Both are empty when no capsule is
+            loaded.
+        """
+        if self.capsule is None:
+            # Nothing to read from: return empty results instead of falling
+            # through to a return of never-assigned locals.
+            return [], {}
+
+        return cXpdfPython.extractFontMap(self.capsule)
 
     def extract_strings(self) -> List[str]:
         """Extract and decode text from the pdf
diff --git a/src/xpydf/pdf_loader.pyi b/src/xpydf/pdf_loader.pyi
index 272eb2b..c594491 100644
--- a/src/xpydf/pdf_loader.pyi
+++
b/src/xpydf/pdf_loader.pyi
@@ -1,4 +1,4 @@
-from typing import Any, List, Optional, TypedDict
+from typing import Any, Dict, List, Optional, Tuple, TypedDict
 
 import numpy.typing as npt
 
@@ -14,6 +14,11 @@ class PageInfo(TypedDict):
     height: float
     images: List[ImageInfo]
 
+class Font(TypedDict):
+    name: str
+    type: str
+    size: int  # the C extension emits the font size as a Python int
+
 class PdfLoader:
     filename: str
     capsule: Optional[XpdfPythonCapsule] = None
@@ -28,13 +33,15 @@ class PdfLoader:
         insert_bom: bool = False,
         verbose: bool = False,
         quiet: bool = True,
-        mode: str = "table",
         map_numeric_char_names: bool = False,
         map_unknown_char_names: bool = True,
+        read_unicode_cmap: bool = True,
+        mode: str = "table",
         owner_password: Optional[str] = None,
         user_password: Optional[str] = None,
     ) -> None: ...
     def extract_bytes(self) -> List[bytes]: ...
+    def extract_font_map(self) -> Tuple[List[bytes], Dict[int, Font]]: ...
     def extract_strings(self) -> List[str]: ...
     def extract_page_info(self) -> List[PageInfo]: ...
     def extract_fonts(self) -> List[str]: ...
From fd8877665ef7d43a33b0ab794b869934d0bf5939 Mon Sep 17 00:00:00 2001
From: Matthijs Wesseling
Date: Thu, 2 Nov 2023 14:51:13 +0100
Subject: [PATCH 10/10] Cleanup

---
 src/xpdf-4.04/xpdf/GfxFont.cc |  4 +--
 src/xpydf/FontOutputDev.cc    | 63 +++++++++++++++++++++++++++++------
 src/xpydf/FontOutputDev.h     | 56 ++-----------------------------
 src/xpydf/PdfLoader.cc        | 44 ++++++++++++------------
 4 files changed, 79 insertions(+), 88 deletions(-)

diff --git a/src/xpdf-4.04/xpdf/GfxFont.cc b/src/xpdf-4.04/xpdf/GfxFont.cc
index d68aaec..75913bc 100644
--- a/src/xpdf-4.04/xpdf/GfxFont.cc
+++ b/src/xpdf-4.04/xpdf/GfxFont.cc
@@ -1284,7 +1284,7 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GString *nameA,
     }
   }
 
-  // construct the char code -> Unicode mapping object
+  // construct the char code -> Unicode mapping object
   ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode);
 
   // merge in a ToUnicode CMap, if there is one -- this overwrites
@@ -1425,7 +1425,7 @@ int Gfx8BitFont::getNextChar(char *s, int len, CharCode *code,
 
   *code = c = (CharCode)(*s & 0xff);
   *uLen = ctu->mapToUnicode(c, u, uSize);
-  *dx = widths[c];
+  *dx = widths[c];
   *dy = *ox = *oy = 0;
   return 1;
 }
diff --git a/src/xpydf/FontOutputDev.cc b/src/xpydf/FontOutputDev.cc
index a7afc98..d78981c 100644
--- a/src/xpydf/FontOutputDev.cc
+++ b/src/xpydf/FontOutputDev.cc
@@ -27,6 +27,59 @@ bool operator!=(const FontSpec& l, const FontSpec& r) {
   return (l.fontNameId != r.fontNameId || l.fontTypeId != r.fontTypeId || l.fontSize != r.fontSize);
 }
 
+// Encode the space/EOL delimiters with the text encoding and register the reserved font ids.
+TextPageFont::TextPageFont(TextOutputControl *controlA) : TextPage(controlA) {
+  if (!(uMap = globalParams->getTextEncoding())) {
+    fprintf(stderr, "WARNING: Encoding not found\n");
+    return;  // NOTE(review): spaceLen, eolLen and the id tables below are never set on this path
+  }
+
+  spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
+  eolLen = 0; // make gcc happy
+
+  switch (globalParams->getTextEOL()) {
+  case eolUnix:
+    eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
+    break;
+  case eolDOS:
+    eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
+    eolLen += uMap->mapUnicode(0x0a, eol + eolLen, (int)sizeof(eol) - eolLen);
+    break;
+  case eolMac:
+    eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
+    break;
+  }
+
+  if (eolLen != 1 || spaceLen != 1) {
+    fprintf(stderr, "WARNING: Unexpected delimiter lengths: [spacelen] = %d, [eollen] = %d\n", spaceLen, eolLen);
+  }
+
+  fontNameIds["__space__"] = -1U;  // -1U..-4U name the four pseudo-fonts registered below
+  fontTypeIds["__space__"] = -1U;
+  FontSpec dummy = {-1U, -1U, 0};
+  fontSpecIds[dummy] = space[0];
+
+  fontNameIds["__eol__"] = -2U;
+  fontTypeIds["__eol__"] = -2U;
+  dummy = FontSpec{-2U, -2U, 0};  // brace-init is standard C++; (FontSpec){...} is a C compound literal
+  fontSpecIds[dummy] = eol[0];
+
+  fontNameIds["__invalid__"] = -3U;
+  fontTypeIds["__invalid__"] = -3U;
+  dummy = FontSpec{-3U, -3U, 0};
+  fontSpecIds[dummy] = FONT_INVALID;
+
+  fontNameIds["__unknown__"] = -4U;
+  fontTypeIds["__unknown__"] = -4U;
+  dummy = FontSpec{-4U, -4U, 0};
+  fontSpecIds[dummy] = FONT_UNKNOWN;
+
+  for (int i = 0; i <= 253; i++) {
+    if (i != space[0] && i != eol[0] && i != FONT_UNKNOWN) {
+      availableIds.push(i);  // ids in 0..253 not used by space, eol or FONT_UNKNOWN
+    }
+  }
+}
 
 TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA,
                                      double xMinA, double yMinA, double xMaxA, double yMaxA,
@@ -39,8 +92,6 @@ TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA,
 
   char buf[8];
   int n = uMap->mapUnicode(cA, buf, sizeof(buf));
-
-  // fprintf(stderr, "Char [%c] font [%s] size [%.0f] color [%.1f, %.1f, %.1f]\n", buf[0], name->getCString(), fontSizeA, colorRA, colorGA, colorBA);
 
   if (n == 0) {
     fontId = FONT_INVALID;
@@ -72,12 +123,8 @@ TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA,
     }
   }
 
-  // fprintf(stderr, "fontId %d >> ", fontId);
-
   fontId = fontId ^ (n << 16);
 
-  // fprintf(stderr, "%d (%d , %d)\n", fontId, (char)fontId, (fontId >> 16));
-
   return new TextChar(fontId, charPosA, charLenA, xMinA, yMinA, xMaxA, yMaxA,
                       rotA, rotatedA, clippedA, invisibleA, fontA, fontSizeA,
                       colorRA, colorGA, colorBA);
@@ -97,8 +144,6 @@ void TextPageFont::encodeFragment(Unicode *text,
int len, UnicodeMap *uMap, GBoo
       }
     }
   }
-
-  // fprintf(stderr, "Encoded %d\n", s->getLength());
 }
 
 void TextPageFont::computeLinePhysWidth(TextLine *line, UnicodeMap *uMap) {
@@ -115,6 +160,4 @@ void TextPageFont::computeLinePhysWidth(TextLine *line, UnicodeMap *uMap) {
       }
     }
   }
-
-  // fprintf(stderr, "Line length %d // %s\n", line->pw, line->text);
 }
diff --git a/src/xpydf/FontOutputDev.h b/src/xpydf/FontOutputDev.h
index 183a061..6a728aa 100644
--- a/src/xpydf/FontOutputDev.h
+++ b/src/xpydf/FontOutputDev.h
@@ -26,59 +26,7 @@ bool operator!=(const FontSpec& l, const FontSpec& r);
 
 class TextPageFont: public TextPage {
 public:
-  TextPageFont(TextOutputControl *controlA) : TextPage(controlA) {
-    if (!(uMap = globalParams->getTextEncoding())) {
-      fprintf(stderr, "WARNING: Encoding not found");
-      return;
-    }
-
-    spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
-    eolLen = 0; // make gcc happy
-
-    switch (globalParams->getTextEOL()) {
-    case eolUnix:
-      eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
-      break;
-    case eolDOS:
-      eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
-      eolLen += uMap->mapUnicode(0x0a, eol + eolLen, (int)sizeof(eol) - eolLen);
-      break;
-    case eolMac:
-      eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
-      break;
-    }
-
-    if (eolLen != 1 || spaceLen != 1) {
-      fprintf(stderr, "WARNING: Unexpected delimiter lengths: [spacelen] = %d, [eollen] = %d\n", spaceLen, eolLen);
-    }
-
-    fontNameIds["__space__"] = -1U;
-    fontTypeIds["__space__"] = -1U;
-    FontSpec dummy = {-1U, -1U, 0};
-    fontSpecIds[dummy] = space[0];
-
-    fontNameIds["__eol__"] = -2U;
-    fontTypeIds["__eol__"] = -2U;
-    dummy = (FontSpec){-2U, -2U, 0};
-    fontSpecIds[dummy] = eol[0];
-
-    fontNameIds["__invalid__"] = -3U;
-    fontTypeIds["__invalid__"] = -3U;
-    dummy = (FontSpec){-3U, -3U, 0};
-    fontSpecIds[dummy] = FONT_INVALID;
-
-    fontNameIds["__unknown__"] = -4U;
-    fontTypeIds["__unknown__"] = -4U;
-    dummy = (FontSpec){-4U, -4U, 0};
-    fontSpecIds[dummy] = FONT_UNKNOWN;
-
-
-    for(int i = 0; i <= 253; i++) {
-      if (i != space[0] && i != eol[0] && i != FONT_UNKNOWN) {
-        availableIds.push(i);
-      }
-    };
-  };
+  TextPageFont(TextOutputControl *controlA);
 
   std::map getFontSpecs() {
     std::map result;
@@ -105,7 +53,7 @@ class TextPageFont: public TextPage {
     return result;
   }
 
-// protected:
+protected:
   TextChar *textCharType(Unicode cA, int charPosA, int charLenA,
                          double xMinA, double yMinA, double xMaxA, double yMaxA,
                          int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA,
diff --git a/src/xpydf/PdfLoader.cc b/src/xpydf/PdfLoader.cc
index b8afd40..0fc766c 100644
--- a/src/xpydf/PdfLoader.cc
+++ b/src/xpydf/PdfLoader.cc
@@ -588,34 +588,34 @@ int PdfLoader::getErrorCode() {
   return (int)doc->getErrorCode();
 }
 
-// #include
+#include
 
-// using namespace std;
+using namespace std;
 
-// int main() {
-//   LoaderConfig config;
-//   map fontSpecs;
+int main() {  // debug harness: compares the text length and the font map length of every page
+  LoaderConfig config;
+  map fontSpecs;
 
-//   PdfLoader *l = new PdfLoader(config, "skf.pdf");
-//   vector pageText = l->extractText();
-//   vector fontMap = l->extractFontMap(fontSpecs);
+  PdfLoader *l = new PdfLoader(config, "skf.pdf");
+  vector pageText = l->extractText();
+  vector fontMap = l->extractFontMap(fontSpecs);
 
-//   // fprintf(stderr, "Read %lu pages text, %lu pages fontmap\n", pageText.size(), fontMap.size());
+  // fprintf(stderr, "Read %lu pages text, %lu pages fontmap\n", pageText.size(), fontMap.size());
 
-//   int diff = 0;
+  int diff = 0;
 
-//   for (int i = 0; i < pageText.size(); i++) {
-//     diff += fabs((int)pageText[i].length() - (int)fontMap[i].length());
-//     if (pageText[i].length() != fontMap[i].length()) {
-//       fprintf(stderr, "Page %d mismatch: %lu text, %lu font\n", i, pageText[i].length(), fontMap[i].length());
-//     }
-//   }
+  for (size_t i = 0; i < pageText.size() && i < fontMap.size(); i++) {  // stop at the shorter vector
+    diff += abs((int)pageText[i].length() - (int)fontMap[i].length());  // integer abs, no double round trip
+    if (pageText[i].length() != fontMap[i].length()) {
+      fprintf(stderr, "Page %zu mismatch: %zu text, %zu font\n", i, pageText[i].length(), fontMap[i].length());
+    }
+  }
 
-//   fprintf(stderr, "Total diff %d\n", diff);
+  fprintf(stderr, "Total diff %d\n", diff);
 
-//   // cerr << pageText[0] << endl;
+  // cerr << pageText[0] << endl;
 
-//   // for (auto pair : fontSpecs) {
-//   //   fprintf(stderr, "Font id %d had name '%s', type '%s', size %d\n", pair.first, pair.second.fontName.c_str(), pair.second.fontType.c_str(), pair.second.fontSize);
-//   // }
-// }
+  // for (auto pair : fontSpecs) {
+  //   fprintf(stderr, "Font id %d had name '%s', type '%s', size %d\n", pair.first, pair.second.fontName.c_str(), pair.second.fontType.c_str(), pair.second.fontSize);
+  // }
+}