diff --git a/Makefile b/Makefile index c28dc5c..1071091 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ FOFI_DIR = $(XDIR)fofi/ GOO_DIR = $(XDIR)goo/ XPDF_DIR = $(XDIR)xpdf/ -PY_SRC = src/xpydf/PdfLoader.cc src/xpydf/ImageInfoDev.cc src/xpydf/ImageDataDev.cc +PY_SRC = src/xpydf/PdfLoader.cc src/xpydf/ImageInfoDev.cc src/xpydf/ImageDataDev.cc src/xpydf/FontOutputDev.cc SPLASH_SRC = $(wildcard $(SPLASH_DIR)*.cc) FOFI_SRC = $(wildcard $(FOFI_DIR)*.cc) diff --git a/src/xpdf-4.04/xpdf/GfxFont.cc b/src/xpdf-4.04/xpdf/GfxFont.cc index 30e8bd1..75913bc 100644 --- a/src/xpdf-4.04/xpdf/GfxFont.cc +++ b/src/xpdf-4.04/xpdf/GfxFont.cc @@ -1283,7 +1283,7 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GString *nameA, } } } - + // construct the char code -> Unicode mapping object ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode); @@ -1291,7 +1291,9 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GString *nameA, // existing entries in ctu, i.e., the ToUnicode CMap takes // precedence, but the other encoding info is allowed to fill in any // holes - readToUnicodeCMap(fontDict, 8, ctu); + if (globalParams->getReadUnicodeCMap()) { + readToUnicodeCMap(fontDict, 8, ctu); + } // look for a Unicode-to-Unicode mapping if (name && (utu = globalParams->getUnicodeToUnicode(name))) { diff --git a/src/xpdf-4.04/xpdf/GlobalParams.cc b/src/xpdf-4.04/xpdf/GlobalParams.cc index 5fd4393..64deb76 100644 --- a/src/xpdf-4.04/xpdf/GlobalParams.cc +++ b/src/xpdf-4.04/xpdf/GlobalParams.cc @@ -749,6 +749,7 @@ GlobalParams::GlobalParams(const char *cfgFileName) { defaultPrinter = NULL; mapNumericCharNames = gTrue; mapUnknownCharNames = gFalse; + readUnicodeCMap = gTrue; mapExtTrueTypeFontsViaUnicode = gTrue; useTrueTypeUnicodeMapping = gFalse; droppedFonts = new GHash(gTrue); @@ -3268,6 +3269,16 @@ GBool GlobalParams::getMapUnknownCharNames() { return map; } +GBool GlobalParams::getReadUnicodeCMap() { + GBool read; + + lockGlobalParams; + read = 
readUnicodeCMap; + unlockGlobalParams; + + return read; +} + GBool GlobalParams::getMapExtTrueTypeFontsViaUnicode() { GBool map; @@ -3793,6 +3804,12 @@ void GlobalParams::setMapUnknownCharNames(GBool map) { unlockGlobalParams; } +void GlobalParams::setReadUnicodeCMap(GBool read) { + lockGlobalParams; + readUnicodeCMap = read; + unlockGlobalParams; +} + void GlobalParams::setMapExtTrueTypeFontsViaUnicode(GBool map) { lockGlobalParams; mapExtTrueTypeFontsViaUnicode = map; diff --git a/src/xpdf-4.04/xpdf/GlobalParams.h b/src/xpdf-4.04/xpdf/GlobalParams.h index b8299fc..fadac85 100644 --- a/src/xpdf-4.04/xpdf/GlobalParams.h +++ b/src/xpdf-4.04/xpdf/GlobalParams.h @@ -338,6 +338,7 @@ class GlobalParams { GString *getDefaultPrinter(); GBool getMapNumericCharNames(); GBool getMapUnknownCharNames(); + GBool getReadUnicodeCMap(); GBool getMapExtTrueTypeFontsViaUnicode(); GBool getUseTrueTypeUnicodeMapping(); GBool isDroppedFont(const char *fontName); @@ -400,6 +401,7 @@ class GlobalParams { void setOverprintPreview(GBool preview); void setMapNumericCharNames(GBool map); void setMapUnknownCharNames(GBool map); + void setReadUnicodeCMap(GBool map); void setMapExtTrueTypeFontsViaUnicode(GBool map); void setTabStateFile(char *tabStateFileA); void setPrintCommands(GBool printCommandsA); @@ -589,6 +591,7 @@ class GlobalParams { // from the viewer) GBool mapNumericCharNames; // map numeric char names (from font subsets)? GBool mapUnknownCharNames; // map unknown char names? + GBool readUnicodeCMap; // Read the unicode c map? GBool mapExtTrueTypeFontsViaUnicode; // map char codes to GID via Unicode // for external TrueType fonts? 
GBool useTrueTypeUnicodeMapping; // use the Unicode cmaps in TrueType diff --git a/src/xpdf-4.04/xpdf/TextOutputDev.cc b/src/xpdf-4.04/xpdf/TextOutputDev.cc index 0411db8..9a59da5 100644 --- a/src/xpdf-4.04/xpdf/TextOutputDev.cc +++ b/src/xpdf-4.04/xpdf/TextOutputDev.cc @@ -203,38 +203,6 @@ static inline double dmax(double x, double y) { // TextChar //------------------------------------------------------------------------ -class TextChar { -public: - - TextChar(Unicode cA, int charPosA, int charLenA, - double xMinA, double yMinA, double xMaxA, double yMaxA, - int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, - TextFontInfo *fontA, double fontSizeA, - double colorRA, double colorGA, double colorBA); - - static int cmpX(const void *p1, const void *p2); - static int cmpY(const void *p1, const void *p2); - static int cmpCharPos(const void *p1, const void *p2); - - Unicode c; - int charPos; - int charLen; - double xMin, yMin, xMax, yMax; - TextFontInfo *font; - double fontSize; - double colorR, - colorG, - colorB; - - // group the byte-size fields to minimize object size - Guchar rot; - char rotated; - char clipped; - char invisible; - char spaceAfter; - char overlap; -}; - TextChar::TextChar(Unicode cA, int charPosA, int charLenA, double xMinA, double yMinA, double xMaxA, double yMaxA, int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, @@ -1559,13 +1527,13 @@ void TextPage::addChar(GfxState *state, double x, double y, } else { j = i; } - chars->append(new TextChar(uBuf[j], charPos, nBytes, - xMin, yMin, xMax, yMax, - curRot, rotated, clipped, - state->getRender() == 3 || alpha < 0.001, - curFont, curFontSize, - colToDbl(rgb.r), colToDbl(rgb.g), - colToDbl(rgb.b))); + chars->append(textCharType(uBuf[j], charPos, nBytes, + xMin, yMin, xMax, yMax, + curRot, rotated, clipped, + state->getRender() == 3 || alpha < 0.001, + curFont, curFontSize, + colToDbl(rgb.r), colToDbl(rgb.g), + colToDbl(rgb.b))); } } diff --git a/src/xpdf-4.04/xpdf/TextOutputDev.h 
b/src/xpdf-4.04/xpdf/TextOutputDev.h index 302975c..5413b1c 100644 --- a/src/xpdf-4.04/xpdf/TextOutputDev.h +++ b/src/xpdf-4.04/xpdf/TextOutputDev.h @@ -137,6 +137,42 @@ class TextFontInfo { friend class TextWord; }; +//------------------------------------------------------------------------ +// TextChar +//------------------------------------------------------------------------ + +class TextChar { +public: + + TextChar(Unicode cA, int charPosA, int charLenA, + double xMinA, double yMinA, double xMaxA, double yMaxA, + int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, + TextFontInfo *fontA, double fontSizeA, + double colorRA, double colorGA, double colorBA); + + static int cmpX(const void *p1, const void *p2); + static int cmpY(const void *p1, const void *p2); + static int cmpCharPos(const void *p1, const void *p2); + + Unicode c; + int charPos; + int charLen; + double xMin, yMin, xMax, yMax; + TextFontInfo *font; + double fontSize; + double colorR, + colorG, + colorB; + + // group the byte-size fields to minimize object size + Guchar rot; + char rotated; + char clipped; + char invisible; + char spaceAfter; + char overlap; +}; + //------------------------------------------------------------------------ // TextWord //------------------------------------------------------------------------ @@ -235,7 +271,7 @@ class TextLine { double getEdge(int idx) { return edge[idx]; } GBool getHyphenated() { return hyphenated; } -private: +public: static int cmpX(const void *p1, const void *p2); @@ -386,7 +422,7 @@ class TextPage { public: TextPage(TextOutputControl *controlA); - ~TextPage(); + virtual ~TextPage(); // Write contents of page to a stream. 
void write(void *outputStream, TextOutputFunc outputFunc); @@ -492,20 +528,25 @@ class TextPage { void removeChars(double xMin, double yMin, double xMax, double yMax, double xOverlapThresh, double yOverlapThresh); -private: +public: + virtual TextChar *textCharType(Unicode cA, int charPosA, int charLenA, + double xMinA, double yMinA, double xMaxA, double yMaxA, + int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, + TextFontInfo *fontA, double fontSizeA, + double colorRA, double colorGA, double colorBA + ) { + return new TextChar(cA, charPosA, charLenA, xMinA, yMinA, xMaxA, yMaxA, + rotA, rotatedA, clippedA, invisibleA, fontA, fontSizeA, + colorRA, colorGA, colorBA); + } + virtual void encodeFragment(Unicode *text, int len, UnicodeMap *uMap, + GBool primaryLR, GString *s); + + virtual void computeLinePhysWidth(TextLine *line, UnicodeMap *uMap); - void startPage(GfxState *state); - void clear(); - void updateFont(GfxState *state); void addChar(GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, Unicode *u, int uLen); - void incCharCount(int nChars); - void beginActualText(GfxState *state, Unicode *u, int uLen); - void endActualText(GfxState *state); - void addUnderline(double x0, double y0, double x1, double y1); - void addLink(double xMin, double yMin, double xMax, double yMax, - Link *link); // output void writeReadingOrder(void *outputStream, @@ -538,8 +579,18 @@ class TextPage { UnicodeMap *uMap, char *space, int spaceLen, char *eol, int eolLen); - void encodeFragment(Unicode *text, int len, UnicodeMap *uMap, - GBool primaryLR, GString *s); + +private: + + void startPage(GfxState *state); + void clear(); + void updateFont(GfxState *state); + void incCharCount(int nChars); + void beginActualText(GfxState *state, Unicode *u, int uLen); + void endActualText(GfxState *state); + void addUnderline(double x0, double y0, double x1, double y1); + void addLink(double xMin, double yMin, double xMax, double yMax, + Link *link); GBool 
unicodeEffectiveTypeLOrNum(Unicode u, Unicode left, Unicode right); GBool unicodeEffectiveTypeR(Unicode u, Unicode left, Unicode right); @@ -593,7 +644,6 @@ class TextPage { int getCharDirection(TextChar *ch, TextChar *left, TextChar *right); int assignPhysLayoutPositions(GList *columns); void assignLinePhysPositions(GList *columns); - void computeLinePhysWidth(TextLine *line, UnicodeMap *uMap); int assignColumnPhysPositions(GList *columns); void buildSuperLines(TextBlock *blk, GList *superLines); void assignSimpleLayoutPositions(GList *superLines, UnicodeMap *uMap); @@ -784,6 +834,10 @@ class TextOutputDev: public OutputDev { // Turn extra processing for HTML conversion on or off. void enableHTMLExtras(GBool html) { control.html = html; } +protected: + TextPage *text; // text for the current page + TextOutputControl control; // formatting parameters + private: void generateBOM(); @@ -792,8 +846,6 @@ class TextOutputDev: public OutputDev { void *outputStream; // output stream GBool needClose; // need to close the output file? // (only if outputStream is a FILE*) - TextPage *text; // text for the current page - TextOutputControl control; // formatting parameters GBool ok; // set up ok? 
}; diff --git a/src/xpydf/FontOutputDev.cc b/src/xpydf/FontOutputDev.cc new file mode 100644 index 0000000..d78981c --- /dev/null +++ b/src/xpydf/FontOutputDev.cc @@ -0,0 +1,163 @@ +#include + +#include "Error.h" +#include "GList.h" +#include "GlobalParams.h" +#include "UnicodeMap.h" +#include "UnicodeRemapping.h" +#include "UnicodeTypeTable.h" +#include "GfxState.h" + +#include "FontOutputDev.h" + + +bool operator<(const FontSpec& l, const FontSpec& r) { + return ( + l.fontNameId < r.fontNameId + || (l.fontNameId == r.fontNameId && l.fontTypeId < r.fontTypeId) + || (l.fontNameId == r.fontNameId && l.fontTypeId == r.fontTypeId && l.fontSize < r.fontSize) + ); +} + +bool operator==(const FontSpec& l, const FontSpec& r) { + return (l.fontNameId == r.fontNameId && l.fontTypeId == r.fontTypeId && l.fontSize == r.fontSize); +} + +bool operator!=(const FontSpec& l, const FontSpec& r) { + return (l.fontNameId != r.fontNameId || l.fontTypeId != r.fontTypeId || l.fontSize != r.fontSize); +} + +TextPageFont::TextPageFont(TextOutputControl *controlA) : TextPage(controlA) { + if (!(uMap = globalParams->getTextEncoding())) { + fprintf(stderr, "WARNING: Encoding not found"); + return; + } + + spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); + eolLen = 0; // make gcc happy + + switch (globalParams->getTextEOL()) { + case eolUnix: + eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol)); + break; + case eolDOS: + eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); + eolLen += uMap->mapUnicode(0x0a, eol + eolLen, (int)sizeof(eol) - eolLen); + break; + case eolMac: + eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); + break; + } + + if (eolLen != 1 || spaceLen != 1) { + fprintf(stderr, "WARNING: Unexpected delimiter lengths: [spacelen] = %d, [eollen] = %d\n", spaceLen, eolLen); + } + + fontNameIds["__space__"] = -1U; + fontTypeIds["__space__"] = -1U; + FontSpec dummy = {-1U, -1U, 0}; + fontSpecIds[dummy] = space[0]; + + fontNameIds["__eol__"] = -2U; + fontTypeIds["__eol__"] = 
-2U; + dummy = (FontSpec){-2U, -2U, 0}; + fontSpecIds[dummy] = eol[0]; + + fontNameIds["__invalid__"] = -3U; + fontTypeIds["__invalid__"] = -3U; + dummy = (FontSpec){-3U, -3U, 0}; + fontSpecIds[dummy] = FONT_INVALID; + + fontNameIds["__unknown__"] = -4U; + fontTypeIds["__unknown__"] = -4U; + dummy = (FontSpec){-4U, -4U, 0}; + fontSpecIds[dummy] = FONT_UNKNOWN; + + + for(int i = 0; i <= 253; i++) { + if (i != space[0] && i != eol[0] && i != FONT_UNKNOWN) { + availableIds.push(i); + } + }; +}; + +TextChar *TextPageFont::textCharType(Unicode cA, int charPosA, int charLenA, + double xMinA, double yMinA, double xMaxA, double yMaxA, + int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, + TextFontInfo *fontA, double fontSizeA, + double colorRA, double colorGA, double colorBA) +{ + GString *name = fontA->getFontName(); + Unicode fontId = FONT_UNKNOWN; + + char buf[8]; + int n = uMap->mapUnicode(cA, buf, sizeof(buf)); + + if (n == 0) { + fontId = FONT_INVALID; + } + else if (name) { + char fontCode[1000], fontName[1000], fontType[1000]; + + if (sscanf(name->getCString(), "%[^+]+%[^-]-%s", fontCode, fontName, fontType) != EOF) { + if (fontNameIds.find(fontName) == fontNameIds.end()) { + fontNameIds[fontName] = fontNameIds.size(); + } + + if (fontTypeIds.find(fontType) == fontTypeIds.end()) { + fontTypeIds[fontType] = fontTypeIds.size(); + } + + FontSpec spec = {fontNameIds[fontName], fontTypeIds[fontType], (unsigned int)fontSizeA}; + + if (fontSpecIds.find(spec) == fontSpecIds.end()) { + if (availableIds.size() == 0) { + fprintf(stderr, "ERROR: Font id overflow\n"); + error(errInternal, -1, "ERROR: Font id overflow"); + } + fontSpecIds[spec] = availableIds.front(); + availableIds.pop(); + } + + fontId = fontSpecIds[spec]; + } + } + + fontId = fontId ^ (n << 16); + + return new TextChar(fontId, charPosA, charLenA, xMinA, yMinA, xMaxA, yMaxA, + rotA, rotatedA, clippedA, invisibleA, fontA, fontSizeA, + colorRA, colorGA, colorBA); +} + +void 
TextPageFont::encodeFragment(Unicode *text, int len, UnicodeMap *uMap, GBool primaryLR, GString *s) { + char buf[8]; + int n; + + for (int i = 0; i < len; ++i) { + if (text[i] != FONT_INVALID) { + n = (text[i] >> 16); + if (n == 0) n = 1; + buf[0] = (char)text[i]; + for (int j = 0; j < n; j++) { + s->append(buf, 1); + } + } + } +} + +void TextPageFont::computeLinePhysWidth(TextLine *line, UnicodeMap *uMap) { + int n; + if (uMap->isUnicode()) { + line->pw = line->len; + } else { + line->pw = 0; + for (int i = 0; i < line->len; ++i) { + if (line->text[i] != FONT_INVALID) { + n = (line->text[i] >> 16); + if (n == 0) n = 1; + line->pw += n; + } + } + } +} diff --git a/src/xpydf/FontOutputDev.h b/src/xpydf/FontOutputDev.h new file mode 100644 index 0000000..6a728aa --- /dev/null +++ b/src/xpydf/FontOutputDev.h @@ -0,0 +1,92 @@ +#ifndef FONT_OUTPUT_DEV_H +#define FONT_OUTPUT_DEV_H + +#include +#include +#include + +#include "UnicodeMap.h" +#include "TextOutputDev.h" + +#define FONT_INVALID 256 +#define FONT_UNKNOWN 255 + +typedef struct FontSpec { + unsigned int fontNameId, fontTypeId, fontSize; +} FontSpec; + +typedef struct NamedFontSpec { + std::string fontName, fontType; + unsigned int fontSize; +} NamedFontSpec; + +bool operator<(const FontSpec& l, const FontSpec& r); +bool operator==(const FontSpec& l, const FontSpec& r); +bool operator!=(const FontSpec& l, const FontSpec& r); + +class TextPageFont: public TextPage { +public: + TextPageFont(TextOutputControl *controlA); + + std::map getFontSpecs() { + std::map result; + + std::map fontNames; + std::map fontTypes; + + for (auto pair : fontNameIds) { + fontNames[pair.second] = pair.first; + } + + for (auto pair : fontTypeIds) { + fontTypes[pair.second] = pair.first; + } + + for (auto pair : fontSpecIds) { + result[pair.second] = (NamedFontSpec) { + fontNames[pair.first.fontNameId], + fontTypes[pair.first.fontTypeId], + pair.first.fontSize + }; + } + + return result; + } + +protected: + TextChar *textCharType(Unicode 
cA, int charPosA, int charLenA, + double xMinA, double yMinA, double xMaxA, double yMaxA, + int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA, + TextFontInfo *fontA, double fontSizeA, + double colorRA, double colorGA, double colorBA) override; + + void encodeFragment(Unicode *text, int len, UnicodeMap *uMap, + GBool primaryLR, GString *s) override; + + void computeLinePhysWidth(TextLine *line, UnicodeMap *uMap) override; +private: + std::map fontNameIds; + std::map fontTypeIds; + std::map fontSpecIds; + + std::queue availableIds; + + char space[8], eol[16]; + int spaceLen, eolLen; + UnicodeMap *uMap; +}; + +class FontOutputDev: public TextOutputDev { +public: + FontOutputDev(TextOutputFunc func, void *stream, + TextOutputControl *controlA) : TextOutputDev(func, stream, controlA) { + delete text; + text = new TextPageFont(&control); + } + + std::map getFontSpecs() { + return ((TextPageFont *)text)->getFontSpecs(); + } +}; + +#endif diff --git a/src/xpydf/PdfLoader.cc b/src/xpydf/PdfLoader.cc index e3a248e..0fc766c 100644 --- a/src/xpydf/PdfLoader.cc +++ b/src/xpydf/PdfLoader.cc @@ -1,4 +1,6 @@ #include +#include +#include #include #include #include @@ -6,6 +8,8 @@ #include #include +#include + #include "gmem.h" #include "gmempp.h" #include "parseargs.h" @@ -28,13 +32,39 @@ #include "config.h" #include "SplashOutputDev.h" #include "SplashBitmap.h" +#include "Annot.h" +#include "AcroForm.h" #include "PdfLoader.h" +#include "FontOutputDev.h" #include "ImageDataDev.h" #include "ImageInfoDev.h" +#define PAGES 1062 +#define PAGEE 1062 + static void outputToStringStream(void *stream, const char *text, int len) { + // fprintf(stderr, "Adding %d characters\n", len); + // if (len > 1) + // fprintf(stderr, "(%.3d) %s\n", len, text); + ((std::stringstream *)stream)->write(text, len); +} + +static void outputToStringStream2(void *stream, const char *text, int len) { + // fprintf(stderr, "Adding %d characters\n", len); + // if (len > 1) { + // fprintf(stderr, "(%.3d) ", 
len); + // for (int i = 0; i < len; i++) { + // if (text[i] == ' ') { + // fprintf(stderr, " "); + // } else { + // fprintf(stderr, "x"); + // } + // } + + // fprintf(stderr, "\n"); + // } ((std::stringstream *)stream)->write(text, len); } @@ -48,6 +78,7 @@ PdfLoader::PdfLoader(LoaderConfig config, char *fileName, char *ownerPw, char *u globalParams->setErrQuiet(config.quiet); globalParams->setMapNumericCharNames(config.mapNumericCharNames); globalParams->setMapUnknownCharNames(config.mapUnknownCharNames); + globalParams->setReadUnicodeCMap(config.readUnicodeCMap); globalParams->setupBaseFonts(NULL); switch (config.mode) { @@ -111,6 +142,8 @@ std::vector PdfLoader::extractText() { goto err; } + firstPage = PAGES; + lastPage = PAGEE; firstPage = 1; lastPage = doc->getNumPages(); @@ -118,6 +151,7 @@ std::vector PdfLoader::extractText() { if (textOut->isOk()) { for (int page = firstPage; page <= lastPage; page++) { + // fprintf(stderr, "Processing page %d\n", page); stream->str(""); doc->displayPages(textOut, page, page, 72, 72, 0, gFalse, gTrue, gFalse); pages.push_back(stream->str()); @@ -134,6 +168,44 @@ std::vector PdfLoader::extractText() { return pages; } +std::vector PdfLoader::extractFontMap(std::map &fontSpecs) { + FontOutputDev *fontOut; + std::stringstream *stream = new std::stringstream(); + std::vector pages; + int firstPage, lastPage; + + if (!doc->isOk()) { + goto err; + } + + firstPage = PAGES; + lastPage = PAGEE; + firstPage = 1; + lastPage = doc->getNumPages(); + + fontOut = new FontOutputDev(&outputToStringStream2, stream, &textOutControl); + + if (fontOut->isOk()) { + for (int page = firstPage; page <= lastPage; page++) { + // fprintf(stderr, "Processing page %d\n", page); + stream->str(""); + doc->displayPages(fontOut, page, page, 72, 72, 0, gFalse, gTrue, gFalse); + pages.push_back(stream->str()); + } + } + + fontSpecs = fontOut->getFontSpecs(); + + delete fontOut; +err: + delete stream; + + Object::memCheck(stderr); + gMemReport(stderr); + + 
return pages; +} + std::vector PdfLoader::extractPageInfo() { ImageInfoDev *imageOut; int firstPage, lastPage; @@ -170,7 +242,7 @@ std::vector PdfLoader::extractPageInfo() { } delete imageOut; - err: +err: Object::memCheck(stderr); gMemReport(stderr); @@ -178,6 +250,266 @@ std::vector PdfLoader::extractPageInfo() { return pagesInfo; } +static Ref *fonts; +static int fontsLen; +static int fontsSize; + +static char *seenObjs; +static int numObjects; + +void PdfLoader::scanFonts(Object *obj) { + Object obj2; + + if (checkFontObject(obj, &obj2) && obj2.isDict()) { + scanFonts(obj2.getDict()); + } + obj2.free(); +} + +void PdfLoader::scanFonts(Dict *resDict) { + Object fontDict1, fontDict2, xObjDict1, xObjDict2, xObj1, xObj2; + Object patternDict1, patternDict2, pattern1, pattern2; + Object gsDict1, gsDict2, gs1, gs2, smask1, smask2, smaskGroup1, smaskGroup2; + Object resObj; + Ref r; + GfxFontDict *gfxFontDict; + GfxFont *font; + int i; + + // scan the fonts in this resource dictionary + gfxFontDict = NULL; + resDict->lookupNF("Font", &fontDict1); + if (checkFontObject(&fontDict1, &fontDict2) && fontDict2.isDict()) { + if (fontDict1.isRef()) { + r = fontDict1.getRef(); + gfxFontDict = new GfxFontDict(doc->getXRef(), &r, fontDict2.getDict()); + } else { + gfxFontDict = new GfxFontDict(doc->getXRef(), NULL, fontDict2.getDict()); + } + if (gfxFontDict) { + for (i = 0; i < gfxFontDict->getNumFonts(); ++i) { + if ((font = gfxFontDict->getFont(i))) { + scanFont(font); + } + } + delete gfxFontDict; + } + } + + fontDict2.free(); + fontDict1.free(); + + // recursively scan any resource dictionaries in XObjects in this + // resource dictionary + resDict->lookupNF("XObject", &xObjDict1); + if (checkFontObject(&xObjDict1, &xObjDict2) && xObjDict2.isDict()) { + for (i = 0; i < xObjDict2.dictGetLength(); ++i) { + xObjDict2.dictGetValNF(i, &xObj1); + if (checkFontObject(&xObj1, &xObj2) && xObj2.isStream()) { + xObj2.streamGetDict()->lookupNF("Resources", &resObj); + 
scanFonts(&resObj); + resObj.free(); + } + xObj2.free(); + xObj1.free(); + } + } + xObjDict2.free(); + xObjDict1.free(); + + // recursively scan any resource dictionaries in Patterns in this + // resource dictionary + resDict->lookupNF("Pattern", &patternDict1); + if (checkFontObject(&patternDict1, &patternDict2) && patternDict2.isDict()) { + for (i = 0; i < patternDict2.dictGetLength(); ++i) { + patternDict2.dictGetValNF(i, &pattern1); + if (checkFontObject(&pattern1, &pattern2) && pattern2.isStream()) { + pattern2.streamGetDict()->lookupNF("Resources", &resObj); + scanFonts(&resObj); + resObj.free(); + } + pattern2.free(); + pattern1.free(); + } + } + patternDict2.free(); + patternDict1.free(); + + // recursively scan any resource dictionaries in ExtGStates in this + // resource dictionary + resDict->lookupNF("ExtGState", &gsDict1); + if (checkFontObject(&gsDict1, &gsDict2) && gsDict2.isDict()) { + for (i = 0; i < gsDict2.dictGetLength(); ++i) { + gsDict2.dictGetValNF(i, &gs1); + if (checkFontObject(&gs1, &gs2) && gs2.isDict()) { + gs2.dictLookupNF("SMask", &smask1); + if (checkFontObject(&smask1, &smask2) && smask2.isDict()) { + smask2.dictLookupNF("G", &smaskGroup1); + if (checkFontObject(&smaskGroup1, &smaskGroup2) && + smaskGroup2.isStream()) { + smaskGroup2.streamGetDict()->lookupNF("Resources", &resObj); + scanFonts(&resObj); + resObj.free(); + } + smaskGroup2.free(); + smaskGroup1.free(); + } + smask2.free(); + smask1.free(); + } + gs2.free(); + gs1.free(); + } + } + gsDict2.free(); + gsDict1.free(); +} + +std::map> fontDict; + +void PdfLoader::scanFont(GfxFont *font) { + Ref fontRef; + Object fontObj, toUnicodeObj; + GString *name; + int i; + + fontRef = *font->getID(); + + // check for an already-seen font + for (i = 0; i < fontsLen; ++i) { + if (fontRef.num == fonts[i].num && fontRef.gen == fonts[i].gen) { + return; + } + } + + // font name + name = font->getName(); + + // print the font info + if (name) { + char fontCode[1000], fontName[1000], 
fontType[1000]; + + if (sscanf(name->getCString(), "%[^+]+%[^-]-%s", fontCode, fontName, fontType) != EOF) { + if (fontDict.find(fontName) == fontDict.end()) { + fontDict[fontName] = std::set(); + } + + fontDict[fontName].insert(fontType); + } + } + + // add this font to the list + if (fontsLen == fontsSize) { + if (fontsSize <= INT_MAX - 32) { + fontsSize += 32; + } else { + // let greallocn throw an exception + fontsSize = -1; + } + fonts = (Ref *)greallocn(fonts, fontsSize, sizeof(Ref)); + } + fonts[fontsLen++] = *font->getID(); +} + +GBool PdfLoader::checkFontObject(Object *in, Object *out) { + int objNum; + + if (!in->isRef()) { + in->copy(out); + return gTrue; + } + objNum = in->getRefNum(); + if (objNum < 0 || objNum >= numObjects) { + out->initNull(); + return gTrue; + } + if (seenObjs[objNum]) { + out->initNull(); + return gFalse; + } + seenObjs[objNum] = (char)1; + in->fetch(doc->getXRef(), out); + return gTrue; +} + +std::vector PdfLoader::extractFonts() { + int firstPage, lastPage; + std::vector fontInfo; + + Dict *resDict; + Annots *annots; + AcroForm *form; + Object obj1, obj2; + + if (!doc->isOk()) { + goto err; + } + + firstPage = 1; + lastPage = doc->getNumPages(); + + fonts = NULL; + fontsLen = fontsSize = 0; + numObjects = doc->getXRef()->getNumObjects(); + seenObjs = (char *)gmalloc(numObjects); + memset(seenObjs, 0, numObjects); + + for (int page = firstPage; page <= lastPage; page++) { + Page *pdfPage = doc->getCatalog()->getPage(page); + + if ((resDict = pdfPage->getResourceDict())) { + // fprintf(stderr, "scanFonts\n"); + scanFonts(resDict); + } + + annots = new Annots(doc, pdfPage->getAnnots(&obj1)); + obj1.free(); + + for (int i = 0; i < annots->getNumAnnots(); i++) { + if (annots->getAnnot(i)->getAppearance(&obj1)->isStream()) { + obj1.streamGetDict()->lookupNF("Resources", &obj2); + scanFonts(&obj2); + obj2.free(); + } + obj1.free(); + } + delete annots; + } + if ((form = doc->getCatalog()->getForm())) { + for (int i = 0; i < 
form->getNumFields(); ++i) { + form->getField(i)->getResources(&obj1); + if (obj1.isArray()) { + for (int j = 0; j < obj1.arrayGetLength(); ++j) { + obj1.arrayGetNF(j, &obj2); + scanFonts(&obj2); + obj2.free(); + } + } else if (obj1.isDict()) { + scanFonts(obj1.getDict()); + } + obj1.free(); + } + } + + for (auto pair : fontDict) { + fprintf(stderr, "%s has types:\n", pair.first.c_str()); + for (auto ft : pair.second) { + fprintf(stderr, " - %s\n", ft.c_str()); + } + } + + fprintf(stderr, "Found %d fonts\n", fontsSize); + + gfree(fonts); + gfree(seenObjs); + +err: + Object::memCheck(stderr); + gMemReport(stderr); + + return fontInfo; +} + std::vector PdfLoader::extractImages(int pageNum) { ImageDataDev *imageOut; std::vector images; @@ -237,7 +569,7 @@ Image PdfLoader::pageToImage(int pageNum, int dpi) { memcpy(pageImage.data, bitmap->getDataPtr(), pageImage.size); delete splashOut; - err: +err: Object::memCheck(stderr); gMemReport(stderr); @@ -255,3 +587,35 @@ bool PdfLoader::isOk() { int PdfLoader::getErrorCode() { return (int)doc->getErrorCode(); } + +#include + +using namespace std; + +int main() { + LoaderConfig config; + map fontSpecs; + + PdfLoader *l = new PdfLoader(config, "skf.pdf"); + vector pageText = l->extractText(); + vector fontMap = l->extractFontMap(fontSpecs); + + // fprintf(stderr, "Read %lu pages text, %lu pages fontmap\n", pageText.size(), fontMap.size()); + + int diff = 0; + + for (int i = 0; i < pageText.size(); i++) { + diff += fabs((int)pageText[i].length() - (int)fontMap[i].length()); + if (pageText[i].length() != fontMap[i].length()) { + fprintf(stderr, "Page %d mismatch: %lu text, %lu font\n", i, pageText[i].length(), fontMap[i].length()); + } + } + + fprintf(stderr, "Total diff %d\n", diff); + + // cerr << pageText[0] << endl; + + // for (auto pair : fontSpecs) { + // fprintf(stderr, "Font id %d had name '%s', type '%s', size %d\n", pair.first, pair.second.fontName.c_str(), pair.second.fontType.c_str(), pair.second.fontSize); + // } +} 
diff --git a/src/xpydf/PdfLoader.h b/src/xpydf/PdfLoader.h
index ac583e6..e898dd3 100644
--- a/src/xpydf/PdfLoader.h
+++ b/src/xpydf/PdfLoader.h
@@ -9,6 +9,7 @@
 
 #include "PDFDoc.h"
 #include "TextOutputDev.h"
+#include "FontOutputDev.h"
 #include "ImageDataDev.h"
 #include "ImageInfoDev.h"
 
@@ -22,6 +23,7 @@ typedef struct LoaderConfig {
   GBool quiet = gTrue;
   GBool mapNumericCharNames = gFalse;
   GBool mapUnknownCharNames = gTrue;
+  GBool readUnicodeCMap = gTrue;
   unsigned int mode = 0;
 } LoaderConfig;
 
@@ -36,7 +38,9 @@ class PdfLoader {
   PdfLoader(LoaderConfig config, char *fileName, char *ownerPw = NULL, char *userPw = NULL);
   ~PdfLoader();
   std::vector extractText();
+  std::vector extractFontMap(std::map &fontSpecs);
   std::vector extractPageInfo();
+  std::vector extractFonts();
   std::vector extractImages(int pageNum);
   Image pageToImage(int pageNum, int dpi);
   bool isOk();
@@ -45,6 +49,11 @@ class PdfLoader {
   TextOutputControl textOutControl;
   PDFDoc *doc;
   GString *textFileName;
+
+  GBool checkFontObject(Object *in, Object *out);
+  void scanFont(GfxFont *font);
+  void scanFonts(Object *obj);
+  void scanFonts(Dict *resDict);
 };
 
 #endif
diff --git a/src/xpydf/PdfLoaderWrapper.cc b/src/xpydf/PdfLoaderWrapper.cc
index c1f3bf2..06b8ceb 100644
--- a/src/xpydf/PdfLoaderWrapper.cc
+++ b/src/xpydf/PdfLoaderWrapper.cc
@@ -23,15 +23,16 @@ PyObject *construct(PyObject *self, PyObject *args) {
   char *ownerPw = NULL;
   char *userPw = NULL;
 
-  PyArg_ParseTuple(args, "Opppppppbzz", &pobj0,
+  PyArg_ParseTuple(args, "OppppppppIzz", &pobj0,
     &(config.clipText),
     &(config.discardDiag),
     &(config.discardRotatedText),
     &(config.verbose),
     &(config.quiet),
-    &(config.mode),
     &(config.mapNumericCharNames),
     &(config.mapUnknownCharNames),
+    &(config.readUnicodeCMap),
+    &(config.mode),
     &ownerPw,
     &userPw
   );
@@ -112,6 +113,26 @@ PyObject *extractText(PyObject *self, PyObject *args) {
   return Py_BuildValue("O", converted);
 }
 
+// Python entry point: takes the loader capsule and returns a 2-tuple of
+// (list built from PdfLoader::extractFontMap, dict built from its font specs).
+PyObject *extractFontMap(PyObject *self, PyObject *args) {
+  PyObject *loaderCapsule;
+  // On failure the Python exception is already set; returning NULL propagates it.
+  if (!PyArg_ParseTuple(args, "O", &loaderCapsule)) return NULL;
+
+  PdfLoader *loader = (PdfLoader *)PyCapsule_GetPointer(loaderCapsule, "loaderPtr");
+  if (!loader) return NULL;
+
+  map fontSpecs;
+  vector result = loader->extractFontMap(fontSpecs);
+
+  PyObject *fontMap = vectorStringToList(result);
+  PyObject *fontDict = mapFontSpecsToDict(fontSpecs);
+
+  // "N" hands our references to the tuple; "O" would add new ones and leak both.
+  return Py_BuildValue("NN", fontMap, fontDict);
+}
+
 PyObject *extractPageInfo(PyObject *self, PyObject *args) {
   vector res;
 
@@ -125,6 +146,21 @@ PyObject *extractPageInfo(PyObject *self, PyObject *args) {
   return Py_BuildValue("O", converted);
 }
 
+// Python entry point: takes the loader capsule and returns the list built
+// from PdfLoader::extractFonts.
+PyObject *extractFonts(PyObject *self, PyObject *args) {
+  PyObject *loaderCapsule;
+  if (!PyArg_ParseTuple(args, "O", &loaderCapsule)) return NULL;
+
+  PdfLoader *loader = (PdfLoader *)PyCapsule_GetPointer(loaderCapsule, "loaderPtr");
+  if (!loader) return NULL;
+
+  vector result = loader->extractFonts();
+
+  // Return the converted list directly: wrapping it in Py_BuildValue("O", ...)
+  // would take a second reference that is never released.
+  return vectorStringToList(result);
+}
+
 PyObject *extractImages(PyObject *self, PyObject *args) {
   vector res;
 
@@ -210,10 +246,18 @@ PyMethodDef cXpdfPythonFunctions[] = {
    extractText, METH_VARARGS,
    "Extract text as bytes"},
 
+  {"extractFontMap",
+   extractFontMap, METH_VARARGS,
+   "Extract font map as bytes"},
+
   {"extractPageInfo",
    extractPageInfo, METH_VARARGS,
    "Extract image metadata"},
 
+  {"extractFonts",
+   extractFonts, METH_VARARGS,
+   "Extract font metadata"},
+
   {"extractImages",
    extractImages, METH_VARARGS,
    "Extract images"},
diff --git a/src/xpydf/PyCppConversion.cc b/src/xpydf/PyCppConversion.cc
index 540d6b4..3e824a7 100644
--- a/src/xpydf/PyCppConversion.cc
+++ b/src/xpydf/PyCppConversion.cc
@@ -64,3 +64,40 @@ PyObject *vectorImagesToList(const std::vector &data) {
 
   return listObj;
 }
+
+// Build a new Python dict from the font-spec map: each key is the map key as
+// an int, each value a dict with "name" (str), "type" (str) and "size" (int).
+// Throws logic_error when a dict cannot be allocated.
+PyObject *mapFontSpecsToDict(const std::map &data) {
+  PyObject *dict = PyDict_New();
+  if (!dict) throw logic_error("Unable to allocate memory for Python dict");
+
+  // Bind by const reference so each map entry is not copied per iteration.
+  for (const auto &pair : data) {
+    PyObject *item = PyDict_New();
+    if (!item) {
+      Py_DECREF(dict);  // do not leak the partially built result
+      throw logic_error("Unable to allocate memory for Python dict");
+    }
+
+    PyObject *id = PyLong_FromLong(pair.first);
+    PyObject *name = PyUnicode_FromString(pair.second.fontName.c_str());
+    PyObject *type = PyUnicode_FromString(pair.second.fontType.c_str());
+    PyObject *size = PyLong_FromLong(pair.second.fontSize);
+
+    PyDict_SetItemString(item, "name", name);
+    PyDict_SetItemString(item, "type", type);
+    PyDict_SetItemString(item, "size", size);
+
+    PyDict_SetItem(dict, id, item);
+
+    // The dict setters take their own references to key and value, so release
+    // ours; otherwise every object created in this iteration is leaked.
+    Py_XDECREF(size);
+    Py_XDECREF(type);
+    Py_XDECREF(name);
+    Py_XDECREF(id);
+    Py_DECREF(item);
+  }
+
+  return dict;
+}
diff --git a/src/xpydf/PyCppConversion.h b/src/xpydf/PyCppConversion.h
index 169523c..6e6f48c 100644
--- a/src/xpydf/PyCppConversion.h
+++ b/src/xpydf/PyCppConversion.h
@@ -1,6 +1,7 @@
 #ifndef PY_CPP_CONVERSION_H
 #define PY_CPP_CONVERSION_H
 
+#include
 #include
 #include
 
@@ -16,4 +17,6 @@ PyObject *vectorPagesToList(const std::vector &data);
 
 PyObject *vectorImagesToList(const std::vector &data);
 
+PyObject *mapFontSpecsToDict(const std::map &data);
+
 #endif
\ No newline at end of file
diff --git a/src/xpydf/cXpdfPython.pyi b/src/xpydf/cXpdfPython.pyi
index a2766d7..0eccc99 100644
--- a/src/xpydf/cXpdfPython.pyi
+++ b/src/xpydf/cXpdfPython.pyi
@@ -1,11 +1,16 @@
-from typing import Any, List, Optional
+from typing import Any, Dict, List, Optional, Tuple, TypedDict
 
 import numpy.typing as npt
 
 from xpydf.pdf_loader import PageInfo
 
 class XpdfPythonCapsule: ...
 
+class Font(TypedDict):
+    name: str
+    type: str
+    size: int
+
 def construct(
     filename: str,
     cliptext: bool,
@@ -13,14 +18,17 @@ def construct(
     discard_rotated_text: bool,
     verbose: bool,
     quiet: bool,
-    mode: int,
     mapNumericCharNames: bool = False,
     mapUnknownCharNames: bool = True,
+    readUnicodeCMap: bool = True,
+    mode: int = 0,
     ownerPw: Optional[str] = None,
     userPw: Optional[str] = None,
 ) -> XpdfPythonCapsule: ...
 def extractText(capsule: XpdfPythonCapsule) -> List[bytes]: ...
+def extractFontMap(capsule: XpdfPythonCapsule) -> Tuple[List[bytes], Dict[int, Font]]: ...
 def extractPageInfo(capsule: XpdfPythonCapsule) -> List[PageInfo]: ...
+def extractFonts(capsule: XpdfPythonCapsule) -> List[str]: ...
 def extractImages(capsule: XpdfPythonCapsule, page_number: int) -> List[npt.NDArray[Any]]: ...
 def pageToImage(capsule: XpdfPythonCapsule, page_number: int, dpi: int) -> npt.NDArray[Any]: ...
 def deleteObject(capsule: XpdfPythonCapsule) -> None: ...
diff --git a/src/xpydf/pdf_loader.py b/src/xpydf/pdf_loader.py
index 76c9748..f6a8a2e 100644
--- a/src/xpydf/pdf_loader.py
+++ b/src/xpydf/pdf_loader.py
@@ -1,8 +1,12 @@
-from typing import Any, List, Optional, TypedDict
+from typing import Any, Dict, List, Optional, Tuple, TypedDict
 
 import cXpdfPython
 import numpy.typing as npt
 
+class Font(TypedDict):
+    name: str
+    type: str
+    size: int
 
 class ImageInfo(TypedDict):
     """Container for image metadata
@@ -60,9 +64,10 @@ def __init__(
         discard_rotated_text: bool = True,
         verbose: bool = False,
         quiet: bool = True,
-        mode: str = "table",
         map_numeric_char_names: bool = False,
         map_unknown_char_names: bool = True,
+        read_unicode_cmap: bool = True,
+        mode: str = "table",
         owner_password: Optional[str] = None,
         user_password: Optional[str] = None,
     ):
@@ -105,7 +110,18 @@ def __init__(
 
         self.filename = filename
         self.capsule = cXpdfPython.construct(
-            filename, cliptext, discard_diag, discard_rotated_text, verbose, quiet, xpdf_mode, map_numeric_char_names, map_unknown_char_names, owner_password, user_password
+            filename,
+            cliptext,
+            discard_diag,
+            discard_rotated_text,
+            verbose,
+            quiet,
+            map_numeric_char_names,
+            map_unknown_char_names,
+            read_unicode_cmap,
+            xpdf_mode,
+            owner_password,
+            user_password
         )
 
     def extract_bytes(self) -> List[bytes]:
@@ -121,6 +137,22 @@ def extract_bytes(self) -> List[bytes]:
             pages = cXpdfPython.extractText(self.capsule)
 
         return pages
+
+    def extract_font_map(self) -> Tuple[List[bytes], Dict[int, Font]]:
+        """Extract the font map and the table of font specs from the pdf
+
+        Returns
+        -------
+        Tuple[List[bytes], Dict[int, Font]]
+            The font map entries, and a dict keyed by font id holding each
+            font's name, type and size; both empty when no capsule is loaded
+        """
+        pages: List[bytes] = []
+        fonts: Dict[int, Font] = {}
+        if self.capsule is not None:
+            pages, fonts = cXpdfPython.extractFontMap(self.capsule)
+
+        return pages, fonts
 
     def extract_strings(self) -> List[str]:
         """Extract and decode text from the pdf
@@ -147,6 +179,24 @@ def extract_page_info(self) -> List[PageInfo]:
 
         return images
 
+    def extract_fonts(self) -> List[str]:
+        """Return font related metadata from the pdf
+
+        Returns
+        -------
+        List[str]
+            The entries produced by cXpdfPython.extractFonts; empty when
+            no capsule is loaded
+        """
+        # NOTE(review): the C++ wrapper builds this list with the same
+        # converter it uses for extractFontMap, which is annotated
+        # List[bytes] -- confirm the element type is really str.
+        fonts: List[str] = []
+        if self.capsule is not None:
+            fonts = cXpdfPython.extractFonts(self.capsule)
+
+        return fonts
+
     def extract_images(self, page_number: int) -> List[npt.NDArray[Any]]:
         """Extract raw image data from a page, as a numpy array.
 
diff --git a/src/xpydf/pdf_loader.pyi b/src/xpydf/pdf_loader.pyi
index ec3c7c4..c594491 100644
--- a/src/xpydf/pdf_loader.pyi
+++ b/src/xpydf/pdf_loader.pyi
@@ -1,4 +1,4 @@
-from typing import Any, List, Optional, TypedDict
+from typing import Any, Dict, List, Optional, Tuple, TypedDict
 
 import numpy.typing as npt
 
@@ -14,6 +14,11 @@ class PageInfo(TypedDict):
     height: float
     images: List[ImageInfo]
 
+class Font(TypedDict):
+    name: str
+    type: str
+    size: int
+
 class PdfLoader:
     filename: str
     capsule: Optional[XpdfPythonCapsule] = None
@@ -28,15 +33,18 @@ class PdfLoader:
         insert_bom: bool = False,
         verbose: bool = False,
         quiet: bool = True,
-        mode: str = "table",
         map_numeric_char_names: bool = False,
         map_unknown_char_names: bool = True,
+        read_unicode_cmap: bool = True,
+        mode: str = "table",
         owner_password: Optional[str] = None,
         user_password: Optional[str] = None,
     ) -> None: ...
     def extract_bytes(self) -> List[bytes]: ...
+    def extract_font_map(self) -> Tuple[List[bytes], Dict[int, Font]]: ...
     def extract_strings(self) -> List[str]: ...
     def extract_page_info(self) -> List[PageInfo]: ...
+    def extract_fonts(self) -> List[str]: ...
     def extract_images(self, page_number: int) -> List[npt.NDArray[Any]]: ...
     def page_to_image(self, page_number: int, dpi: int = 150) -> npt.NDArray[Any]: ...
     def __del__(self) -> None: ...