Skip to content
This repository has been archived by the owner on Dec 9, 2018. It is now read-only.

Convert illegal html unicode without glyph to space or zero-width space. #493

Open
wants to merge 1 commit into
base: incoming
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions 3rdparty/poppler/git/CairoFontEngine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -377,14 +377,18 @@ _ft_new_face (FT_Library lib,

CairoFreeTypeFont::CairoFreeTypeFont(Ref ref,
cairo_font_face_t *cairo_font_face,
FT_Face ft_face,
int *codeToGID,
Guint codeToGIDLen,
GBool substitute) : CairoFont(ref,
cairo_font_face,
codeToGID,
codeToGIDLen,
substitute,
gTrue) { }
gTrue),
// Caution: this field is added by pdf2htmlEX to determine whitespace. Please merge during update.
ft_face(ft_face)
{ }

CairoFreeTypeFont::~CairoFreeTypeFont() { }

Expand Down Expand Up @@ -546,7 +550,7 @@ CairoFreeTypeFont *CairoFreeTypeFont::create(GfxFont *gfxFont, XRef *xref,

delete fontLoc;
return new CairoFreeTypeFont(ref,
font_face,
font_face, face,
codeToGID, codeToGIDLen,
substitute);

Expand Down
6 changes: 4 additions & 2 deletions 3rdparty/poppler/git/CairoFontEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,12 @@ class CairoFreeTypeFont : public CairoFont {
public:
static CairoFreeTypeFont *create(GfxFont *gfxFont, XRef *xref, FT_Library lib, GBool useCIDs);
virtual ~CairoFreeTypeFont();

// Caution: this function is added by pdf2htmlEX to determine whitespace. Please merge during update.
FT_Face get_ft_face() { return ft_face; }
private:
CairoFreeTypeFont(Ref ref, cairo_font_face_t *cairo_font_face,
CairoFreeTypeFont(Ref ref, cairo_font_face_t *cairo_font_face, FT_Face ft_face,
int *codeToGID, Guint codeToGIDLen, GBool substitute);
FT_Face ft_face;
};

//------------------------------------------------------------------------
Expand Down
12 changes: 12 additions & 0 deletions src/HTMLRenderer/HTMLRenderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include <fstream>
#include <memory>

#include <ft2build.h>
#include FT_FREETYPE_H
#include <OutputDev.h>
#include <GfxState.h>
#include <Stream.h>
Expand Down Expand Up @@ -42,6 +44,7 @@
#include "util/const.h"
#include "util/misc.h"

class CairoFontEngine;

namespace pdf2htmlEX {

Expand Down Expand Up @@ -217,6 +220,10 @@ struct HTMLRenderer : OutputDev
// make sure the current HTML style consistent with PDF
void prepare_text_line(GfxState * state);

// Check whether this char has a non-empty glyph in this font. If not sure, return true.
// A char that has an empty glyph or no glyph is usually whitespace.
bool has_glyph(CharCode code, GfxFont* font);

////////////////////////////////////////////////////
// PDF stuffs
////////////////////////////////////////////////////
Expand Down Expand Up @@ -341,6 +348,11 @@ struct HTMLRenderer : OutputDev

CoveredTextDetector covered_text_detector;
DrawingTracer tracer;

#if ENABLE_SVG
FT_Library ft_lib;
std::unique_ptr<CairoFontEngine> font_engine;
#endif
};

} //namespace pdf2htmlEX
Expand Down
26 changes: 26 additions & 0 deletions src/HTMLRenderer/font.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include "CairoFontEngine.h"
#include "CairoOutputDev.h"
#include <Gfx.h>
#include FT_OUTLINE_H
#endif

namespace pdf2htmlEX {
Expand Down Expand Up @@ -1082,4 +1083,29 @@ void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, cons
f_css.fs << "}" << endl;
}

// Report whether `code` maps to a non-empty glyph in `font`.
//
// Returns false only when the glyph is known to be missing or empty:
//   - the font maps the code to glyph index 0,
//   - FreeType fails to load the glyph, or
//   - the glyph is an outline with zero contours.
// Returns true in every other case, including when the font cannot be
// inspected at all (Type 3 fonts, font/face unavailable, or a build without
// ENABLE_SVG), matching the declared contract "If not sure, return true".
bool HTMLRenderer::has_glyph(CharCode code, GfxFont* font)
{
#if ENABLE_SVG
    // Type 3 glyphs are not looked up through FreeType here; assume visible.
    if (font->getType() == fontType3)
        return true;

    auto * ftfont = static_cast<CairoFreeTypeFont*>(font_engine->getFont(font, cur_doc, false, xref));
    // Without a font object nothing can be determined, so stay conservative
    // and report a glyph rather than letting the caller drop the character.
    if (ftfont == nullptr)
        return true;

    FT_Face face = ftfont->get_ft_face();
    // Same reasoning: no face means "not sure", not "no glyph".
    if (face == nullptr)
        return true;

    auto gid = ftfont->getGlyph(code, nullptr, 0);
    // gid == 0 means no glyph
    if (gid == 0)
        return false;

    // FT_Load_Glyph returns non-zero on error; a glyph that cannot be loaded
    // is treated as absent.
    if (FT_Load_Glyph(face, gid, FT_LOAD_NO_SCALE))
        return false;

    FT_GlyphSlot slot = face->glyph;
    // n_contours == 0 means an empty glyph
    if (slot->format == FT_GLYPH_FORMAT_OUTLINE && slot->outline.n_contours == 0)
        return false;
#endif
    return true;
}

} //namespace pdf2htmlEX
12 changes: 12 additions & 0 deletions src/HTMLRenderer/general.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
#include "util/css_const.h"
#include "util/encoding.h"

#if ENABLE_SVG
#include "CairoFontEngine.h"
#endif

namespace pdf2htmlEX {

using std::fixed;
Expand Down Expand Up @@ -86,11 +90,19 @@ HTMLRenderer::HTMLRenderer(const Param & param)
[this](double * box, bool partial) { covered_text_detector.add_char_bbox_clipped(box, partial); };
tracer.on_non_char_drawn =
[this](double * box) { covered_text_detector.add_non_char_bbox(box); };

#if ENABLE_SVG
FT_Init_FreeType(&ft_lib);
font_engine = std::unique_ptr<CairoFontEngine>(new CairoFontEngine(ft_lib));
#endif
}

// Tear down the renderer: finalize the ffw layer and, when SVG support is
// compiled in, release the font engine and the FreeType library.
HTMLRenderer::~HTMLRenderer()
{
    ffw_finalize();
#if ENABLE_SVG
    // font_engine was constructed from ft_lib, and data members are destroyed
    // only after this destructor body finishes. Release the engine explicitly
    // first so it is never torn down against an already-destroyed FreeType
    // library (FT_Done_FreeType also destroys the library's child objects).
    font_engine.reset();
    FT_Done_FreeType(ft_lib);
#endif
}

void HTMLRenderer::process(PDFDoc *doc)
Expand Down
42 changes: 27 additions & 15 deletions src/HTMLRenderer/text.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
while (len > 0)
{
auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy);
HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0]));
HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)%s\n", (wchar_t)u[0], u[0], has_glyph(code, font) ? "":" no glyph"));

if(!(equal(ox, 0) && equal(oy, 0)))
{
Expand Down Expand Up @@ -113,24 +113,36 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
}
else
{
Unicode uu;
if(cur_text_state.font_info->use_tounicode)
if (uLen == 1 && is_illegal_unicode(u[0]) && !has_glyph(code, font))
{
uu = check_unicode(u, uLen, code, font);
// Convert illegal html unicode to a whitespace, if it has no glyph.
// Add a zero-width space AFTER the offset to make sure words are
// delimited, and make sure the ZWSP can be optimized out if the
// offset is represented by a space (see HTMLTextLine::dump_unicode).
html_text_page.get_cur_line()->append_offset(ddx * draw_text_scale);
html_text_page.get_cur_line()->append_unicodes(&zero_width_space, 1, 0);
}
else
{
uu = unicode_from_font(code, font);
}
html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx);
/*
* In PDF, word_space is appended if (n == 1 and *p = ' ')
* but in HTML, word_space is appended if (uu == ' ')
*/
int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 1 : 0);
if(space_count != 0)
{
html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count);
Unicode uu;
if(cur_text_state.font_info->use_tounicode)
{
uu = check_unicode(u, uLen, code, font);
}
else
{
uu = unicode_from_font(code, font);
}
html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx);
/*
* In PDF, word_space is appended if (n == 1 and *p = ' ')
* but in HTML, word_space is appended if (uu == ' ')
*/
int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 1 : 0);
if(space_count != 0)
{
html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count);
}
}
}
}
Expand Down
26 changes: 20 additions & 6 deletions src/HTMLTextLine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

#include "util/encoding.h"
#include "util/css_const.h"
#include "util/unicode.h"

namespace pdf2htmlEX {

Expand All @@ -32,6 +33,7 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
,clip_x1(0)
,clip_y1(0)
,width(0)
,last_output_unicode(0)
{ }

void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
Expand Down Expand Up @@ -88,16 +90,25 @@ void HTMLTextLine::dump_char(std::ostream & out, int pos)
int c = text[pos];
if (c > 0)
{
Unicode u = c;
writeUnicodes(out, &u, 1);
dump_unicode(out, c);
}
else if (c < 0)
{
auto dt = decomposed_text[- c - 1];
writeUnicodes(out, &dt.front(), dt.size());
for (auto it = dt.begin(), end = dt.end(); it != end; it++)
dump_unicode(out, *it);
}
}

// Write a single code point to `out` and remember it as the last one emitted.
// A zero-width space that would directly follow a plain space is dropped:
// nothing is written and last_output_unicode is left unchanged.
void HTMLTextLine::dump_unicode(std::ostream & out, Unicode u)
{
    const bool redundant_zwsp = (last_output_unicode == ' ') && (u == zero_width_space);
    if (!redundant_zwsp)
    {
        writeUnicodes(out, &u, 1);
        last_output_unicode = u;
    }
}

void HTMLTextLine::dump_chars(ostream & out, int begin, int len)
{
static const Color transparent(0, 0, 0, true);
Expand Down Expand Up @@ -162,6 +173,7 @@ void HTMLTextLine::dump_text(ostream & out)
<< " " << CSS::BOTTOM_CN << all_manager.bottom.install(line_state.y - clip_y1)
;
// it will be closed by the first state
last_output_unicode = 0;
}

std::vector<State*> stack;
Expand Down Expand Up @@ -249,8 +261,7 @@ void HTMLTextLine::dump_text(ostream & out)
double space_off = state_iter1->single_space_offset();
if(std::abs(target - space_off) <= param.h_eps)
{
Unicode u = ' ';
writeUnicodes(out, &u, 1);
dump_unicode(out, ' ');
actual_offset = space_off;
done = true;
}
Expand All @@ -269,7 +280,10 @@ void HTMLTextLine::dump_text(ostream & out)
double threshold = state_iter1->em_size() * (param.space_threshold);

out << "<span class=\"" << CSS::WHITESPACE_CN
<< ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
<< ' ' << CSS::WHITESPACE_CN << wid << "\">";
if (target > (threshold - EPS))
dump_unicode(out, ' ');
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that the space character will bring extra width after the span, which could be unintended?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The space is in that span, not after it. This change doesn't change previous behavior, just updates last_output_unicode.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see, sorry for the mistake.

out << "</span>";
}
}
}
Expand Down
3 changes: 3 additions & 0 deletions src/HTMLTextLine.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ class HTMLTextLine
*/
void dump_chars(std::ostream & out, int begin, int len);
void dump_char(std::ostream & out, int pos);
void dump_unicode(std::ostream & out, Unicode u);

const Param & param;
AllStateManager & all_manager;
Expand All @@ -128,6 +129,8 @@ class HTMLTextLine
*/
std::vector<int> text;
std::vector<std::vector<Unicode> > decomposed_text;

Unicode last_output_unicode; //last unicode written to html (chars in tags excluded)
};

} // namespace pdf2htmlEX
Expand Down
2 changes: 2 additions & 0 deletions src/util/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

namespace pdf2htmlEX {

// Code point U+200B (ZERO WIDTH SPACE).
const Unicode zero_width_space = 0x200B;

/**
* Check whether a unicode character is illegal for the output HTML.
* Unlike PDF readers, browsers has special treatments for such characters (normally treated as
Expand Down