Skip to content

Commit

Permalink
fixed svg float lexing
Browse files Browse the repository at this point in the history
  • Loading branch information
cdillond committed Jun 28, 2024
1 parent 67d9439 commit 507a7cb
Show file tree
Hide file tree
Showing 9 changed files with 196 additions and 53 deletions.
79 changes: 79 additions & 0 deletions font/hb_cgo.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package font

/*
#cgo LDFLAGS: -lharfbuzz -lharfbuzz-subset
#include <stdio.h>
#include <string.h>
#include <harfbuzz/hb.h>
#include <harfbuzz/hb-subset.h>
// subset builds a subset of the font in src (src_len bytes) that retains the
// num_chars Unicode code points listed in uni_chars, keeping the original glyph
// IDs, and copies the resulting font file into out.
//
// out must point to a buffer of at least src_len bytes; a subset that would not
// fit in that space is treated as a failure rather than written past the end.
//
// Returns the number of bytes written to out, or 0 on any failure.
int subset(const unsigned char *src, unsigned int src_len, uint32_t uni_chars[], int num_chars, unsigned char *out)
{
	int written = 0;
	unsigned int out_len = 0;
	const char *out_data = NULL;
	hb_face_t *face = NULL;
	hb_face_t *sub_face = NULL;
	hb_blob_t *sub_blob = NULL;
	hb_subset_input_t *input = NULL;
	hb_set_t *charset = NULL;

	if (src == NULL || out == NULL || src_len == 0)
		return 0;

	hb_blob_t *data = hb_blob_create_or_fail((const char *)src, src_len, HB_MEMORY_MODE_READONLY, NULL, NULL);
	if (data == NULL)
		return 0;
	face = hb_face_create(data, 0);
	// the face holds its own reference to the blob, so ours can be released now
	hb_blob_destroy(data);

	input = hb_subset_input_create_or_fail();
	if (input == NULL)
		goto destroy_face;
	charset = hb_subset_input_unicode_set(input);
	for (int i = 0; i < num_chars; i++)
		hb_set_add(charset, uni_chars[i]);
	hb_subset_input_set_flags(input, HB_SUBSET_FLAGS_RETAIN_GIDS);

	sub_face = hb_subset_or_fail(face, input);
	if (sub_face == NULL)
		goto destroy_input;
	sub_blob = hb_face_reference_blob(sub_face);
	out_data = hb_blob_get_data(sub_blob, &out_len);
	// never write more than the caller's buffer (src_len bytes) can hold
	if (out_data != NULL && out_len > 0 && out_len <= src_len) {
		memcpy(out, out_data, out_len);
		written = (int)out_len;
	}
	hb_blob_destroy(sub_blob);
	hb_face_destroy(sub_face);
destroy_input:
	hb_subset_input_destroy(input);
destroy_face:
	hb_face_destroy(face);
	return written;
}
*/
import "C"
import (
"fmt"
"unsafe"

"golang.org/x/image/font/sfnt"
)

// HBSubsetC can be used as a gdf.FontSubsetFunc. It calls functions in libharfbuzz and libharfbuzz-subset via CGo. In order
// for this function to work, CGo must be enabled and HarfBuzz must be installed on your system.
//
// src is the raw font file and charset is the set of runes whose glyphs must be kept. The returned
// slice holds the subset font file. An error is returned if src is empty or if the C subset function
// reports that nothing was written.
func HBSubsetC(_ *sfnt.Font, src []byte, charset map[rune]struct{}) ([]byte, error) {
	if len(src) == 0 {
		return nil, fmt.Errorf("error subsetting font: empty source")
	}

	// convert runes to uint32_t chars readable by hb-subset; the slice starts
	// empty (length 0) so that append does not leave len(charset) leading zeros
	charsetU32 := make([]uint32, 0, len(charset))
	for char := range charset {
		charsetU32 = append(charsetU32, uint32(char))
	}
	// only take the address of the backing array when there is one
	var charsetData *uint32
	if len(charsetU32) > 0 {
		charsetData = unsafe.SliceData(charsetU32)
	}

	// the C side writes at most len(src) bytes into this buffer
	b := make([]byte, len(src))

	written := int(C.subset(
		(*C.uchar)(unsafe.SliceData(src)),
		C.uint(uint(len(src))),
		(*C.uint32_t)(charsetData), // matches the uint32_t* parameter of the C prototype
		C.int(len(charsetU32)),
		(*C.uchar)(unsafe.SliceData(b))))
	if written < 1 || written > len(b) {
		return nil, fmt.Errorf("error subsetting font")
	}
	return b[:written], nil
}
20 changes: 9 additions & 11 deletions font/others.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build !windows

package font

import (
Expand All @@ -9,11 +11,11 @@ import (
"golang.org/x/image/font/sfnt"
)

// HBSubset returns a func that can be used as a gdf.FontSubsetFunc on POSIX machines.
// For this function to work, the HarfBuzz hb-subset tool must be installed. The HBSubset
// func may handle edge cases that the TTFSubset func does not. hb-subset has a mature,
// well-tested API and is capable of handling more font formats than the default function.
// However, this approach requires a call to os.Exec and may not be suitable for all environments.
// HBSubset returns a func that can be used as a gdf.FontSubsetFunc on systems with /dev/stdin
// and /dev/stdout device files. For this function to work, the HarfBuzz hb-subset tool must
// be installed. The HBSubset func may handle edge cases that the TTFSubset func does not. hb-subset
// has a mature, well-tested API and is capable of handling more font formats than TTFSubset.
// However, this approach requires os/exec, so it might not be suitable for all environments.
func HBSubset(_ *sfnt.Font, src []byte, cutset map[rune]struct{}) ([]byte, error) {
u := make([]byte, 0, 512)
for key := range cutset {
Expand All @@ -24,15 +26,11 @@ func HBSubset(_ *sfnt.Font, src []byte, cutset map[rune]struct{}) ([]byte, error
return nil, fmt.Errorf("cutset is too small")
}
cmd := exec.Command("hb-subset",
"--font-file=/dev/stdin",
"--font-file=/dev/stdin", // must be passed explicitly as an arg
"-u", string(u[:len(u)-1]),
"--retain-gids",
"-o", "/dev/stdout",
"-o", "/dev/stdout", // ditto for stdout
)
cmd.Stdin = bytes.NewReader(src)
return cmd.Output()
}

// NoSubset is a gdf.FontSubsetFunc for callers who do not want a font to be subset:
// it returns src itself, untouched, and never reports an error. Beware: embedding the
// full font can negatively impact the output PDF file size.
func NoSubset(_ *sfnt.Font, src []byte, _ map[rune]struct{}) ([]byte, error) {
	return src, nil
}
4 changes: 4 additions & 0 deletions font/subset.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ var pdfTables = [...]TableTag{
//1886352244, // post
}

// NoSubset is a gdf.FontSubsetFunc for callers who do not want a font to be subset:
// it returns src itself, untouched, and never reports an error. Beware: embedding the
// full font can negatively impact the output PDF file size.
func NoSubset(_ *sfnt.Font, src []byte, _ map[rune]struct{}) ([]byte, error) {
	return src, nil
}

// TTFSubset is something of a poor man's subsetting function. It works - for TrueType fonts with 'glyf' tables only - by zeroing out
// the outlines of all glyphs not corresponding to or directly referenced by f's glyphs for the runes in cutset,
// truncating f's glyf and loca tables, and then writing only the required tables to the returned byte slice. The final subset font
Expand Down
1 change: 1 addition & 0 deletions gdf.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ func includeChildren(pdf *PDF, o obj) error {
for key := range f.charset {
tmp[key] = struct{}{}
}

if f.FontSubsetFunc == nil {
f.FontSubsetFunc = font.TTFSubset
}
Expand Down
8 changes: 5 additions & 3 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,13 @@ The default basic unit for a PDF document is the point, defined as 1/72 of an in
## Raster Images
In general, raster images displayed within a PDF document can be thought of as having two parts: a header, containing information about the image's size and encoding characteristics, and a byte slice representing the image's RGB/Gray/CMYK pixels in scanline order. (Alpha channel values must be encoded in a separate grayscale image.) Lossless compression filters can be applied to the byte slice to reduce its size, but this can be costly. Where possible, it is best to store images as pre-compressed XImage objects. As a notable exception, most JPEG images can be embedded in a PDF without the need to decode and re-encode them.

## Fonts
The PDF specification allows for several different types of font. gdf supports only TrueType/OpenType fonts. Unlike in many document formats, a PDF font determines the encoding of the text rendered in that font. Attending to all of the many font types, font table formats, encodings, etc. can quickly become tedious and overwhelming (not to mention difficult to debug without knowledge of non-Latin scripts). It is gdf's aim to be lightweight and simple, rather than comprehensive. gdf therefore only supports Windows-1252 ("WinAnsiEncoding") character encodings. gdf takes care of encoding the UTF-8 strings accepted as input to its functions, but users should be aware that any text that contains characters not included in the Windows-1252 character set will not be rendered as intended.
## Fonts and Text Encoding
There are many ways a font can exist in a PDF file, but gdf allows for just one. In its current form, gdf supports only TrueType/OpenType/WOFF typefaces with *nonsymbolic* characters. To render any text to a page, you must load a supported font using either the `LoadTrueType` function or the `LoadTrueTypeFile` function. Despite their names, these functions can also be used for OpenType and WOFF fonts. In PDF documents, the font used to render a piece of text determines the character encoding of that text. That is, PDF documents do not have a de facto character encoding; instead a PDF document can be a patchwork of different, even custom encodings, each of which must be specified on a per-font basis. All text written to a PDF file by gdf is encoded using the Windows-1252 ("WinAnsiEncoding") code page. This covers nearly all English-language use cases, but it is, of course, less than ideal, and hopefully, temporary. Users should be aware that any text that contains characters not included in the Windows-1252 character set will not be rendered as intended.

The PDF 2.0 spec requires fonts to be embedded in any PDF file that uses them, so font subsetting is strongly recommended. By default, gdf uses the `font.TTFSubset` function to subset embedded fonts, but this has known issues with WOFF fonts. The `font.HBSubset` function, which can be used as a replacement, is usually preferable, but it requires the user to install the HarfBuzz hb-subset tool. The function won't work on Windows, though it should be easy enough to tweak the source code so that it does.

## Text Formatting
While text can be drawn directly to a `ContentStream` by calling methods like `ContentStream.ShowString()`, the `TextController` type implements line-breaking and text-shaping algorithms, and simplifies text formatting by offering an easier to use API.
While text can be drawn directly to a `ContentStream` by calling methods like `ContentStream.ShowString()`, the `text.Controller` type implements line-breaking and text-shaping algorithms, and simplifies text formatting by offering an easier to use API.

## Annotations and AcroForms
Annotations are objects, rendered by the PDF viewer on a page, that are not part of the `Page`'s `ContentStream`. gdf supports 2 kinds of annotations: `TextAnnots` and `Widgets`. To a far greater extent than the graphics objects controlled by the `ContentStream`, the visual appearance of an annotation depends on the PDF viewing software.
Expand Down
37 changes: 37 additions & 0 deletions svg/color.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,21 @@ package svg

import (
"strconv"
"strings"
"unicode"

"github.com/cdillond/gdf"
)

func parseColor(s string) (gdf.RGBColor, bool) {
if len(s) > 4 && s[:4] == "rgb(" {
return parseRGBFunc(s)
}

if rgb, ok := namedColors[s]; ok {
return rgb, ok
}

if len(s) < 1 {
return gdf.RGBColor{}, false
}
Expand Down Expand Up @@ -45,6 +52,36 @@ func parseColor(s string) (gdf.RGBColor, bool) {

}

// parseRGBFunc parses a CSS rgb() functional color such as "rgb(255, 0, 127)",
// "rgb(100% 0% 50%)" or "rgb(127 255 127 / 80%)" and returns it as a gdf.RGBColor
// whose components lie in [0, 1].
//
// Each of the first three components may be a number (scaled by 1/255), a
// percentage (scaled by 1/100) or the keyword "none" (treated as 0). Components
// outside the valid range are clamped. Any alpha component following a '/' is
// ignored. ok is false if s does not start with "rgb(", has fewer than three
// components, or contains a component that is not a valid number.
func parseRGBFunc(s string) (out gdf.RGBColor, ok bool) {
	if !strings.HasPrefix(s, "rgb(") {
		return out, false
	}
	s = s[len("rgb("):]
	// alpha is not supported; discard it so that "r g b/a" still yields three clean fields
	if i := strings.IndexByte(s, '/'); i >= 0 {
		s = s[:i]
	}
	s = strings.TrimFunc(s, func(r rune) bool {
		return r == '(' || r == ')' || unicode.IsSpace(r)
	})
	cols := strings.FieldsFunc(s, func(r rune) bool {
		return unicode.IsSpace(r) || r == ','
	})
	if len(cols) < 3 {
		return out, false
	}

	var rgbArr [3]float64
	for i := range rgbArr {
		field := cols[i]
		if field == "none" {
			continue
		}
		scale := 255.
		if strings.HasSuffix(field, "%") {
			field = strings.TrimSuffix(field, "%")
			scale = 100.
		}
		v, err := strconv.ParseFloat(field, 64)
		if err != nil || v != v { // v != v rejects NaN, which ParseFloat accepts
			return out, false
		}
		v /= scale
		// clamp out-of-range components
		if v < 0 {
			v = 0
		} else if v > 1 {
			v = 1
		}
		rgbArr[i] = v
	}
	out.R, out.G, out.B = rgbArr[0], rgbArr[1], rgbArr[2]
	return out, true
}

// Predefined colors. Components use the same 0-1 scale that parseRGBFunc produces
// (255 maps to 1), so black is all zeros and white is all ones.
var (
	rgbBlack = gdf.RGBColor{R: 0, G: 0, B: 0}
	rgbWhite = gdf.RGBColor{R: 1, G: 1, B: 1}
	// badColor has out-of-range components; presumably it marks a color that
	// could not be parsed - verify against its users.
	badColor = gdf.RGBColor{R: -1, G: -1, B: -1}
)
Expand Down
4 changes: 4 additions & 0 deletions svg/lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ func lexPathNum(l *lexer) stateFn {

}
}
l.undo()
l.out <- token{typ: num, text: l.src[i:l.n]}
return nil

Expand Down Expand Up @@ -251,6 +252,7 @@ exponent:
return lexSVGPathOp
}
}
l.undo()
l.out <- token{typ: num, text: l.src[i:l.n]}
return nil
}
Expand Down Expand Up @@ -305,6 +307,7 @@ func lexPolyNum(l *lexer) stateFn {

}
}
l.undo()
l.out <- token{typ: num, text: l.src[i:l.n]}
return nil

Expand Down Expand Up @@ -343,6 +346,7 @@ exponent:
return nil
}
}
l.undo()
l.out <- token{typ: num, text: l.src[i:l.n]}
return nil
}
Expand Down
4 changes: 2 additions & 2 deletions svg/readme.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package svg provides very limited, experimental facilities for rendering SVG images to PDFs. In addition to the obvious constraints (e.g., lack of animation), this package does not implement several important SVG features. Here's a rundown of some of them:
1. There is limited support for CSS properties.
2. Support for SVG text elements is unplanned.
3. Elliptical Arc Curve (`A` and `a`) path commands are not supported, and the present solution for displaying ellipse elements needs substantial improvement.
3. Elliptical Arc Curve (`A` and `a`) path commands may be improperly rendered.
4. Mask elements and transparency/opacity-related attributes are not supported.

These limitations preclude the use of this package for certain applications, but it can work with a surprising number of basic SVG images. Running an SVG through an SVG optimizer, such as SVGO(MG), and making simple manual adjustments to the SVG's source text can often fix rendering issues.
These limitations preclude the use of this package for certain applications, but it can work with a surprising number of basic SVG images. Running `rsvg-convert` with the `-f svg` option on the input SVG prior to its inclusion in the PDF is **highly** recommended. Making simple manual adjustments to the SVG's source text can also often fix rendering issues.
92 changes: 55 additions & 37 deletions svg/walk.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,46 @@ import (
)

type svgRoot struct {
n *node
xContent gdf.XContent
Height, Width float64
HScale, WScale float64
defs map[string]*node
n *node
xContent gdf.XContent
Height, Width float64
ViewBox gdf.Rect
defs map[string]*node
}

// abs returns the magnitude of f: a value below zero is negated, and any other
// value is returned unchanged.
func abs(f float64) float64 {
	if f < 0 {
		return -f
	}
	return f
}

// node_type identifies the kind of a node. Judging by the constant names, the
// kinds cover SVG elements as well as individual path commands - TODO confirm
// against the code that consumes them.
type node_type uint

// node_type values, numbered consecutively from 0 via iota.
const (
svg_type node_type = iota
use_type
path_type
rect_type
polygon_type
circle_type
ellipse_type
moveto_type
lineto_type
curveto_type
close_path_type
)

// svg2 is a tree node: it embeds a node_type, carries a transform value and owns
// a slice of child svg2 nodes.
// NOTE(review): in the visible code it is only used by walk2, so it looks like
// scaffolding for a reworked tree representation - confirm before relying on it.
type svg2 struct {
children []svg2 // this gives us an idea of the number
transform int // meaning not evident from this file; presumably identifies a transform - TODO confirm
node_type
}

// walk2 recursively descends into every child of s, depth-first. It performs no
// work at any node.
func walk2(s svg2) {
	for _, child := range s.children {
		walk2(child)
	}
}

/*
Expand All @@ -21,14 +56,17 @@ at (0, 0), and end at the bottom right of the page at (w, h). Any graphics drawn
are not rendered. If both a viewBox and height and width parameters are provided, then the height
and width are interpreted instead as scale values. The values of an SVG with a height and width of 5px
but a viewBox of 0 0 10 10 are scaled down by 2.
OK: let's redo that and make it simpler. The height and width are the values against which relative lengths
are resolved. The viewBox determines the BoundingBox and the initial clip path. If no viewBox is provided,
the height and width are used, starting at 0, 0. If no height and width are provided, the viewBox is used.
It is an error for neither to be provided. The viewBox dimensions are always in pixels.
*/

func Decode(r io.Reader) (gdf.XContent, error) {
out := svgRoot{
n: new(node),
xContent: *gdf.NewXContent(nil, gdf.Rect{}),
HScale: 1,
WScale: 1,
defs: make(map[string]*node),
}
out.xContent.Filter = gdf.NoFilter // for now...
Expand All @@ -45,45 +83,25 @@ func Decode(r io.Reader) (gdf.XContent, error) {
w = *out.n.self.width
out.Width = w
}
/*
if out.n.self.viewBox != nil {
out.Width = px * (out.n.self.viewBox[2] - out.n.self.viewBox[0])
/*if out.n.self.width == nil {
out.Width = w
}*/
//out.Height = px * (out.n.self.viewBox[3] - out.n.self.viewBox[1])
/*if out.n.self.height == nil {
out.Height = h
}*/

//}

//if h != 0 {
// out.HScale = out.Height / h
//} else {
out.HScale = 1
//}
//if w != 0 {
// out.WScale = out.Width / w
//} else {
out.WScale = 1
//}

if out.n.self.viewBox != nil {
out.xContent.Re2(gdf.Rect{
out.ViewBox = gdf.Rect{
LLX: out.n.self.viewBox[0],
LLY: out.n.self.viewBox[1],
URX: out.n.self.viewBox[2],
URY: out.n.self.viewBox[3],
})
out.xContent.Clip(gdf.EvenOdd)
}
} else {
out.ViewBox = gdf.NewRect(gdf.Point{0, 0}, gdf.Point{out.Width, out.Height})
}
if out.n.self.width == nil && out.n.self.height == nil {
out.Width = abs(out.ViewBox.URX - out.ViewBox.LLX)
out.Height = abs(out.ViewBox.URY - out.ViewBox.LLY)
}

} //else {
out.xContent.BBox.URY = out.Height
out.xContent.BBox.URX = out.Width
//}

walk(out.n, &out.xContent, out.HScale, out.WScale, out.defs)
walk(out.n, &out.xContent, 1, 1, out.defs)
return out.xContent, nil
}

Expand Down

0 comments on commit 507a7cb

Please sign in to comment.