fixed svg float lexing

cdillond · Jun 28, 2024 · 507a7cb · 507a7cb
1 parent 67d9439
commit 507a7cb
Show file tree

Hide file tree

Showing 9 changed files with 196 additions and 53 deletions.
diff --git a/font/hb_cgo.go b/font/hb_cgo.go
@@ -0,0 +1,79 @@
+package font
+
+/*
+#cgo LDFLAGS: -lharfbuzz -lharfbuzz-subset
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <harfbuzz/hb.h>
+#include <harfbuzz/hb-subset.h>
+
+// subset builds a subset of the font stored in src (src_len bytes) that covers only the
+// num_chars Unicode code points in uni_chars, keeping the original glyph IDs.
+//
+// On success it returns a malloc'd buffer holding the subset font and stores the buffer's
+// size in *out_len; the caller owns the buffer and must release it with free. On failure
+// it returns NULL and sets *out_len to 0. No pointer into src or uni_chars is retained
+// after the function returns.
+unsigned char *subset(const unsigned char *src, unsigned int src_len, const unsigned int *uni_chars, int num_chars, unsigned int *out_len)
+{
+    unsigned char *out = NULL;
+    *out_len = 0;
+
+    hb_blob_t *data = hb_blob_create_or_fail((const char *)src, src_len, HB_MEMORY_MODE_READONLY, NULL, NULL);
+    if (data == NULL)
+        return NULL;
+
+    // The face takes its own reference to the blob, so ours can be dropped immediately.
+    hb_face_t *face = hb_face_create(data, 0);
+    hb_blob_destroy(data);
+
+    hb_subset_input_t *input = hb_subset_input_create_or_fail();
+    if (input == NULL)
+        goto destroy_face;
+
+    hb_set_t *charset = hb_subset_input_unicode_set(input);
+    for (int i = 0; i < num_chars; i++)
+        hb_set_add(charset, uni_chars[i]);
+
+    hb_subset_input_set_flags(input, HB_SUBSET_FLAGS_RETAIN_GIDS);
+
+    hb_face_t *sub_face = hb_subset_or_fail(face, input);
+    if (sub_face == NULL)
+        goto destroy_input;
+
+    hb_blob_t *sub_blob = hb_face_reference_blob(sub_face);
+    unsigned int sub_len = 0;
+    const char *sub_data = hb_blob_get_data(sub_blob, &sub_len);
+    if (sub_data != NULL && sub_len > 0) {
+        // Copy into memory owned by the caller so that the result never has to fit into a
+        // buffer whose size was guessed before the subset was built.
+        out = malloc(sub_len);
+        if (out != NULL) {
+            memcpy(out, sub_data, sub_len);
+            *out_len = sub_len;
+        }
+    }
+    hb_blob_destroy(sub_blob);
+    hb_face_destroy(sub_face);
+destroy_input:
+    hb_subset_input_destroy(input);
+destroy_face:
+    hb_face_destroy(face);
+
+    return out;
+}
+*/
+import "C"
+import (
+	"fmt"
+	"unsafe"
+
+	"golang.org/x/image/font/sfnt"
+)
+
+// HBSubsetC can be used as a gdf.FontSubsetFunc. It calls functions in libharfbuzz and libharfbuzz-subset via CGo. In order
+// for this function to work, CGo must be enabled and HarfBuzz must be installed on your system.
+//
+// src is the complete font file and charset is the set of runes the subset must cover. The returned
+// slice is a freshly allocated copy of the subset font; src is not modified. An error is returned if
+// src is empty or if HarfBuzz fails to produce a subset.
+func HBSubsetC(_ *sfnt.Font, src []byte, charset map[rune]struct{}) ([]byte, error) {
+	if len(src) == 0 {
+		return nil, fmt.Errorf("error subsetting font")
+	}
+
+	// Convert the runes to 32-bit code points readable by hb-subset. The slice must start
+	// empty (length 0): appending to a slice that already has len(charset) zero elements
+	// would pass twice as many code points, half of them U+0000.
+	charsetU32 := make([]uint32, 0, len(charset))
+	for char := range charset {
+		charsetU32 = append(charsetU32, uint32(char))
+	}
+
+	var outLen C.uint
+	out := C.subset(
+		(*C.uchar)(unsafe.SliceData(src)),
+		C.uint(uint(len(src))),
+		(*C.uint)(unsafe.SliceData(charsetU32)),
+		C.int(len(charsetU32)),
+		&outLen)
+	if out == nil || outLen == 0 {
+		return nil, fmt.Errorf("error subsetting font")
+	}
+	// The buffer was allocated with malloc on the C side; copy it into Go memory and free it.
+	defer C.free(unsafe.Pointer(out))
+
+	return C.GoBytes(unsafe.Pointer(out), C.int(outLen)), nil
+}
diff --git a/font/others.go b/font/others.go
@@ -1,3 +1,5 @@
+//go:build !windows
+
 package font
 
 import (
@@ -9,11 +11,11 @@ import (
 	"golang.org/x/image/font/sfnt"
 )
 
-// HBSubset returns a func that can be used as a gdf.FontSubsetFunc on POSIX machines.
-// For this function to work, the HarfBuzz hb-subset tool must be installed. The HBSubset
-// func may handle edge cases that the TTFSubset func does not. hb-subset has a mature,
-// well-tested API and is capable of handling more font formats than the default function.
-// However, this approach requires a call to os.Exec and may not be suitable for all environments.
+// HBSubset returns a func that can be used as a gdf.FontSubsetFunc on systems with /dev/stdin
+// and /dev/stdout device files. For this function to work, the HarfBuzz hb-subset tool must
+// be installed. The HBSubset func may handle edge cases that the TTFSubset func does not. hb-subset
+// has a mature, well-tested API and is capable of handling more font formats than TTFSubset.
+// However, this approach requires os/exec, so it might not be suitable for all environments.
 func HBSubset(_ *sfnt.Font, src []byte, cutset map[rune]struct{}) ([]byte, error) {
 	u := make([]byte, 0, 512)
 	for key := range cutset {
@@ -24,15 +26,11 @@ func HBSubset(_ *sfnt.Font, src []byte, cutset map[rune]struct{}) ([]byte, error
 		return nil, fmt.Errorf("cutset is too small")
 	}
 	cmd := exec.Command("hb-subset",
-		"--font-file=/dev/stdin",
+		"--font-file=/dev/stdin", // must be passed explicitly as an arg
 		"-u", string(u[:len(u)-1]),
 		"--retain-gids",
-		"-o", "/dev/stdout",
+		"-o", "/dev/stdout", // ditto for stdout
 	)
 	cmd.Stdin = bytes.NewReader(src)
 	return cmd.Output()
 }
-
-// NoSubset can be used as a gdf.FontSubsetFunc when you wish to avoid subsetting a given
-// font. Beware: this can negatively impact the output PDF file size.
-func NoSubset(_ *sfnt.Font, src []byte, _ map[rune]struct{}) ([]byte, error) { return src, nil }
diff --git a/font/subset.go b/font/subset.go
@@ -36,6 +36,10 @@ var pdfTables = [...]TableTag{
 	//1886352244, // post
 }
 
+// NoSubset is a gdf.FontSubsetFunc that performs no subsetting at all: it hands src back
+// unchanged and never fails. Beware: embedding the complete font can negatively impact the
+// output PDF file size.
+func NoSubset(_ *sfnt.Font, src []byte, _ map[rune]struct{}) ([]byte, error) {
+	return src, nil
+}
+
 // TTFSubset is something of a poor man's subsetting function. It works - for TrueType fonts with 'glyf' tables only - by zeroing out
 // the outlines of all glyphs not corresponding to or directly referenced by f's glyphs for the runes in cutset,
 // truncating f's glyf and loca tables, and then writing only the required tables to the returned byte slice. The final subset font

diff --git a/gdf.go b/gdf.go
@@ -69,6 +69,7 @@ func includeChildren(pdf *PDF, o obj) error {
 			for key := range f.charset {
 				tmp[key] = struct{}{}
 			}
+
 			if f.FontSubsetFunc == nil {
 				f.FontSubsetFunc = font.TTFSubset
 			}

diff --git a/readme.md b/readme.md
@@ -44,11 +44,13 @@ The default basic unit for a PDF document is the point, defined as 1/72 of an in
 ## Raster Images
 In general, raster images displayed within a PDF document can be thought of as having two parts: a header, containing information about the image's size and encoding characteristics, and a byte slice representing the image's RGB/Gray/CMYK pixels in scanline order. (Alpha channel values must be encoded in a separate grayscale image.) Lossless compression filters can be applied to the byte slice to reduce its size, but this can be costly. Where possible, it is best to store images as pre-compressed XImage objects. As a notable exception, most JPEG images can be embedded in a PDF without the need to decode and re-encode them.
 
-## Fonts
-The PDF specification allows for several different types of font. gdf supports only TrueType/OpenType fonts. Unlike in many document formats, a PDF font determines the encoding of the text rendered in that font. Attending to all of the many font types, font table formats, encodings, etc. can quickly become tedious and overwhelming (not to mention difficult to debug without knowledge of non-Latin scripts). It is gdf's aim to be lightweight and simple, rather than comprehensive. gdf therefore only supports Windows-1252 ("WinAnsiEncoding") character encodings. gdf takes care of encoding the UTF-8 strings accepted as input to its functions, but users should be aware that any text that contains characters not included in the Windows-1252 character set will not be rendered as intended.
+## Fonts and Text Encoding
+There are many ways a font can exist in a PDF file, but gdf allows for just one. In its current form, gdf supports only TrueType/OpenType/WOFF typefaces with *nonsymbolic* characters. To render any text to a page, you must load a supported font using either the `LoadTrueType` function or the `LoadTrueTypeFile` function. Despite their names, these functions can also be used for OpenType and WOFF fonts. In PDF documents, the font used to render a piece of text determines the character encoding of that text. That is, PDF documents do not have a de facto character encoding; instead, a PDF document can be a patchwork of different, even custom encodings, each of which must be specified on a per-font basis. All text written to a PDF file by gdf is encoded using the Windows-1252 ("WinAnsiEncoding") code page. This covers nearly all English-language use cases, but it is, of course, less than ideal and, hopefully, temporary. Users should be aware that any text that contains characters not included in the Windows-1252 character set will not be rendered as intended.
+
+The PDF 2.0 spec requires fonts to be embedded in any PDF file that uses them, so font subsetting is strongly recommended. By default, gdf uses the `font.TTFSubset` function to subset embedded fonts, but this has known issues with WOFF fonts. The `font.HBSubset` function, which can be used as a replacement, is usually preferable, but it requires the user to install the HarfBuzz hb-subset tool. The function won't work on Windows, though it should be easy enough to tweak the source code so that it does.
 
 ## Text Formatting
-While text can be drawn directly to a `ContentStream` by calling methods like `ContentStream.ShowString()`, the `TextController` type implements line-breaking and text-shaping algorithms, and simplifies text formatting by offering an easier to use API.  
+While text can be drawn directly to a `ContentStream` by calling methods like `ContentStream.ShowString()`, the `text.Controller` type implements line-breaking and text-shaping algorithms, and simplifies text formatting by offering an easier to use API.
 
 ## Annotations and AcroForms
 Annotations are objects, rendered by the PDF viewer on a page, that are not part of the `Page`'s `ContentStream`. gdf supports 2 kinds of annotations: `TextAnnots` and `Widgets`. To a far greater extent than the graphics objects controlled by the `ContentStream`, the visual appearance of an annotation depends on the PDF viewing software.

diff --git a/svg/color.go b/svg/color.go
@@ -2,14 +2,21 @@ package svg
 
 import (
 	"strconv"
+	"strings"
+	"unicode"
 
 	"github.com/cdillond/gdf"
 )
 
 func parseColor(s string) (gdf.RGBColor, bool) {
+	if len(s) > 4 && s[:4] == "rgb(" {
+		return parseRGBFunc(s)
+	}
+
 	if rgb, ok := namedColors[s]; ok {
 		return rgb, ok
 	}
+
 	if len(s) < 1 {
 		return gdf.RGBColor{}, false
 	}
@@ -45,6 +52,36 @@ func parseColor(s string) (gdf.RGBColor, bool) {
 
 }
 
+// parseRGBFunc parses a CSS rgb() functional color, e.g. "rgb(255, 0, 127)",
+// "rgb(100% 0% 50%)" or "rgb(127 255 127 / 80%)". s is expected to begin with "rgb(".
+//
+// Each of the first three components may be a number (scaled from 0-255), a percentage
+// (scaled from 0%-100%) or the keyword "none" (treated as 0). The resulting channels are
+// clamped to the range [0, 1]. An alpha component, if present, is ignored. ok is false
+// if fewer than three components are found or if any of them is not a finite number.
+func parseRGBFunc(s string) (out gdf.RGBColor, ok bool) {
+	s = strings.TrimPrefix(s, "rgb(")
+	// Drop the alpha component up front so that "1 2 3/80%" and "1 2 3 / 80%" are treated alike.
+	s, _, _ = strings.Cut(s, "/")
+	// TrimFunc is used instead of Trim with a byte cutset: bytes such as "\x85" and "\xA0" in a
+	// cutset are not valid UTF-8 and therefore do not match the runes U+0085 and U+00A0.
+	s = strings.TrimFunc(s, func(r rune) bool {
+		return r == '(' || r == ')' || unicode.IsSpace(r)
+	})
+	cols := strings.FieldsFunc(s, func(r rune) bool {
+		return unicode.IsSpace(r) || r == ','
+	})
+	// this is an ad-hoc parsing method which does not account for every edge case of the CSS grammar
+	if len(cols) < 3 {
+		return out, false
+	}
+	var rgbArr [3]float64
+	for i := range rgbArr {
+		col := cols[i]
+		if col == "none" {
+			continue
+		}
+		scale := 255.
+		if strings.HasSuffix(col, "%") {
+			col = strings.TrimSuffix(col, "%")
+			scale = 100.
+		}
+		v, err := strconv.ParseFloat(col, 64)
+		// v-v is 0 for every finite value and NaN for NaN and ±Inf, which ParseFloat accepts.
+		if err != nil || v-v != 0 {
+			return out, false
+		}
+		v /= scale
+		if v < 0 {
+			v = 0
+		} else if v > 1 {
+			v = 1
+		}
+		rgbArr[i] = v
+	}
+	out.R, out.G, out.B = rgbArr[0], rgbArr[1], rgbArr[2]
+	return out, true
+}
+
 // NOTE(review): the two values below look swapped relative to their names. Color channels in
 // this file are scaled to the range 0-1 (see parseRGBFunc), which would make {1, 1, 1} white
 // and {0, 0, 0} black. Confirm against the call sites before changing either declaration.
 var rgbBlack = gdf.RGBColor{1, 1, 1}
 var rgbWhite = gdf.RGBColor{0, 0, 0}
 // badColor has every channel set to -1; presumably a sentinel for "no valid color" — TODO confirm.
 var badColor = gdf.RGBColor{R: -1, G: -1, B: -1}

diff --git a/svg/lexer.go b/svg/lexer.go
@@ -213,6 +213,7 @@ func lexPathNum(l *lexer) stateFn {
 
 		}
 	}
+	l.undo()
 	l.out <- token{typ: num, text: l.src[i:l.n]}
 	return nil
 
@@ -251,6 +252,7 @@ exponent:
 			return lexSVGPathOp
 		}
 	}
+	l.undo()
 	l.out <- token{typ: num, text: l.src[i:l.n]}
 	return nil
 }
@@ -305,6 +307,7 @@ func lexPolyNum(l *lexer) stateFn {
 
 		}
 	}
+	l.undo()
 	l.out <- token{typ: num, text: l.src[i:l.n]}
 	return nil
 
@@ -343,6 +346,7 @@ exponent:
 			return nil
 		}
 	}
+	l.undo()
 	l.out <- token{typ: num, text: l.src[i:l.n]}
 	return nil
 }

diff --git a/svg/readme.md b/svg/readme.md
@@ -1,7 +1,7 @@
 Package svg provides very limited, experimental facilities for rendering SVG images to PDFs. In addition to the obvious constraints (e.g., lack of animation), this package does not implement several important SVG features. Here's a rundown of some of them:
 1. There is limited support for CSS properties.
 2. Support for SVG text elements is unplanned.
-3. Elliptical Arc Curve (`A` and `a`) path commands are not supported, and the present solution for displaying ellipse elements needs substantial improvement.
+3. Elliptical Arc Curve (`A` and `a`) path commands may be improperly rendered.
 4. Mask elements and transparency/opacity-related attributes are not supported.
 
-These limitations preclude the use of this package for certain applications, but it can work with a surprising number of basic SVG images. Running an SVG through an SVG optimizer, such as SVGO(MG), and making simple manual adjustments to the SVG's source text can often fix rendering issues.
+These limitations preclude the use of this package for certain applications, but it can work with a surprising number of basic SVG images. Running `rsvg-convert` with the `-f svg` option on the input SVG prior to its inclusion in the PDF is **highly** recommended. Making simple manual adjustments to the SVG's source text can also often fix rendering issues.
diff --git a/svg/walk.go b/svg/walk.go
@@ -7,11 +7,46 @@ import (
 )
 
 type svgRoot struct {
-	n              *node
-	xContent       gdf.XContent
-	Height, Width  float64
-	HScale, WScale float64
-	defs           map[string]*node
+	n             *node
+	xContent      gdf.XContent
+	Height, Width float64
+	ViewBox       gdf.Rect
+	defs          map[string]*node
+}
+
+// abs returns the magnitude of f: negative inputs are negated, all other inputs are
+// returned unchanged.
+func abs(f float64) float64 {
+	if f < 0 {
+		return -f
+	}
+	return f
+}
+
+// node_type enumerates the kinds of node listed in the constant block below.
+type node_type uint
+
+// The node_type values are assigned consecutively from 0 via iota. Judging by their names
+// they cover SVG elements (svg, use, path, rect, polygon, circle, ellipse) followed by path
+// drawing operations (moveto, lineto, curveto, close path).
+const (
+	svg_type node_type = iota
+	use_type
+	path_type
+	rect_type
+	polygon_type
+	circle_type
+	ellipse_type
+	moveto_type
+	lineto_type
+	curveto_type
+	close_path_type
+)
+
+// svg2 is a tree node: it embeds a node_type and owns a slice of child svg2 values.
+// NOTE(review): nothing visible here constructs an svg2; it looks like an experimental
+// replacement for the existing node tree — confirm before relying on it.
+type svg2 struct {
+	children  []svg2 // this gives us an idea of the number
+	transform int    // meaning not evident from this file — TODO document
+	node_type
+}
+
+// walk2 recursively visits every descendant of s, depth first. It performs no work at the
+// nodes it visits.
+func walk2(s svg2) {
+	for _, child := range s.children {
+		walk2(child)
+	}
 }
 
 /*
@@ -21,14 +56,17 @@ at (0, 0), and end at the bottom right of the page at (w, h). Any graphics drawn
 are not rendered. If both a viewBox and height and width parameters are provided, then the height
 and width are interpreted instead as scale values. The values of an SVG with a height and width of 5px
 but a viewBox of 0 0 10 10 are scaled down by 2.
+
+OK: let's redo that and make it simpler. The height and width are the values against which relative lengths
+are resolved. The viewBox determines the BoundingBox and the initial clip path. If no viewBox is provided,
+the height and width are used, starting at 0, 0. If no height and width are provided, the viewBox is used.
+It is an error for neither to be provided. The viewBox dimensions are always in pixels.
 */
 
 func Decode(r io.Reader) (gdf.XContent, error) {
 	out := svgRoot{
 		n:        new(node),
 		xContent: *gdf.NewXContent(nil, gdf.Rect{}),
-		HScale:   1,
-		WScale:   1,
 		defs:     make(map[string]*node),
 	}
 	out.xContent.Filter = gdf.NoFilter // for now...
@@ -45,45 +83,25 @@ func Decode(r io.Reader) (gdf.XContent, error) {
 		w = *out.n.self.width
 		out.Width = w
 	}
-	/*
-		if out.n.self.viewBox != nil {
-			out.Width = px * (out.n.self.viewBox[2] - out.n.self.viewBox[0])
-			/*if out.n.self.width == nil {
-				out.Width = w
-			}*/
-	//out.Height = px * (out.n.self.viewBox[3] - out.n.self.viewBox[1])
-	/*if out.n.self.height == nil {
-		out.Height = h
-	}*/
-
-	//}
-
-	//if h != 0 {
-	//	out.HScale = out.Height / h
-	//} else {
-	out.HScale = 1
-	//}
-	//if w != 0 {
-	//	out.WScale = out.Width / w
-	//} else {
-	out.WScale = 1
-	//}
-
 	if out.n.self.viewBox != nil {
-		out.xContent.Re2(gdf.Rect{
+		out.ViewBox = gdf.Rect{
 			LLX: out.n.self.viewBox[0],
 			LLY: out.n.self.viewBox[1],
 			URX: out.n.self.viewBox[2],
 			URY: out.n.self.viewBox[3],
-		})
-		out.xContent.Clip(gdf.EvenOdd)
+		}
+	} else {
+		out.ViewBox = gdf.NewRect(gdf.Point{0, 0}, gdf.Point{out.Width, out.Height})
+	}
+	if out.n.self.width == nil && out.n.self.height == nil {
+		out.Width = abs(out.ViewBox.URX - out.ViewBox.LLX)
+		out.Height = abs(out.ViewBox.URY - out.ViewBox.LLY)
+	}
 
-	} //else {
 	out.xContent.BBox.URY = out.Height
 	out.xContent.BBox.URX = out.Width
-	//}
 
-	walk(out.n, &out.xContent, out.HScale, out.WScale, out.defs)
+	walk(out.n, &out.xContent, 1, 1, out.defs)
 	return out.xContent, nil
 }