Skip to content

Commit

Permalink
Merge pull request #14 from neilpa/unicode-encodings
Browse files Browse the repository at this point in the history
UTF-16 support for JSON and option to skip BOMs
  • Loading branch information
neilpa authored Oct 6, 2020
2 parents 5071019 + 87a159c commit b2b9a85
Show file tree
Hide file tree
Showing 51 changed files with 286 additions and 44 deletions.
56 changes: 56 additions & 0 deletions gen_testdata.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// +build ignore

// gen_testdata clones the utf-8 test data to the other
// unicode encodings and adds BOM variants of each.
package main

import (
"io/ioutil"
"log"
"os"
"path/filepath"

"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
)

// main regenerates the derived test fixtures. Every file matching
// testdata/utf-8/* is copied with a UTF-8 BOM into testdata/utf-8_bom and
// re-encoded as UTF-16 big and little endian, each written once without a
// BOM (testdata/utf-16XX) and once with one (testdata/utf-16XX_bom).
// Any error aborts the program via log.Fatal.
func main() {
	// Target directory, BOM bytes and encoding for each UTF-16 flavour. The
	// BOM is written explicitly by write rather than by the encoder, which is
	// why the encodings are built with the IgnoreBOM policy.
	var xforms = []struct {
		dir, bom string
		enc      encoding.Encoding
	}{
		{"testdata/utf-16be", "\xFE\xFF", unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)},
		{"testdata/utf-16le", "\xFF\xFE", unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)},
	}

	paths, err := filepath.Glob("testdata/utf-8/*")
	if err != nil {
		log.Fatal(err)
	}
	if len(paths) == 0 {
		// Most likely the generator was started from the wrong directory;
		// fail loudly instead of silently producing no fixtures.
		log.Fatal("no source files match testdata/utf-8/*")
	}

	for _, p := range paths {
		src, err := ioutil.ReadFile(p)
		if err != nil {
			log.Fatal(err)
		}

		write("testdata/utf-8_bom", p, "\xEF\xBB\xBF", src)
		for _, xform := range xforms {
			dst, err := xform.enc.NewEncoder().Bytes(src)
			if err != nil {
				log.Fatal(err)
			}
			write(xform.dir, p, "", dst)
			write(xform.dir+"_bom", p, xform.bom, dst)
		}
	}
}

// write creates (or truncates) a file inside dir, named after the base name
// of orig, and fills it with bom followed by buf. dir is created first if it
// does not exist yet. Any failure terminates the program via log.Fatal.
func write(dir, orig, bom string, buf []byte) {
	if err := os.MkdirAll(dir, 0755); err != nil {
		log.Fatal(err)
	}
	f, err := os.Create(filepath.Join(dir, filepath.Base(orig)))
	if err != nil {
		log.Fatal(err)
	}
	if _, err = f.Write([]byte(bom)); err != nil {
		log.Fatal(err)
	}
	if _, err = f.Write(buf); err != nil {
		log.Fatal(err)
	}
	// Close explicitly (not via defer, which log.Fatal would skip) so the
	// handle is released per file and a failure reported at close time is
	// not lost.
	if err = f.Close(); err != nil {
		log.Fatal(err)
	}
}
93 changes: 80 additions & 13 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
// a provided JSON Schema - https://json-schema.org/
package main

//go:generate go run gen_testdata.go

import (
"bufio"
"bytes"
"flag"
"fmt"
"io"
Expand All @@ -15,21 +18,37 @@ import (
"strings"
"sync"

"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"

"github.com/ghodss/yaml"
"github.com/mitchellh/go-homedir"
"github.com/xeipuuv/gojsonschema"
)

var (
version = "v1.3.0-dev"
version = "v1.4.0-dev"
schemaFlag = flag.String("s", "", "primary JSON schema to validate against, required")
quietFlag = flag.Bool("q", false, "quiet, only print validation failures and errors")
versionFlag = flag.Bool("v", false, "print version and exit")
bomFlag = flag.Bool("b", false, "allow BOM in JSON files, error if seen and unset")

listFlags stringFlags
refFlags stringFlags
)

// Byte order marks, as raw byte strings, that are checked for at the start
// of a JSON document.
// https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding
const (
bomUTF8 = "\xEF\xBB\xBF"
bomUTF16BE = "\xFE\xFF"
bomUTF16LE = "\xFF\xFE"
)

// UTF-16 encodings used to decode JSON text into UTF-8. Both are built with
// the IgnoreBOM policy; jsonDecodeCharset detects and strips a leading BOM
// itself before handing the buffer to the decoder.
var (
encUTF16BE = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)
encUTF16LE = unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)
)

func init() {
flag.Var(&listFlags, "l", "validate JSON documents from newline separated paths and/or globs in a text file (relative to the basename of the file itself)")
flag.Var(&refFlags, "r", "referenced schema(s), can be globs and/or used multiple times")
Expand Down Expand Up @@ -60,7 +79,7 @@ func realMain(args []string, w io.Writer) int {
dir := filepath.Dir(list)
f, err := os.Open(list)
if err != nil {
log.Fatalf("%s: %s\n", list, err)
return schemaError("%s: %s", list, err)
}
defer f.Close()

Expand All @@ -74,7 +93,7 @@ func realMain(args []string, w io.Writer) int {
docs = append(docs, glob(pattern)...)
}
if err := scanner.Err(); err != nil {
log.Fatalf("%s: invalid file list: %s\n", list, err)
return schemaError("%s: invalid file list: %s", list, err)
}
}
if len(docs) == 0 {
Expand All @@ -85,13 +104,13 @@ func realMain(args []string, w io.Writer) int {
sl := gojsonschema.NewSchemaLoader()
schemaPath, err := filepath.Abs(*schemaFlag)
if err != nil {
log.Fatalf("%s: unable to convert to absolute path: %s\n", *schemaFlag, err)
return schemaError("%s: unable to convert to absolute path: %s", *schemaFlag, err)
}
for _, ref := range refFlags {
for _, p := range glob(ref) {
absPath, err := filepath.Abs(p)
if err != nil {
log.Fatalf("%s: unable to convert to absolute path: %s\n", absPath, err)
return schemaError("%s: unable to convert to absolute path: %s", absPath, err)
}

if absPath == schemaPath {
Expand All @@ -100,22 +119,22 @@ func realMain(args []string, w io.Writer) int {

loader, err := jsonLoader(absPath)
if err != nil {
log.Fatalf("%s: unable to load schema ref: %s\n", *schemaFlag, err)
return schemaError("%s: unable to load schema ref: %s", *schemaFlag, err)
}

if err := sl.AddSchemas(loader); err != nil {
log.Fatalf("%s: invalid schema: %s\n", p, err)
return schemaError("%s: invalid schema: %s", p, err)
}
}
}

schemaLoader, err := jsonLoader(schemaPath)
if err != nil {
log.Fatalf("%s: unable to load schema: %s\n", *schemaFlag, err)
return schemaError("%s: unable to load schema: %s", *schemaFlag, err)
}
schema, err := sl.Compile(schemaLoader)
if err != nil {
log.Fatalf("%s: invalid schema: %s\n", *schemaFlag, err)
return schemaError("%s: invalid schema: %s", *schemaFlag, err)
}

// Validate the schema against each doc in parallel, limiting simultaneous
Expand All @@ -131,7 +150,6 @@ func realMain(args []string, w io.Writer) int {
sem <- 0
defer func() { <-sem }()


loader, err := jsonLoader(path)
if err != nil {
msg := fmt.Sprintf("%s: error: load doc: %s", path, err)
Expand Down Expand Up @@ -190,19 +208,62 @@ func jsonLoader(path string) (gojsonschema.JSONLoader, error) {
}
switch filepath.Ext(path) {
case ".yml", ".yaml":
// TODO YAML requires the presence of a BOM to detect UTF-16
// text. Is there a decent heuristic to detect UTF-16 text
// missing a BOM so we can provide a better error message?
buf, err = yaml.YAMLToJSON(buf)
default:
buf, err = jsonDecodeCharset(buf)
}
if err != nil {
return nil, err
}
// TODO What if we have an empty document?
return gojsonschema.NewBytesLoader(buf), nil
}

// jsonDecodeCharset normalizes the character encoding of a JSON document.
//
// A leading UTF-8 or UTF-16 (BE/LE) byte order mark is only accepted when
// the `-b` flag is set, in which case it is stripped; otherwise it is
// reported as an error. UTF-16 input, recognized either by its BOM or by a
// zero byte in one of the first two positions, is run through the matching
// UTF-16 decoder. Everything else is returned unchanged.
func jsonDecodeCharset(buf []byte) ([]byte, error) {
	// Too short to hold a BOM or a UTF-16 code unit: pass through as UTF-8.
	if len(buf) < 2 {
		return buf, nil
	}

	// Known BOMs paired with the encoding they announce; a nil encoding
	// means the payload needs no transcoding once the mark is removed.
	marks := []struct {
		prefix string
		enc    encoding.Encoding
	}{
		{bomUTF8, nil},
		{bomUTF16BE, encUTF16BE},
		{bomUTF16LE, encUTF16LE},
	}
	for _, m := range marks {
		if !bytes.HasPrefix(buf, []byte(m.prefix)) {
			continue
		}
		if !*bomFlag {
			return nil, fmt.Errorf("unexpected BOM, see `-b` flag")
		}
		payload := buf[len(m.prefix):]
		if m.enc == nil {
			return payload, nil
		}
		return m.enc.NewDecoder().Bytes(payload)
	}

	// No BOM: a zero byte in the first or second position is taken as the
	// sign of big or little endian UTF-16 respectively.
	if buf[0] == 0 {
		return encUTF16BE.NewDecoder().Bytes(buf)
	}
	if buf[1] == 0 {
		return encUTF16LE.NewDecoder().Bytes(buf)
	}
	return buf, nil
}

func printUsage() {
fmt.Fprintf(os.Stderr, `Usage: %s -s schema.(json|yml) [options] document.(json|yml) ...
yajsv validates JSON and YAML document(s) against a schema. One of three statuses are
reported per document:
yajsv validates JSON and YAML document(s) against a schema. One of three status
results are reported per document:
pass: Document is valid relative to the schema
fail: Document is invalid relative to the schema
Expand All @@ -212,7 +273,8 @@ func printUsage() {
schema validation failure.
Sets the exit code to 1 on any failures, 2 on any errors, 3 on both, 4 on
invalid usage. Otherwise, 0 is returned if everything passes validation.
invalid usage, 5 on schema definition or file-list errors. Otherwise, 0 is
returned if everything passes validation.
Options:
Expand All @@ -227,6 +289,11 @@ func usageError(msg string) int {
return 4
}

// schemaError prints the formatted message, followed by a newline, to
// standard error and returns 5, the exit code the usage text documents for
// schema definition and file-list errors.
func schemaError(format string, args ...interface{}) int {
	const exitCode = 5

	line := format + "\n"
	fmt.Fprintf(os.Stderr, line, args...)
	return exitCode
}

// glob is a wrapper that also resolves `~` since we may be skipping
// the shell expansion when single-quoting globs at the command line
func glob(pattern string) []string {
Expand Down
Loading

0 comments on commit b2b9a85

Please sign in to comment.