From ad2141e20fa89d0acd89fd55ffd595cda8bcabf6 Mon Sep 17 00:00:00 2001 From: Sergey Mudrik Date: Mon, 25 Apr 2016 23:08:37 +0300 Subject: [PATCH] Added autodetect charset for URL And added -dont-detect-charset option --- README.md | 3 ++- cmd/html2data/html2data.go | 4 +++- html2data.go | 34 +++++++++++++++++++++++++--------- html2data_test.go | 12 +++++++++++- 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index c93723c..78f4f8d 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ import ( func main() { doc := html2data.FromURL("http://example.com") // or with config - // doc := html2data.FromURL("http://example.com", html2data.URLCfg{UA: "userAgent", TimeOut: 10}) + // doc := html2data.FromURL("http://example.com", html2data.URLCfg{UA: "userAgent", TimeOut: 10, DontDetectCharset: false}) if doc.Err != nil { log.Fatal(doc.Err) } @@ -105,6 +105,7 @@ Command line utility * `-find-in="outer.css.selector"` -- search in the specified elements instead document * `-json` -- get result as JSON * `-dont-trim-spaces` -- get text as is + * `-dont-detect-charset` -- do not detect the charset or convert the text to UTF-8 * `-timeout=10` -- setting timeout when loading the URL ### Install diff --git a/cmd/html2data/html2data.go b/cmd/html2data/html2data.go index 18d4869..87ac3e4 100644 --- a/cmd/html2data/html2data.go +++ b/cmd/html2data/html2data.go @@ -22,6 +22,7 @@ type cmdConfig struct { getJSON bool dontTrimSpaces bool timeOut int + dontDetectCharset bool } var ( @@ -33,6 +34,7 @@ func init() { flag.StringVar(&config.outerCSS, "find-in", "", "search in the specified elements instead document") flag.BoolVar(&config.getJSON, "json", false, "JSON output") flag.BoolVar(&config.dontTrimSpaces, "dont-trim-spaces", false, "dont trim spaces, get text as is") + flag.BoolVar(&config.dontDetectCharset, "dont-detect-charset", false, "dont detect charset and convert text") flag.IntVar(&config.timeOut, "timeout", 0, "timeout in seconds") } @@ -75,7 +77,7 @@ func 
runApp() error { reader := bufio.NewReader(os.Stdin) doc = html2data.FromReader(reader) } else if strings.HasPrefix(config.url, "http://") || strings.HasPrefix(config.url, "https://") { - doc = html2data.FromURL(config.url, html2data.URLCfg{UA: config.userAgent, TimeOut: config.timeOut}) + doc = html2data.FromURL(config.url, html2data.URLCfg{UA: config.userAgent, TimeOut: config.timeOut, DontDetectCharset: config.dontDetectCharset}) } else if len(config.url) > 0 { doc = html2data.FromFile(config.url) } else { diff --git a/html2data.go b/html2data.go index d54eb99..5ddc87c 100644 --- a/html2data.go +++ b/html2data.go @@ -38,6 +38,7 @@ import ( "time" "github.com/PuerkitoBio/goquery" + "golang.org/x/net/html/charset" ) // docOrSelection - for exec .Find @@ -263,8 +264,9 @@ func FromFile(fileName string) Doc { // URLCfg - config for FromURL() type URLCfg struct { - UA string // custom user-agent - TimeOut int // timeout in seconds + UA string // custom user-agent + TimeOut int // timeout in seconds + DontDetectCharset bool // dont autoconvert to UTF8 } // FromURL - get doc from URL @@ -272,24 +274,25 @@ type URLCfg struct { // FromURL("https://url") // FromURL("https://url", URLCfg{UA: "Custom UA 1.0", TimeOut: 10}) func FromURL(URL string, config ...URLCfg) Doc { - ua, timeout := "", 0 + ua, timeout, dontDetectCharset := "", 0, false if len(config) == 1 { ua = config[0].UA timeout = config[0].TimeOut + dontDetectCharset = config[0].DontDetectCharset } else if len(config) > 1 { panic("FromURL(): only one config argument allowed") } - httpResponse, err := getHTMLPage(URL, ua, timeout) + htmlReader, err := getHTMLPage(URL, ua, timeout, dontDetectCharset) if err != nil { return Doc{Err: err} } - return FromReader(httpResponse.Body) + return FromReader(htmlReader) } // getHTMLPage - get html by http(s) as http.Response -func getHTMLPage(url string, ua string, timeout int) (response *http.Response, err error) { +func getHTMLPage(url string, ua string, timeout int, 
dontDetectCharset bool) (htmlReader io.Reader, err error) { cookie, _ := cookiejar.New(nil) client := &http.Client{ Jar: cookie, @@ -298,13 +301,26 @@ func getHTMLPage(url string, ua string, timeout int) (response *http.Response, e request, err := http.NewRequest("GET", url, nil) if err != nil { - return response, err + return htmlReader, err } if ua != "" { request.Header.Set("User-Agent", ua) } - response, err = client.Do(request) - return response, err + response, err := client.Do(request) + if err != nil { + return htmlReader, err + } + + if contentType := response.Header.Get("Content-Type"); contentType != "" && !dontDetectCharset { + htmlReader, err = charset.NewReader(response.Body, contentType) + if err != nil { + return htmlReader, err + } + } else { + return response.Body, nil + } + + return htmlReader, nil } diff --git a/html2data_test.go b/html2data_test.go index 57c2f2b..341e8c7 100644 --- a/html2data_test.go +++ b/html2data_test.go @@ -512,7 +512,7 @@ func Test_FromURL(t *testing.T) { // UA test ts = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - fmt.Fprintln(w, "
"+r.UserAgent()+"
") + fmt.Fprintln(w, "
"+r.UserAgent()+"
Тест") })) customUA := "CustomUA/1.0" @@ -524,6 +524,16 @@ func Test_FromURL(t *testing.T) { if err != nil || div != customUA { t.Errorf("User-agent test failed, div: '%s'", div) } + + doc = FromURL(ts.URL, URLCfg{DontDetectCharset: true}) + if doc.Err != nil { + t.Errorf("Dont load url, error: %s", doc.Err) + } + span, err := doc.GetDataSingle("span#2") + if err != nil || span != "Тест" { + t.Errorf("DontDetectCharset failed, span: '%s'", div) + } + ts.Close() } func Test_FromFile(t *testing.T) {