Skip to content

Commit

Permalink
Added autodetect charset for URL
Browse files Browse the repository at this point in the history
And added -dont-detect-charset option
  • Loading branch information
msoap committed Apr 25, 2016
1 parent 03fa6c7 commit ad2141e
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 12 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ import (
func main() {
doc := html2data.FromURL("http://example.com")
// or with config
// doc := html2data.FromURL("http://example.com", html2data.URLCfg{UA: "userAgent", TimeOut: 10})
// doc := html2data.FromURL("http://example.com", html2data.URLCfg{UA: "userAgent", TimeOut: 10, DontDetectCharset: false})
if doc.Err != nil {
log.Fatal(doc.Err)
}
Expand Down Expand Up @@ -105,6 +105,7 @@ Command line utility
* `-find-in="outer.css.selector"` -- search in the specified elements instead document
* `-json` -- get result as JSON
* `-dont-trim-spaces` -- get text as is
* `-dont-detect-charset` -- don't detect the charset or convert text to UTF-8
* `-timeout=10` -- setting timeout when loading the URL

### Install
Expand Down
4 changes: 3 additions & 1 deletion cmd/html2data/html2data.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ type cmdConfig struct {
getJSON bool
dontTrimSpaces bool
timeOut int
dontDetectCharset bool
}

var (
Expand All @@ -33,6 +34,7 @@ func init() {
flag.StringVar(&config.outerCSS, "find-in", "", "search in the specified elements instead document")
flag.BoolVar(&config.getJSON, "json", false, "JSON output")
flag.BoolVar(&config.dontTrimSpaces, "dont-trim-spaces", false, "dont trim spaces, get text as is")
flag.BoolVar(&config.dontDetectCharset, "dont-detect-charset", false, "dont detect charset and convert text")
flag.IntVar(&config.timeOut, "timeout", 0, "timeout in seconds")
}

Expand Down Expand Up @@ -75,7 +77,7 @@ func runApp() error {
reader := bufio.NewReader(os.Stdin)
doc = html2data.FromReader(reader)
} else if strings.HasPrefix(config.url, "http://") || strings.HasPrefix(config.url, "https://") {
doc = html2data.FromURL(config.url, html2data.URLCfg{UA: config.userAgent, TimeOut: config.timeOut})
doc = html2data.FromURL(config.url, html2data.URLCfg{UA: config.userAgent, TimeOut: config.timeOut, DontDetectCharset: config.dontDetectCharset})
} else if len(config.url) > 0 {
doc = html2data.FromFile(config.url)
} else {
Expand Down
34 changes: 25 additions & 9 deletions html2data.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import (
"time"

"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html/charset"
)

// docOrSelection - for exec .Find
Expand Down Expand Up @@ -263,33 +264,35 @@ func FromFile(fileName string) Doc {

// URLCfg - config for FromURL()
type URLCfg struct {
UA string // custom user-agent
TimeOut int // timeout in seconds
UA string // custom user-agent
TimeOut int // timeout in seconds
DontDetectCharset bool // dont autoconvert to UTF8
}

// FromURL - get doc from URL
//
// FromURL("https://url")
// FromURL("https://url", URLCfg{UA: "Custom UA 1.0", TimeOut: 10})
func FromURL(URL string, config ...URLCfg) Doc {
ua, timeout := "", 0
ua, timeout, dontDetectCharset := "", 0, false
if len(config) == 1 {
ua = config[0].UA
timeout = config[0].TimeOut
dontDetectCharset = config[0].DontDetectCharset
} else if len(config) > 1 {
panic("FromURL(): only one config argument allowed")
}

httpResponse, err := getHTMLPage(URL, ua, timeout)
htmlReader, err := getHTMLPage(URL, ua, timeout, dontDetectCharset)
if err != nil {
return Doc{Err: err}
}

return FromReader(httpResponse.Body)
return FromReader(htmlReader)
}

// getHTMLPage - get html by http(s) as io.Reader (charset-converted using the Content-Type header unless dontDetectCharset is set)
func getHTMLPage(url string, ua string, timeout int) (response *http.Response, err error) {
func getHTMLPage(url string, ua string, timeout int, dontDetectCharset bool) (htmlReader io.Reader, err error) {
cookie, _ := cookiejar.New(nil)
client := &http.Client{
Jar: cookie,
Expand All @@ -298,13 +301,26 @@ func getHTMLPage(url string, ua string, timeout int) (response *http.Response, e

request, err := http.NewRequest("GET", url, nil)
if err != nil {
return response, err
return htmlReader, err
}

if ua != "" {
request.Header.Set("User-Agent", ua)
}

response, err = client.Do(request)
return response, err
response, err := client.Do(request)
if err != nil {
return htmlReader, err
}

if contentType := response.Header.Get("Content-Type"); contentType != "" && !dontDetectCharset {
htmlReader, err = charset.NewReader(response.Body, contentType)
if err != nil {
return htmlReader, err
}
} else {
return response.Body, nil
}

return htmlReader, nil
}
12 changes: 11 additions & 1 deletion html2data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ func Test_FromURL(t *testing.T) {

// UA test
ts = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Fprintln(w, "<div>"+r.UserAgent()+"</div>")
fmt.Fprintln(w, "<div>"+r.UserAgent()+"</div><span id=2>Тест</span>")
}))

customUA := "CustomUA/1.0"
Expand All @@ -524,6 +524,16 @@ func Test_FromURL(t *testing.T) {
if err != nil || div != customUA {
t.Errorf("User-agent test failed, div: '%s'", div)
}

doc = FromURL(ts.URL, URLCfg{DontDetectCharset: true})
if doc.Err != nil {
t.Errorf("Dont load url, error: %s", doc.Err)
}
span, err := doc.GetDataSingle("span#2")
if err != nil || span != "Тест" {
t.Errorf("DontDetectCharset failed, span: '%s'", div)
}
ts.Close()
}

func Test_FromFile(t *testing.T) {
Expand Down

0 comments on commit ad2141e

Please sign in to comment.