diff --git a/README.md b/README.md index 5516776..fe07c1b 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ build/sicra -h Usage of build/sicra: -add-error - Add URL to sitemap, even if response error (default true) + Add URL to sitemap, even if response error (only for 5xx codes) (default true) -async Run async requests -delay int diff --git a/main.go b/main.go index 5e6dfa6..c19acdf 100644 --- a/main.go +++ b/main.go @@ -11,7 +11,7 @@ import ( ) func main() { - addError := flag.Bool("add-error", true, "Add URL to sitemap, even if response error") + addError := flag.Bool("add-error", true, "Add URL to sitemap, even if response error (only for 5xx codes)") asyncScan := flag.Bool("async", false, "Run async requests") delay := flag.Int64("delay", 0, "Delay between requests in Millisecond") maxDepth := flag.Int("max-depth", 0, "MaxDepth limits the recursion depth of visited URLs.") @@ -49,24 +49,38 @@ func main() { *skipNoIndex, *verbose) - err = sicra.GenerateSiteMap(*outFile, scrape.AddedURLs) - if err != nil { - log.Fatal(err) + p := filepath.Dir(*outFile) + // generate sitemap.xml + if len(scrape.AddedURLs) > 0 { + err = sicra.GenerateSiteMap(*outFile, scrape.AddedURLs) + if err != nil { + log.Fatal(err) + } } - + // generate noindex.txt if *skipNoIndex { - p := filepath.Dir(*outFile) - err = sicra.GenerateNoIndex(p+"/noindex.txt", scrape.NoIndexURLs) + if len(scrape.NoIndexURLs) > 0 { + err = sicra.GenerateTxt(p+"/noindex.txt", scrape.NoIndexURLs) + if err != nil { + log.Fatal(err) + } + } + } + // generate errors.txt + if len(scrape.ErrorURLs) > 0 { + err = sicra.GenerateTxt(p+"/errors.txt", scrape.ErrorURLs) if err != nil { log.Fatal(err) } } - - fmt.Print( - "Request URLs: ", scrape.AllVisitURLsCount, "\n", - "Added URLs ", scrape.AddedURLsCount, "\n", - "No Index URLs ", scrape.NoIndexURLsCount, "\n", - "Response URLs ", scrape.ResponseURLsCount, "\n", - "Error URLs ", scrape.ErrorURLsCount, "\n", - ) + // print stats + if *verbose { + fmt.Print( + 
"Request URLs: ", scrape.AllVisitURLsCount, "\n", + "Added URLs ", scrape.AddedURLsCount, "\n", + "No Index URLs ", scrape.NoIndexURLsCount, "\n", + "Response URLs ", scrape.ResponseURLsCount, "\n", + "Error URLs ", scrape.ErrorURLsCount, "\n", + ) + } } diff --git a/sicra/crawler.go b/sicra/crawler.go index c85d588..94cb3b0 100644 --- a/sicra/crawler.go +++ b/sicra/crawler.go @@ -1,9 +1,11 @@ package sicra import ( + "fmt" "log" "net/url" "regexp" + "strconv" "time" "github.com/gocolly/colly" @@ -13,9 +15,10 @@ type scrapeURL struct { AddedURLs []string AddedURLsCount int AllVisitURLsCount int + ErrorURLs []string ErrorURLsCount int - NoIndexURLsCount int NoIndexURLs []string + NoIndexURLsCount int ResponseURLsCount int } @@ -68,13 +71,19 @@ func Crawler( c.OnError(func(er *colly.Response, err error) { requestURL := urlEscape(er.Request.URL.String()) + r := regexp.MustCompile("^5[0-9]{1,2}$") + statusCode := strconv.Itoa(er.StatusCode) + strErr := fmt.Sprint(err) if verbose { log.Println("Error:", err, requestURL) } if addError { - add(requestURL, verbose, scrapeURLs) + if r.MatchString(statusCode) { + add(requestURL, verbose, scrapeURLs) + } } scrapeURLs.ErrorURLsCount++ + scrapeURLs.ErrorURLs = append(scrapeURLs.ErrorURLs, statusCode+" "+strErr+" "+requestURL) }) c.OnResponse(func(re *colly.Response) { diff --git a/sicra/sitemap.go b/sicra/sitemap.go index 98d1109..a805629 100644 --- a/sicra/sitemap.go +++ b/sicra/sitemap.go @@ -24,7 +24,7 @@ func GenerateSiteMap(fileName string, urls []string) error { for _, loc := range urls { fh.WriteString(" " + "\n") fh.WriteString(" " + "" + loc + "\n") - fh.WriteString(" " + "" + currentTime + "\n") + fh.WriteString(" " + "" + currentTime + "\n") fh.WriteString(" " + "hourly\n") fh.WriteString(" " + "0.5\n") fh.WriteString(" " + "\n") @@ -34,7 +34,8 @@ func GenerateSiteMap(fileName string, urls []string) error { return nil } -func GenerateNoIndex(fileName string, urls []string) error { +// GenerateTxt generate txt file 
for error list url or skipped noindex +func GenerateTxt(fileName string, urls []string) error { err := deleteFileIfExists(fileName) if err != nil { log.Fatal(err) diff --git a/test/error.conf b/test/error.conf new file mode 100644 index 0000000..9050bc7 --- /dev/null +++ b/test/error.conf @@ -0,0 +1,23 @@ +server { + listen 8080; + server_name _; + + location /403 { + return 403; + } + location /404 { + return 404; + } + location /500 { + return 500; + } + location /502 { + return 502; + } + location /444 { + return 444; + } + location /555 { + return 555; + } +}