From 0096e0ed1d1c178d917e75d3a4974c75f78d8272 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Mon, 27 Sep 2021 12:59:35 -0400 Subject: [PATCH 1/5] Refactor code for testing and fix issues with depth scraping --- api/handlers.go | 18 ++--- api/handlers_test.go | 6 +- linktree/linktree.go | 174 +++++++++++++++++++++++++++---------------- main.go | 19 +++-- 4 files changed, 124 insertions(+), 93 deletions(-) diff --git a/api/handlers.go b/api/handlers.go index 30f1a44..911fb0b 100644 --- a/api/handlers.go +++ b/api/handlers.go @@ -31,8 +31,7 @@ func GetTreeNode(client *http.Client) func(w http.ResponseWriter, r *http.Reques link := queryMap.Get("link") log.Printf("processing link %s at a depth of %d\n", link, depth) - manager := linktree.NewNodeManager(client) - node := manager.LoadNode(link, depth) + node := linktree.BuildTree(client, link, depth) log.Printf("Tree built for %s at depth %d\n", node.URL, depth) err = json.NewEncoder(w).Encode(node) if err != nil { @@ -64,20 +63,13 @@ func isEmailValid(e string) bool { // gets any email addresses on the url passed func getEmails(client *http.Client, link string) []string { - manager := linktree.NewNodeManager(client) - linkChan := manager.StreamUrls(link, func(childLink string) bool { + links := []string{} + linktree.Crawl(client, link, 1, func(childLink string) { linkPieces := strings.Split(childLink, "mailto:") - if len(linkPieces) > 1 { - return isEmailValid(linkPieces[1]) + if len(linkPieces) > 1 && isEmailValid(linkPieces[1]) { + links = append(links, linkPieces[1]) } - return false }) - links := []string{} - for childLink := range linkChan { - linkPieces := strings.Split(childLink, "mailto:") - links = append(links, linkPieces[1]) - - } return links } diff --git a/api/handlers_test.go b/api/handlers_test.go index c64b0b0..d874061 100644 --- a/api/handlers_test.go +++ b/api/handlers_test.go @@ -101,8 +101,7 @@ func TestGetTree(t *testing.T) { httpmock.RegisterResponder("GET", rootLink, httpmock.NewStringResponder(200, page)) - manager := linktree.NewNodeManager(http.DefaultClient) - node := manager.LoadNode(rootLink, 1) + node := linktree.BuildTree(http.DefaultClient, rootLink, 1) httpmock.DeactivateAndReset() assertNode(t, node, rootLink, 1) @@ -121,8 +120,7 @@ func TestGetTree(t *testing.T) { httpmock.RegisterResponder("GET", rootLink, httpmock.NewStringResponder(200, page)) - manager = linktree.NewNodeManager(http.DefaultClient) - node = manager.LoadNode(rootLink, 2) + node = linktree.BuildTree(http.DefaultClient, rootLink, 2) httpmock.DeactivateAndReset() assertNode(t, node, rootLink, 1) diff --git a/linktree/linktree.go b/linktree/linktree.go index 1f79d60..34f66ab 100644 --- a/linktree/linktree.go +++ b/linktree/linktree.go @@ -13,7 +13,7 @@ import ( // Node represents a single URL type Node struct { - manager *NodeManager + client *http.Client URL string `json:"url"` StatusCode int `json:"status_code"` Status string `json:"status"` @@ -33,7 +33,7 @@ func (n *Node) PrintTree() { // UpdateStatus updates the status of the URL func (n *Node) updateStatus() { - resp, err := n.manager.client.Get(n.URL) + resp, err := n.client.Get(n.URL) if err != nil { n.Status = "UNKNOWN" n.StatusCode = http.StatusInternalServerError @@ -43,16 +43,6 @@ func (n *Node) updateStatus() { n.StatusCode = resp.StatusCode } -// NewNode returns a new Link object -func NewNode(manager *NodeManager, URL string) *Node { - n := &Node{ - URL: URL, - manager: manager, - } - n.updateStatus() - return n -} - // NodeManager ... type NodeManager struct { client *http.Client @@ -66,12 +56,22 @@ func isValidURL(URL string) bool { return false } +// NewNode returns a new Link object +func NewNode(client *http.Client, URL string) *Node { + n := &Node{ + URL: URL, + client: client, + } + n.updateStatus() + return n +} + // StreamUrls the child nodes of a link using a custom validator -func (m *NodeManager) StreamUrls(link string, validator func(link string) bool) chan string { - linkChan := make(chan string, 100) +func streamTokens(client *http.Client, page string) chan html.Token { + tokenStream := make(chan html.Token, 100) go func() { - defer close(linkChan) - resp, err := m.client.Get(link) + defer close(tokenStream) + resp, err := client.Get(page) if err != nil { log.Println(err) return @@ -88,77 +88,119 @@ func (m *NodeManager) StreamUrls(link string, validator func(link string) bool) return case html.StartTagToken: token := tokenizer.Token() - if token.Data == "a" { - for _, attr := range token.Attr { - if attr.Key == "href" { - if validator(attr.Val) { - linkChan <- attr.Val - } - } - } - } + tokenStream <- token } } }() - return linkChan + return tokenStream } -// LoadNode ... -func (m *NodeManager) LoadNode(root string, depth int) *Node { - node := NewNode(m, root) - rootChan := m.StreamUrls(root, isValidURL) - m.buildTree(rootChan, depth, node) - m.wg.Wait() - return node -} +func filterTokens(tokenStream chan html.Token, filter *TokenFilter) chan string { + filterStream := make(chan string) -// streams the status of the links from the channel until the depth has reached 0 -func (m *NodeManager) crawl(linkChan <-chan string, depth int, doWork func(link string)) { - for link := range linkChan { - go func(l string) { - defer m.wg.Done() - doWork(l) - if depth > 1 { - depth-- - subLinkChan := m.StreamUrls(l, isValidURL) - m.crawl(subLinkChan, depth, doWork) + filterAttributes := func(token html.Token) { + // check if token passes filter + for _, attr := range token.Attr { + if _, foundAttribute := filter.attributes[attr.Key]; foundAttribute { + filterStream <- attr.Val } - }(link) - m.wg.Add(1) + } } + + go func() { + defer close(filterStream) + for token := range tokenStream { + if len(filter.tags) == 0 { + filterStream <- token.Data + } + + // check if token passes tag filter or tag filter is empty + if _, foundTag := filter.tags[token.Data]; foundTag { + // emit attributes if there is a filter, otherwise emit token + if len(filter.attributes) > 0 { + filterAttributes(token) + } else { + filterStream <- token.Data + } + } + } + }() + + return filterStream +} + +type TokenFilter struct { + tags map[string]bool + attributes map[string]bool } // builds a tree from the given link channel -func (m *NodeManager) buildTree(linkChan <-chan string, depth int, node *Node) { - for link := range linkChan { - go func(l string, node *Node) { - defer m.wg.Done() +func buildTree(parent *Node, depth int, childLinks chan string, wg *sync.WaitGroup) { + for link := range childLinks { + go func(parent *Node, link string, depth int) { + defer wg.Done() // Do not add the link as it's own child - if node.URL != l { - n := NewNode(m, l) - node.Children = append(node.Children, n) + if parent.URL != link { + n := NewNode(parent.client, link) + parent.Children = append(parent.Children, n) if depth > 1 { depth-- - subLinkChan := m.StreamUrls(l, isValidURL) - m.buildTree(subLinkChan, depth, n) + tokenStream := streamTokens(n.client, n.URL) + filteredStream := filterTokens(tokenStream, &TokenFilter{ + tags: map[string]bool{"a": true}, + attributes: map[string]bool{"href": true}, + }) + buildTree(n, depth, filteredStream, wg) } } - }(link, node) - m.wg.Add(1) + }(parent, link, depth) + wg.Add(1) } } -// NewNodeManager ... -func NewNodeManager(client *http.Client) *NodeManager { - return &NodeManager{ - client: client, - wg: new(sync.WaitGroup), +// BuildTree... +func BuildTree(client *http.Client, root string, depth int) *Node { + node := NewNode(client, root) + tokenStream := streamTokens(client, root) + filteredStream := filterTokens(tokenStream, &TokenFilter{ + tags: map[string]bool{"a": true}, + attributes: map[string]bool{"href": true}, + }) + + wg := new(sync.WaitGroup) + buildTree(node, depth, filteredStream, wg) + wg.Wait() + return node +} + +// streams the status of the links from the channel until the depth has reached 0 +func crawl(client *http.Client, wg *sync.WaitGroup, linkChan <-chan string, depth int, doWork func(link string)) { + for link := range linkChan { + go func(currentLink string, currentDepth int) { + defer wg.Done() + doWork(currentLink) + if currentDepth > 1 { + currentDepth-- + tokenStream := streamTokens(client, currentLink) + filteredStream := filterTokens(tokenStream, &TokenFilter{ + tags: map[string]bool{"a": true}, + attributes: map[string]bool{"href": true}, + }) + crawl(client, wg, filteredStream, currentDepth, doWork) + } + }(link, depth) + wg.Add(1) } } // Crawl ... -func (m *NodeManager) Crawl(root string, depth int, work func(link string)) { - rootChan := m.StreamUrls(root, isValidURL) - m.crawl(rootChan, depth, work) - m.wg.Wait() +func Crawl(client *http.Client, root string, depth int, work func(link string)) { + tokenStream := streamTokens(client, root) + filteredStream := filterTokens(tokenStream, &TokenFilter{ + tags: map[string]bool{"a": true}, + attributes: map[string]bool{"href": true}, + }) + wg := new(sync.WaitGroup) + crawl(client, wg, filteredStream, depth, work) + wg.Wait() } diff --git a/main.go b/main.go index ab8e445..21296f7 100644 --- a/main.go +++ b/main.go @@ -38,9 +38,9 @@ func newTorClient(host, port string) (*http.Client, error) { }, nil } -func writeTerminal(manager *linktree.NodeManager, root string, depth int) { +func writeTerminal(client *http.Client, root string, depth int) { printStatus := func(link string) { - n := linktree.NewNode(manager, link) + n := linktree.NewNode(client, link) markError := ansi.ColorFunc("red") markSuccess := ansi.ColorFunc("green") if n.StatusCode != 200 { @@ -49,10 +49,10 @@ func writeTerminal(manager *linktree.NodeManager, root string, depth int) { fmt.Printf("Link: %20s Status: %d %s\n", n.URL, n.StatusCode, markSuccess(n.Status)) } } - manager.Crawl(root, depth, printStatus) + linktree.Crawl(client, root, depth, printStatus) } -func writeExcel(manager *linktree.NodeManager, root string, depth int) { +func writeExcel(client *http.Client, root string, depth int) { f := excelize.NewFile() err := f.SetCellStr(f.GetSheetName(0), "A1", "Link") if err != nil { @@ -66,7 +66,7 @@ func writeExcel(manager *linktree.NodeManager, root string, depth int) { } row := 2 addRow := func(link string) { - node := linktree.NewNode(manager, link) + node := linktree.NewNode(client, link) linkCell := fmt.Sprintf("A%d", row) statusCell := fmt.Sprintf("B%d", row) err = f.SetCellStr(f.GetSheetName(0), linkCell, node.URL) @@ -81,7 +81,7 @@ func writeExcel(manager *linktree.NodeManager, root string, depth int) { } row++ } - manager.Crawl(root, depth, addRow) + linktree.Crawl(client, root, depth, addRow) u, err := url.Parse(root) if err != nil { log.Fatal(err) @@ -154,14 +154,13 @@ func main() { return } - manager := linktree.NewNodeManager(client) switch output { case "terminal": - writeTerminal(manager, root, depth) + writeTerminal(client, root, depth) case "excel": - writeExcel(manager, root, depth) + writeExcel(client, root, depth) case "tree": - node := manager.LoadNode(root, depth) + node := linktree.BuildTree(client, root, depth) node.PrintTree() } } From 820bf4d870724fcf88101253058021eda852e088 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Mon, 27 Sep 2021 13:56:09 -0400 Subject: [PATCH 2/5] More refactoring --- api/handlers.go | 10 +++++++--- api/handlers_test.go | 6 ++++-- linktree/linktree.go | 35 ++++++++++++++++------------------- main.go | 27 ++++++++++++++++----------- 4 files changed, 43 insertions(+), 35 deletions(-) diff --git a/api/handlers.go b/api/handlers.go index 911fb0b..a56bbe9 100644 --- a/api/handlers.go +++ b/api/handlers.go @@ -31,7 +31,8 @@ func GetTreeNode(client *http.Client) func(w http.ResponseWriter, r *http.Reques link := queryMap.Get("link") log.Printf("processing link %s at a depth of %d\n", link, depth) - node := linktree.BuildTree(client, link, depth) + node := linktree.NewNode(client, link) + node.Load(depth) log.Printf("Tree built for %s at depth %d\n", node.URL, depth) err = json.NewEncoder(w).Encode(node) if err != nil { @@ -64,12 +65,15 @@ func isEmailValid(e string) bool { // gets any email addresses on the url passed func getEmails(client *http.Client, link string) []string { links := []string{} - linktree.Crawl(client, link, 1, func(childLink string) { + node := linktree.NewNode(client, link) + depth := 1 + collectLinks := func(childLink string) { linkPieces := strings.Split(childLink, "mailto:") if len(linkPieces) > 1 && isEmailValid(linkPieces[1]) { links = append(links, linkPieces[1]) } - }) + } + node.Crawl(depth, collectLinks) return links } diff --git a/api/handlers_test.go b/api/handlers_test.go index d874061..b66f69b 100644 --- a/api/handlers_test.go +++ b/api/handlers_test.go @@ -101,7 +101,8 @@ func TestGetTree(t *testing.T) { httpmock.RegisterResponder("GET", rootLink, httpmock.NewStringResponder(200, page)) - node := linktree.BuildTree(http.DefaultClient, rootLink, 1) + node := linktree.NewNode(http.DefaultClient, rootLink) + node.Load(1) httpmock.DeactivateAndReset() assertNode(t, node, rootLink, 1) @@ -120,7 +121,8 @@ func TestGetTree(t *testing.T) { httpmock.RegisterResponder("GET", rootLink, httpmock.NewStringResponder(200, page)) - node = linktree.BuildTree(http.DefaultClient, rootLink, 2) + node = linktree.NewNode(http.DefaultClient, rootLink) + node.Load(2) httpmock.DeactivateAndReset() assertNode(t, node, rootLink, 1) diff --git a/linktree/linktree.go b/linktree/linktree.go index 34f66ab..eeacf26 100644 --- a/linktree/linktree.go +++ b/linktree/linktree.go @@ -7,17 +7,20 @@ import ( "net/http" "net/url" "sync" + "time" "golang.org/x/net/html" ) // Node represents a single URL type Node struct { - client *http.Client URL string `json:"url"` StatusCode int `json:"status_code"` Status string `json:"status"` Children []*Node `json:"children"` + Client *http.Client + Loaded bool + LastLoaded time.Time } // PrintTree ... @@ -33,7 +36,7 @@ func (n *Node) PrintTree() { // UpdateStatus updates the status of the URL func (n *Node) updateStatus() { - resp, err := n.client.Get(n.URL) + resp, err := n.Client.Get(n.URL) if err != nil { n.Status = "UNKNOWN" n.StatusCode = http.StatusInternalServerError @@ -43,12 +46,6 @@ func (n *Node) updateStatus() { n.StatusCode = resp.StatusCode } -// NodeManager ... -type NodeManager struct { - client *http.Client - wg *sync.WaitGroup -} - func isValidURL(URL string) bool { if u, err := url.ParseRequestURI(URL); err == nil && (u.Scheme == "http" || u.Scheme == "https") { return true @@ -60,7 +57,7 @@ func isValidURL(URL string) bool { func NewNode(client *http.Client, URL string) *Node { n := &Node{ URL: URL, - client: client, + Client: client, } n.updateStatus() return n @@ -141,11 +138,11 @@ func buildTree(parent *Node, depth int, childLinks chan string, wg *sync.WaitGro defer wg.Done() // Do not add the link as it's own child if parent.URL != link { - n := NewNode(parent.client, link) + n := NewNode(parent.Client, link) parent.Children = append(parent.Children, n) if depth > 1 { depth-- - tokenStream := streamTokens(n.client, n.URL) + tokenStream := streamTokens(n.Client, n.URL) filteredStream := filterTokens(tokenStream, &TokenFilter{ tags: map[string]bool{"a": true}, attributes: map[string]bool{"href": true}, @@ -159,18 +156,18 @@ func buildTree(parent *Node, depth int, childLinks chan string, wg *sync.WaitGro } // BuildTree... -func BuildTree(client *http.Client, root string, depth int) *Node { - node := NewNode(client, root) - tokenStream := streamTokens(client, root) +func (n *Node) Load(depth int) { + tokenStream := streamTokens(n.Client, n.URL) filteredStream := filterTokens(tokenStream, &TokenFilter{ tags: map[string]bool{"a": true}, attributes: map[string]bool{"href": true}, }) wg := new(sync.WaitGroup) - buildTree(node, depth, filteredStream, wg) + buildTree(n, depth, filteredStream, wg) wg.Wait() - return node + n.Loaded = true + n.LastLoaded = time.Now().UTC() } // streams the status of the links from the channel until the depth has reached 0 @@ -194,13 +191,13 @@ func crawl(client *http.Client, wg *sync.WaitGroup, linkChan <-chan string, dept } // Crawl ... -func Crawl(client *http.Client, root string, depth int, work func(link string)) { - tokenStream := streamTokens(client, root) +func (n *Node) Crawl(depth int, work func(link string)) { + tokenStream := streamTokens(n.Client, n.URL) filteredStream := filterTokens(tokenStream, &TokenFilter{ tags: map[string]bool{"a": true}, attributes: map[string]bool{"href": true}, }) wg := new(sync.WaitGroup) - crawl(client, wg, filteredStream, depth, work) + crawl(n.Client, wg, filteredStream, depth, work) wg.Wait() } diff --git a/main.go b/main.go index 21296f7..6890d17 100644 --- a/main.go +++ b/main.go @@ -38,9 +38,14 @@ func newTorClient(host, port string) (*http.Client, error) { }, nil } -func writeTerminal(client *http.Client, root string, depth int) { +func writeTree(node *linktree.Node, depth int) { + node.Load(depth) + node.PrintTree() +} + +func writeTerminal(node *linktree.Node, depth int) { printStatus := func(link string) { - n := linktree.NewNode(client, link) + n := linktree.NewNode(node.Client, link) markError := ansi.ColorFunc("red") markSuccess := ansi.ColorFunc("green") if n.StatusCode != 200 { @@ -49,10 +54,10 @@ func writeTerminal(client *http.Client, root string, depth int) { fmt.Printf("Link: %20s Status: %d %s\n", n.URL, n.StatusCode, markSuccess(n.Status)) } } - linktree.Crawl(client, root, depth, printStatus) + node.Crawl(depth, printStatus) } -func writeExcel(client *http.Client, root string, depth int) { +func writeExcel(node *linktree.Node, depth int) { f := excelize.NewFile() err := f.SetCellStr(f.GetSheetName(0), "A1", "Link") if err != nil { @@ -66,7 +71,7 @@ func writeExcel(client *http.Client, root string, depth int) { } row := 2 addRow := func(link string) { - node := linktree.NewNode(client, link) + node := linktree.NewNode(node.Client, link) linkCell := fmt.Sprintf("A%d", row) statusCell := fmt.Sprintf("B%d", row) err = f.SetCellStr(f.GetSheetName(0), linkCell, node.URL) @@ -81,8 +86,8 @@ func writeExcel(client *http.Client, root string, depth int) { } row++ } - linktree.Crawl(client, root, depth, addRow) - u, err := url.Parse(root) + node.Crawl(depth, addRow) + u, err := url.Parse(node.URL) if err != nil { log.Fatal(err) return @@ -154,13 +159,13 @@ func main() { return } + node := linktree.NewNode(client, root) switch output { case "terminal": - writeTerminal(client, root, depth) + writeTerminal(node, depth) case "excel": - writeExcel(client, root, depth) + writeExcel(node, depth) case "tree": - node := linktree.BuildTree(client, root, depth) - node.PrintTree() + writeTree(node, depth) } } From ab55c3870c61a6fb181a014de75caa53d4170edf Mon Sep 17 00:00:00 2001 From: Akeem King Date: Mon, 27 Sep 2021 15:51:08 -0400 Subject: [PATCH 3/5] Add validator for URLs and more documentation --- linktree/linktree.go | 57 +++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/linktree/linktree.go b/linktree/linktree.go index eeacf26..071ace9 100644 --- a/linktree/linktree.go +++ b/linktree/linktree.go @@ -63,12 +63,12 @@ func NewNode(client *http.Client, URL string) *Node { return n } -// StreamUrls the child nodes of a link using a custom validator -func streamTokens(client *http.Client, page string) chan html.Token { +// streams start tag tokens found within HTML content at the given link +func streamTokens(client *http.Client, link string) chan html.Token { tokenStream := make(chan html.Token, 100) go func() { defer close(tokenStream) - resp, err := client.Get(page) + resp, err := client.Get(link) if err != nil { log.Println(err) return @@ -92,6 +92,7 @@ func streamTokens(client *http.Client, page string) chan html.Token { return tokenStream } +// filters tokens from the stream that do not pass the given tokenfilter func filterTokens(tokenStream chan html.Token, filter *TokenFilter) chan string { filterStream := make(chan string) @@ -126,36 +127,44 @@ func filterTokens(tokenStream chan html.Token, filter *TokenFilter) chan string return filterStream } +// TokenFilter determines which tokens will be filtered from a stream, +// 1. There are zero to many attributes per tag. +// if the tag is included then those tags will be used (e.g. all anchor tags) +// if the attribute is included then those attributes will be used (e.g. all href attributes) +// if both are specified then the combination will be used (e.g. all href attributes within anchor tags only) +// if neither is specified then all tokens will be used (e.g. all tags found) type TokenFilter struct { tags map[string]bool attributes map[string]bool } -// builds a tree from the given link channel +// builds a tree for the parent node using the incoming links as children (repeated until depth has been exhausted) func buildTree(parent *Node, depth int, childLinks chan string, wg *sync.WaitGroup) { for link := range childLinks { - go func(parent *Node, link string, depth int) { - defer wg.Done() - // Do not add the link as it's own child - if parent.URL != link { - n := NewNode(parent.Client, link) - parent.Children = append(parent.Children, n) - if depth > 1 { - depth-- - tokenStream := streamTokens(n.Client, n.URL) - filteredStream := filterTokens(tokenStream, &TokenFilter{ - tags: map[string]bool{"a": true}, - attributes: map[string]bool{"href": true}, - }) - buildTree(n, depth, filteredStream, wg) + if isValidURL(link) { + wg.Add(1) + go func(parent *Node, link string, depth int) { + defer wg.Done() + // Do not add the link as it's own child + if parent.URL != link { + n := NewNode(parent.Client, link) + parent.Children = append(parent.Children, n) + if depth > 1 { + depth-- + tokenStream := streamTokens(n.Client, n.URL) + filteredStream := filterTokens(tokenStream, &TokenFilter{ + tags: map[string]bool{"a": true}, + attributes: map[string]bool{"href": true}, + }) + buildTree(n, depth, filteredStream, wg) + } } - } - }(parent, link, depth) - wg.Add(1) + }(parent, link, depth) + } } } -// BuildTree... +// Load places the tree within memory. func (n *Node) Load(depth int) { tokenStream := streamTokens(n.Client, n.URL) filteredStream := filterTokens(tokenStream, &TokenFilter{ @@ -170,7 +179,7 @@ func (n *Node) Load(depth int) { n.LastLoaded = time.Now().UTC() } -// streams the status of the links from the channel until the depth has reached 0 +// perform work on each token stream until the deapth has been reached func crawl(client *http.Client, wg *sync.WaitGroup, linkChan <-chan string, depth int, doWork func(link string)) { for link := range linkChan { go func(currentLink string, currentDepth int) { @@ -190,7 +199,7 @@ func crawl(client *http.Client, wg *sync.WaitGroup, linkChan <-chan string, dept } } -// Crawl ... +// Crawl traverses the children of a node without storing it in memory func (n *Node) Crawl(depth int, work func(link string)) { tokenStream := streamTokens(n.Client, n.URL) filteredStream := filterTokens(tokenStream, &TokenFilter{ From 6fa41c833de787567caaa49bcad66222f79a5fc8 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Tue, 28 Sep 2021 09:36:23 -0400 Subject: [PATCH 4/5] Pass token filter as argument --- linktree/linktree.go | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/linktree/linktree.go b/linktree/linktree.go index 071ace9..b7a26e8 100644 --- a/linktree/linktree.go +++ b/linktree/linktree.go @@ -139,7 +139,7 @@ type TokenFilter struct { } // builds a tree for the parent node using the incoming links as children (repeated until depth has been exhausted) -func buildTree(parent *Node, depth int, childLinks chan string, wg *sync.WaitGroup) { +func buildTree(parent *Node, depth int, childLinks chan string, wg *sync.WaitGroup, filter *TokenFilter) { for link := range childLinks { if isValidURL(link) { wg.Add(1) @@ -152,11 +152,8 @@ func buildTree(parent *Node, depth int, childLinks chan string, wg *sync.WaitGro if depth > 1 { depth-- tokenStream := streamTokens(n.Client, n.URL) - filteredStream := filterTokens(tokenStream, &TokenFilter{ - tags: map[string]bool{"a": true}, - attributes: map[string]bool{"href": true}, - }) - buildTree(n, depth, filteredStream, wg) + filteredStream := filterTokens(tokenStream, filter) + buildTree(n, depth, filteredStream, wg, filter) } } }(parent, link, depth) @@ -167,20 +164,20 @@ func buildTree(parent *Node, depth int, childLinks chan string, wg *sync.WaitGro // Load places the tree within memory. func (n *Node) Load(depth int) { tokenStream := streamTokens(n.Client, n.URL) - filteredStream := filterTokens(tokenStream, &TokenFilter{ + filter := &TokenFilter{ tags: map[string]bool{"a": true}, attributes: map[string]bool{"href": true}, - }) - + } + filteredStream := filterTokens(tokenStream, filter) wg := new(sync.WaitGroup) - buildTree(n, depth, filteredStream, wg) + buildTree(n, depth, filteredStream, wg, filter) wg.Wait() n.Loaded = true n.LastLoaded = time.Now().UTC() } // perform work on each token stream until the deapth has been reached -func crawl(client *http.Client, wg *sync.WaitGroup, linkChan <-chan string, depth int, doWork func(link string)) { +func crawl(client *http.Client, wg *sync.WaitGroup, linkChan <-chan string, depth int, filter *TokenFilter, doWork func(link string)) { for link := range linkChan { go func(currentLink string, currentDepth int) { defer wg.Done() @@ -188,11 +185,8 @@ func crawl(client *http.Client, wg *sync.WaitGroup, linkChan <-chan string, dept if currentDepth > 1 { currentDepth-- tokenStream := streamTokens(client, currentLink) - filteredStream := filterTokens(tokenStream, &TokenFilter{ - tags: map[string]bool{"a": true}, - attributes: map[string]bool{"href": true}, - }) - crawl(client, wg, filteredStream, currentDepth, doWork) + filteredStream := filterTokens(tokenStream, filter) + crawl(client, wg, filteredStream, currentDepth, filter, doWork) } }(link, depth) wg.Add(1) @@ -202,11 +196,12 @@ func crawl(client *http.Client, wg *sync.WaitGroup, linkChan <-chan string, dept // Crawl traverses the children of a node without storing it in memory func (n *Node) Crawl(depth int, work func(link string)) { tokenStream := streamTokens(n.Client, n.URL) - filteredStream := filterTokens(tokenStream, &TokenFilter{ + filter := &TokenFilter{ tags: map[string]bool{"a": true}, attributes: map[string]bool{"href": true}, - }) + } + filteredStream := filterTokens(tokenStream, filter) wg := new(sync.WaitGroup) - crawl(n.Client, wg, filteredStream, depth, work) + crawl(n.Client, wg, filteredStream, depth, filter, work) wg.Wait() } From 04b348114c7bc78160c7fb8a831b75110013f80c Mon Sep 17 00:00:00 2001 From: Akeem King Date: Tue, 28 Sep 2021 11:12:58 -0400 Subject: [PATCH 5/5] Add linktree unit tests --- linktree/linktree_test.go | 78 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 linktree/linktree_test.go diff --git a/linktree/linktree_test.go b/linktree/linktree_test.go new file mode 100644 index 0000000..f416601 --- /dev/null +++ b/linktree/linktree_test.go @@ -0,0 +1,78 @@ +package linktree + +import ( + "fmt" + "net/http" + "testing" + + "github.com/jarcoal/httpmock" + "github.com/stretchr/testify/assert" +) + +func newPage(title, body string) string { + baseHTML := ` + + %s + %s + ` + return fmt.Sprintf(baseHTML, title, body) +} + +func TestNewNode(t *testing.T) { + httpmock.Activate() + + link := "https://www.random.com" + n := NewNode(http.DefaultClient, link) + assert.Equal(t, n.URL, link) + assert.Equal(t, n.Status, "UNKNOWN") + assert.Equal(t, n.StatusCode, http.StatusInternalServerError) + + page := newPage("Random", "") + httpmock.RegisterResponder(http.MethodGet, link, + httpmock.NewStringResponder(http.StatusOK, page)) + + n.updateStatus() + assert.Equal(t, n.URL, link) + assert.Equal(t, n.Status, http.StatusText(http.StatusOK)) + assert.Equal(t, n.StatusCode, http.StatusOK) + + httpmock.DeactivateAndReset() +} + +func TestLoadNode(t *testing.T) { + httpmock.Activate() + + link := "https://www.test.com" + n := NewNode(http.DefaultClient, link) + n.Load(1) + assert.True(t, n.Loaded) + + page := newPage("test", `link to child`) + httpmock.RegisterResponder(http.MethodGet, link, + httpmock.NewStringResponder(http.StatusOK, page)) + + n = NewNode(http.DefaultClient, link) + n.Load(1) + assert.True(t, n.Loaded) + assert.Len(t, n.Children, 1) + + httpmock.DeactivateAndReset() +} + +func TestCrawlNode(t *testing.T) { + httpmock.Activate() + + link := "https://www.test.com" + page := newPage("test", `link to child`) + httpmock.RegisterResponder(http.MethodGet, link, + httpmock.NewStringResponder(http.StatusOK, page)) + + n := NewNode(http.DefaultClient, link) + n.Crawl(1, func(link string) { + assert.Equal(t, link, "https://www.child1.com") + }) + + assert.Len(t, n.Children, 0) // nothing should be stored in memory + + httpmock.DeactivateAndReset() +}