Skip to content

Commit 81a5439

Browse files
v0.7.1; optimized code, added progress bar
1 parent be5f278 commit 81a5439

File tree

2 files changed

+130
-58
lines changed

2 files changed

+130
-58
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Cyclone's URL Spider
22

3-
![image](https://i.imgur.com/qc6XviF.png)
3+
![image](https://i.imgur.com/Z6RjlUv.png)
44

55
Wordlist & n-gram creation tool that crawls a given URL and creates a wordlist and/or n-grams (depending on the flags given).
66
### Usage Instructions:

spider.go

Lines changed: 129 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -36,23 +36,31 @@ version 0.7.0;
3636
fixed bug when attempting to crawl deeper than available URLs to crawl
3737
fixed crawl depth calculation
3838
optimized code which runs 2.8x faster vs v0.6.x during bench testing
39+
version 0.7.1;
40+
added progress bars to word / ngrams processing & file writing operations
41+
added RAM usage monitoring
42+
optimized order of operations for faster processing with less RAM
43+
TO-DO: refactor code (func main is getting messy)
3944
*/
4045

4146
// clear screen function
4247
func clearScreen() {
48+
var cmd *exec.Cmd
49+
4350
switch runtime.GOOS {
44-
case "linux":
45-
cmd := exec.Command("clear")
46-
cmd.Stdout = os.Stdout
47-
cmd.Run()
48-
case "darwin":
49-
cmd := exec.Command("clear")
50-
cmd.Stdout = os.Stdout
51-
cmd.Run()
51+
case "linux", "darwin":
52+
cmd = exec.Command("clear")
5253
case "windows":
53-
cmd := exec.Command("cmd", "/c", "cls")
54-
cmd.Stdout = os.Stdout
55-
cmd.Run()
54+
cmd = exec.Command("cmd", "/c", "cls")
55+
default:
56+
fmt.Fprintln(os.Stderr, "Unsupported platform")
57+
os.Exit(1)
58+
}
59+
60+
cmd.Stdout = os.Stdout
61+
if err := cmd.Run(); err != nil {
62+
fmt.Fprintf(os.Stderr, "Failed to clear screen: %v\n", err)
63+
os.Exit(1)
5664
}
5765
}
5866

@@ -145,7 +153,7 @@ func crawlAndScrape(u string, depth int, delay int, urlCountChan chan<- int, tex
145153
absoluteLink := joinURL(u, link)
146154
linkDomain, err := getBaseDomain(absoluteLink)
147155
if err != nil {
148-
fmt.Println("Error getting link domain:", err)
156+
fmt.Fprintf(os.Stderr, "Error getting link domain for %s: %v\n", absoluteLink, err)
149157
continue
150158
}
151159
if linkDomain == baseDomain {
@@ -176,25 +184,38 @@ func joinURL(baseURL, relativeURL string) string {
176184
return newURL.String()
177185
}
178186

179-
func generateNgrams(text string, n int) []string {
180-
words := strings.Fields(text)
181-
if len(words) < n {
182-
return nil // return nil if not enough words for the n-gram
187+
func updateProgressBar(action string, total, processed int) {
188+
if total == 0 {
189+
return // avoid division by zero
183190
}
184-
var ngrams []string
185-
for i := 0; i <= len(words)-n; i++ {
186-
ngrams = append(ngrams, strings.Join(words[i:i+n], " "))
191+
percentage := float64(processed) / float64(total) * 100
192+
fmt.Printf("\r%s...\t[", action)
193+
for i := 0; i < int(percentage/5); i++ {
194+
fmt.Print("=")
187195
}
188-
return ngrams
196+
for i := int(percentage / 5); i < 20; i++ {
197+
fmt.Print(" ")
198+
}
199+
fmt.Printf("] %.2f%%", percentage)
189200
}
190201

191-
func uniqueStrings(str string) map[string]bool {
192-
words := strings.Fields(str)
193-
uniqueWords := make(map[string]bool)
194-
for _, word := range words {
195-
uniqueWords[word] = true
202+
func monitorRAMUsage(stopChan chan bool, maxRAMUsage *float64) {
203+
var memStats runtime.MemStats
204+
ticker := time.NewTicker(100 * time.Millisecond)
205+
defer ticker.Stop()
206+
207+
for {
208+
select {
209+
case <-ticker.C:
210+
runtime.ReadMemStats(&memStats)
211+
currentUsage := float64(memStats.Alloc) / 1024 / 1024 / 1024 // GB
212+
if currentUsage > *maxRAMUsage {
213+
*maxRAMUsage = currentUsage
214+
}
215+
case <-stopChan:
216+
return
217+
}
196218
}
197-
return uniqueWords
198219
}
199220

200221
// main function
@@ -218,15 +239,15 @@ func main() {
218239
}
219240

220241
if *versionFlag {
221-
version := "Q3ljbG9uZSdzIFVSTCBTcGlkZXIgdjAuNy4wCg=="
242+
version := "Q3ljbG9uZSdzIFVSTCBTcGlkZXIgdjAuNy4xLWJldGEK"
222243
versionDecoded, _ := base64.StdEncoding.DecodeString(version)
223244
fmt.Fprintln(os.Stderr, string(versionDecoded))
224245
os.Exit(0)
225246
}
226247

227248
if *urlFlag == "" {
228249
fmt.Fprintln(os.Stderr, "Error: -url flag is required")
229-
fmt.Fprintln(os.Stderr, "Try running --help for more information")
250+
fmt.Fprintln(os.Stderr, "Try running -help for more information")
230251
os.Exit(1)
231252
}
232253

@@ -287,7 +308,7 @@ func main() {
287308
fmt.Fprintln(os.Stderr, " ---------------------- ")
288309
fmt.Fprintln(os.Stderr)
289310
fmt.Fprintf(os.Stderr, "Crawling URL:\t%s\n", *urlFlag)
290-
fmt.Fprintf(os.Stderr, "Base Domain:\t%s\n", baseDomain)
311+
fmt.Fprintf(os.Stderr, "Base domain:\t%s\n", baseDomain)
291312
fmt.Fprintf(os.Stderr, "Crawl depth:\t%d\n", *crawlFlag)
292313
fmt.Fprintf(os.Stderr, "ngram len:\t%s\n", *ngramFlag)
293314
fmt.Fprintf(os.Stderr, "Crawl delay:\t%dms (increase this to avoid rate limiting, ex: -delay 100)\n", *delayFlag)
@@ -298,6 +319,11 @@ func main() {
298319
visitedURLs := make(map[string]bool)
299320
doneChan := make(chan struct{})
300321
var wg sync.WaitGroup
322+
stopMonitor := make(chan bool)
323+
var maxRAMUsage float64
324+
325+
// start RAM usage monitor
326+
go monitorRAMUsage(stopMonitor, &maxRAMUsage)
301327

302328
// goroutine to print URLs crawled
303329
wg.Add(1)
@@ -308,49 +334,81 @@ func main() {
308334
for {
309335
select {
310336
case <-ticker.C:
311-
fmt.Fprintf(os.Stderr, "\rURLs Crawled:\t%d", totalCrawled)
337+
fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled)
312338
case count := <-urlCountChan:
313339
totalCrawled += count
314340
case <-doneChan:
315-
fmt.Fprintf(os.Stderr, "\rURLs Crawled:\t%d", totalCrawled) // final update
341+
fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled) // final update
316342
return
317343
}
318344
}
319345
}()
320346

321347
// start crawling process in goroutine
348+
wg.Add(1)
322349
go func() {
350+
defer wg.Done()
323351
crawlAndScrape(*urlFlag, *crawlFlag, *delayFlag, urlCountChan, textsChan, visitedURLs)
324-
close(textsChan) // close channel after crawling is complete
325-
}()
326-
327-
// wait for crawling to complete
328-
go func() {
329-
wg.Wait()
352+
time.Sleep(100 * time.Millisecond)
353+
close(textsChan)
330354
close(doneChan)
355+
fmt.Println()
331356
}()
332357

333-
// process the collected texts and generate n-grams
334-
ngrams := make(map[string]bool)
358+
// initialize maps for unique word and n-gram counting
359+
uniqueWordsMap := make(map[string]bool)
360+
uniqueNgramsMap := make(map[string]bool)
335361

336-
if len(ngramRange) > 1 {
337-
ngramMax, _ = strconv.Atoi(ngramRange[1])
362+
// collect all texts into a slice
363+
var texts []string
364+
for text := range textsChan {
365+
texts = append(texts, text)
338366
}
367+
totalTexts := len(texts)
339368

340-
for text := range textsChan {
341-
for i := ngramMin; i <= ngramMax; i++ {
342-
for _, ngram := range generateNgrams(text, i) {
343-
ngrams[ngram] = true
369+
// set up progress bar ticker
370+
progressTicker := time.NewTicker(100 * time.Millisecond) // update progress every 100ms
371+
defer progressTicker.Stop()
372+
processedTexts := 0
373+
374+
// process texts and generate n-grams
375+
for _, text := range texts {
376+
words := strings.Fields(text)
377+
for _, word := range words {
378+
uniqueWordsMap[word] = true // count unique words
379+
}
380+
381+
for i := 0; i <= len(words)-ngramMin; i++ {
382+
for n := ngramMin; n <= ngramMax && i+n <= len(words); n++ {
383+
ngram := strings.Join(words[i:i+n], " ")
384+
uniqueNgramsMap[ngram] = true // count unique n-grams
344385
}
345386
}
387+
388+
processedTexts++
389+
select {
390+
case <-progressTicker.C:
391+
updateProgressBar("Processing", totalTexts, processedTexts)
392+
default:
393+
// continue without blocking if ticker channel is not ready
394+
}
346395
}
347396

348-
// extract n-grams into a slice
397+
// final update to progress bar output
398+
updateProgressBar("Processing", totalTexts, processedTexts)
399+
400+
// convert unique n-grams map back to a slice for writing to file
349401
var ngramSlice []string
350-
for ngram := range ngrams {
402+
for ngram := range uniqueNgramsMap {
351403
ngramSlice = append(ngramSlice, ngram)
352404
}
353405

406+
// calculated counts
407+
uniqueWords := len(uniqueWordsMap)
408+
uniqueNgrams := len(uniqueNgramsMap)
409+
fmt.Fprintf(os.Stderr, "\nUnique words:\t%d\n", uniqueWords)
410+
fmt.Fprintf(os.Stderr, "Unique ngrams:\t%d\n", uniqueNgrams)
411+
354412
// write unique n-grams to file
355413
file, err := os.Create(*oFlag)
356414
if err != nil {
@@ -360,29 +418,43 @@ func main() {
360418
defer file.Close()
361419

362420
writer := bufio.NewWriterSize(file, 1*1024*1024) // 1MB buffer for better write performance
363-
for _, ngram := range ngramSlice {
421+
totalNgrams := len(ngramSlice)
422+
423+
// progress update interval
424+
progressUpdateInterval := totalNgrams / 100
425+
if progressUpdateInterval == 0 {
426+
progressUpdateInterval = 1
427+
}
428+
429+
var memStats runtime.MemStats
430+
runtime.ReadMemStats(&memStats)
431+
432+
for i, ngram := range ngramSlice {
364433
_, err := writer.WriteString(ngram + "\n")
365434
if err != nil {
366435
fmt.Println("Error writing to buffer:", err)
367436
return
368437
}
438+
if i%progressUpdateInterval == 0 {
439+
updateProgressBar("Writing", totalNgrams, i+1) // update write progress bar
440+
}
369441
}
442+
370443
err = writer.Flush()
371444
if err != nil {
372445
fmt.Println("Error flushing buffer to file:", err)
373446
return
374447
}
448+
updateProgressBar("Writing", totalNgrams, totalNgrams) // final update to write progress bar
375449

376-
// calculate unique words and n-grams
377-
uniqueWords := len(uniqueStrings(strings.Join(ngramSlice, " ")))
378-
uniqueNgrams := len(ngramSlice)
450+
// stop RAM monitoring
451+
stopMonitor <- true
379452

380453
// print statistics
381-
runtime := time.Since(start)
382-
fmt.Fprintf(os.Stderr, "\nUnique words:\t%d\n", uniqueWords)
383-
fmt.Fprintf(os.Stderr, "Unique ngrams:\t%d\n", uniqueNgrams)
384-
fmt.Fprintf(os.Stderr, "Saved to:\t%s\n", *oFlag)
385-
fmt.Fprintf(os.Stderr, "Runtime:\t%.3fs\n", runtime.Seconds())
454+
fmt.Fprintf(os.Stderr, "\nOutput file:\t%s\n", *oFlag)
455+
fmt.Fprintf(os.Stderr, "RAM used:\t%.2f GB\n", maxRAMUsage)
456+
runTime := time.Since(start)
457+
fmt.Fprintf(os.Stderr, "Runtime:\t%.3fs\n", runTime.Seconds())
386458
}
387459

388-
// end code
460+
// end code

0 commit comments

Comments
 (0)