@@ -36,23 +36,31 @@ version 0.7.0;
36
36
fixed bug when attempting to crawl deeper than available URLs to crawl
37
37
fixed crawl depth calculation
38
38
optimized code which runs 2.8x faster vs v0.6.x during bench testing
39
+ version 0.7.1;
40
+ added progress bars to word / ngrams processing & file writing operations
41
+ added RAM usage monitoring
42
+ optimized order of operations for faster processing with less RAM
43
+ TO-DO: refactor code (func main is getting messy)
39
44
*/
40
45
41
46
// clearScreen clears the terminal by running the platform's native command:
// "clear" on Linux and macOS, "cmd /c cls" on Windows. The command's output
// is forwarded to os.Stdout so the terminal receives it.
//
// Clearing the screen is purely cosmetic, so a failure must never abort the
// program: an unrecognised platform or a failing command (for example because
// the command cannot be found) is reported on stderr and the function simply
// returns, leaving the screen as it was.
func clearScreen() {
	var cmd *exec.Cmd

	switch runtime.GOOS {
	case "linux", "darwin":
		cmd = exec.Command("clear")
	case "windows":
		cmd = exec.Command("cmd", "/c", "cls")
	default:
		// No known clear command for this OS; carry on without clearing.
		fmt.Fprintln(os.Stderr, "Unsupported platform")
		return
	}

	cmd.Stdout = os.Stdout
	if err := cmd.Run(); err != nil {
		// Report the problem but keep running; the caller's work does not
		// depend on the screen having been cleared.
		fmt.Fprintf(os.Stderr, "Failed to clear screen: %v\n", err)
	}
}
58
66
@@ -145,7 +153,7 @@ func crawlAndScrape(u string, depth int, delay int, urlCountChan chan<- int, tex
145
153
absoluteLink := joinURL (u , link )
146
154
linkDomain , err := getBaseDomain (absoluteLink )
147
155
if err != nil {
148
- fmt .Println ( "Error getting link domain:" , err )
156
+ fmt .Fprintf ( os . Stderr , "Error getting link domain for %s: %v \n " , absoluteLink , err )
149
157
continue
150
158
}
151
159
if linkDomain == baseDomain {
@@ -176,25 +184,38 @@ func joinURL(baseURL, relativeURL string) string {
176
184
return newURL .String ()
177
185
}
178
186
179
- func generateNgrams (text string , n int ) []string {
180
- words := strings .Fields (text )
181
- if len (words ) < n {
182
- return nil // return nil if not enough words for the n-gram
187
// updateProgressBar redraws a single-line, 20-cell text progress bar for the
// named action, e.g.
//
//	Writing...	[==========          ] 50.00%
//
// action is the label printed before the bar, total is the number of work
// items and processed is how many of them are done. The line starts with a
// carriage return and has no trailing newline, so successive calls overwrite
// each other; the caller prints the final newline.
//
// The bar is written to stderr, like the rest of the program's status output,
// so it never mixes with data on stdout. Nothing is printed when total is not
// positive (which also avoids a division by zero), and the displayed progress
// is clamped to the range 0-100% so a miscounted processed value cannot
// overflow the bar.
func updateProgressBar(action string, total, processed int) {
	if total <= 0 {
		return // nothing to measure progress against
	}

	const (
		barWidth       = 20
		percentPerCell = 100.0 / barWidth
	)

	percentage := float64(processed) / float64(total) * 100
	if percentage < 0 {
		percentage = 0
	} else if percentage > 100 {
		percentage = 100
	}

	filled := int(percentage / percentPerCell)

	// Emit the whole line in one write so a refresh cannot be interleaved
	// with other output halfway through the bar.
	fmt.Fprintf(os.Stderr, "\r%s...\t[%s%s] %.2f%%",
		action,
		strings.Repeat("=", filled),
		strings.Repeat(" ", barWidth-filled),
		percentage,
	)
}
190
201
191
- func uniqueStrings (str string ) map [string ]bool {
192
- words := strings .Fields (str )
193
- uniqueWords := make (map [string ]bool )
194
- for _ , word := range words {
195
- uniqueWords [word ] = true
202
// monitorRAMUsage tracks peak heap usage until it is told to stop. It is
// meant to run in its own goroutine.
//
// It samples runtime.MemStats.Alloc once immediately and then every 100ms,
// converts the value to GiB (bytes / 1024^3) and stores it in *maxRAMUsage
// whenever it exceeds the value already there. Note that Alloc counts bytes
// of allocated heap objects, not the process's total memory footprint.
//
// The function returns as soon as a value is received on stopChan (or the
// channel is closed). It deliberately does NOT write to *maxRAMUsage after
// that receive: the caller may read the result right after its send on
// stopChan completes, and a write here after the receive would race with
// that read.
func monitorRAMUsage(stopChan chan bool, maxRAMUsage *float64) {
	const bytesPerGB = 1024 * 1024 * 1024

	var memStats runtime.MemStats
	sample := func() {
		runtime.ReadMemStats(&memStats)
		if currentUsage := float64(memStats.Alloc) / bytesPerGB; currentUsage > *maxRAMUsage {
			*maxRAMUsage = currentUsage
		}
	}

	// Take one sample up front so runs shorter than a tick interval still
	// report a real figure instead of 0.
	sample()

	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			sample()
		case <-stopChan:
			return
		}
	}
}
199
220
200
221
// main function
@@ -218,15 +239,15 @@ func main() {
218
239
}
219
240
220
241
if * versionFlag {
221
- version := "Q3ljbG9uZSdzIFVSTCBTcGlkZXIgdjAuNy4wCg== "
242
+ version := "Q3ljbG9uZSdzIFVSTCBTcGlkZXIgdjAuNy4xLWJldGEK "
222
243
versionDecoded , _ := base64 .StdEncoding .DecodeString (version )
223
244
fmt .Fprintln (os .Stderr , string (versionDecoded ))
224
245
os .Exit (0 )
225
246
}
226
247
227
248
if * urlFlag == "" {
228
249
fmt .Fprintln (os .Stderr , "Error: -url flag is required" )
229
- fmt .Fprintln (os .Stderr , "Try running -- help for more information" )
250
+ fmt .Fprintln (os .Stderr , "Try running -help for more information" )
230
251
os .Exit (1 )
231
252
}
232
253
@@ -287,7 +308,7 @@ func main() {
287
308
fmt .Fprintln (os .Stderr , " ---------------------- " )
288
309
fmt .Fprintln (os .Stderr )
289
310
fmt .Fprintf (os .Stderr , "Crawling URL:\t %s\n " , * urlFlag )
290
- fmt .Fprintf (os .Stderr , "Base Domain :\t %s\n " , baseDomain )
311
+ fmt .Fprintf (os .Stderr , "Base domain :\t %s\n " , baseDomain )
291
312
fmt .Fprintf (os .Stderr , "Crawl depth:\t %d\n " , * crawlFlag )
292
313
fmt .Fprintf (os .Stderr , "ngram len:\t %s\n " , * ngramFlag )
293
314
fmt .Fprintf (os .Stderr , "Crawl delay:\t %dms (increase this to avoid rate limiting, ex: -delay 100)\n " , * delayFlag )
@@ -298,6 +319,11 @@ func main() {
298
319
visitedURLs := make (map [string ]bool )
299
320
doneChan := make (chan struct {})
300
321
var wg sync.WaitGroup
322
+ stopMonitor := make (chan bool )
323
+ var maxRAMUsage float64
324
+
325
+ // start RAM usage monitor
326
+ go monitorRAMUsage (stopMonitor , & maxRAMUsage )
301
327
302
328
// goroutine to print URLs crawled
303
329
wg .Add (1 )
@@ -308,49 +334,81 @@ func main() {
308
334
for {
309
335
select {
310
336
case <- ticker .C :
311
- fmt .Fprintf (os .Stderr , "\r URLs Crawled :\t %d" , totalCrawled )
337
+ fmt .Fprintf (os .Stderr , "\r URLs crawled :\t %d" , totalCrawled )
312
338
case count := <- urlCountChan :
313
339
totalCrawled += count
314
340
case <- doneChan :
315
- fmt .Fprintf (os .Stderr , "\r URLs Crawled :\t %d" , totalCrawled ) // final update
341
+ fmt .Fprintf (os .Stderr , "\r URLs crawled :\t %d" , totalCrawled ) // final update
316
342
return
317
343
}
318
344
}
319
345
}()
320
346
321
347
// start crawling process in goroutine
348
+ wg .Add (1 )
322
349
go func () {
350
+ defer wg .Done ()
323
351
crawlAndScrape (* urlFlag , * crawlFlag , * delayFlag , urlCountChan , textsChan , visitedURLs )
324
- close (textsChan ) // close channel after crawling is complete
325
- }()
326
-
327
- // wait for crawling to complete
328
- go func () {
329
- wg .Wait ()
352
+ time .Sleep (100 * time .Millisecond )
353
+ close (textsChan )
330
354
close (doneChan )
355
+ fmt .Println ()
331
356
}()
332
357
333
- // process the collected texts and generate n-grams
334
- ngrams := make (map [string ]bool )
358
+ // initialize maps for unique word and n-gram counting
359
+ uniqueWordsMap := make (map [string ]bool )
360
+ uniqueNgramsMap := make (map [string ]bool )
335
361
336
- if len (ngramRange ) > 1 {
337
- ngramMax , _ = strconv .Atoi (ngramRange [1 ])
362
+ // collect all texts into a slice
363
+ var texts []string
364
+ for text := range textsChan {
365
+ texts = append (texts , text )
338
366
}
367
+ totalTexts := len (texts )
339
368
340
- for text := range textsChan {
341
- for i := ngramMin ; i <= ngramMax ; i ++ {
342
- for _ , ngram := range generateNgrams (text , i ) {
343
- ngrams [ngram ] = true
369
+ // set up progress bar ticker
370
+ progressTicker := time .NewTicker (100 * time .Millisecond ) // update progress every 100ms
371
+ defer progressTicker .Stop ()
372
+ processedTexts := 0
373
+
374
+ // process texts and generate n-grams
375
+ for _ , text := range texts {
376
+ words := strings .Fields (text )
377
+ for _ , word := range words {
378
+ uniqueWordsMap [word ] = true // count unique words
379
+ }
380
+
381
+ for i := 0 ; i <= len (words )- ngramMin ; i ++ {
382
+ for n := ngramMin ; n <= ngramMax && i + n <= len (words ); n ++ {
383
+ ngram := strings .Join (words [i :i + n ], " " )
384
+ uniqueNgramsMap [ngram ] = true // count unique n-grams
344
385
}
345
386
}
387
+
388
+ processedTexts ++
389
+ select {
390
+ case <- progressTicker .C :
391
+ updateProgressBar ("Processing" , totalTexts , processedTexts )
392
+ default :
393
+ // continue without blocking if ticker channel is not ready
394
+ }
346
395
}
347
396
348
- // extract n-grams into a slice
397
+ // final update to progress bar output
398
+ updateProgressBar ("Processing" , totalTexts , processedTexts )
399
+
400
+ // convert unique n-grams map back to a slice for writing to file
349
401
var ngramSlice []string
350
- for ngram := range ngrams {
402
+ for ngram := range uniqueNgramsMap {
351
403
ngramSlice = append (ngramSlice , ngram )
352
404
}
353
405
406
+ // calculated counts
407
+ uniqueWords := len (uniqueWordsMap )
408
+ uniqueNgrams := len (uniqueNgramsMap )
409
+ fmt .Fprintf (os .Stderr , "\n Unique words:\t %d\n " , uniqueWords )
410
+ fmt .Fprintf (os .Stderr , "Unique ngrams:\t %d\n " , uniqueNgrams )
411
+
354
412
// write unique n-grams to file
355
413
file , err := os .Create (* oFlag )
356
414
if err != nil {
@@ -360,29 +418,43 @@ func main() {
360
418
defer file .Close ()
361
419
362
420
writer := bufio .NewWriterSize (file , 1 * 1024 * 1024 ) // 1MB buffer for better write performance
363
- for _ , ngram := range ngramSlice {
421
+ totalNgrams := len (ngramSlice )
422
+
423
+ // progress update interval
424
+ progressUpdateInterval := totalNgrams / 100
425
+ if progressUpdateInterval == 0 {
426
+ progressUpdateInterval = 1
427
+ }
428
+
429
+ var memStats runtime.MemStats
430
+ runtime .ReadMemStats (& memStats )
431
+
432
+ for i , ngram := range ngramSlice {
364
433
_ , err := writer .WriteString (ngram + "\n " )
365
434
if err != nil {
366
435
fmt .Println ("Error writing to buffer:" , err )
367
436
return
368
437
}
438
+ if i % progressUpdateInterval == 0 {
439
+ updateProgressBar ("Writing" , totalNgrams , i + 1 ) // update write progress bar
440
+ }
369
441
}
442
+
370
443
err = writer .Flush ()
371
444
if err != nil {
372
445
fmt .Println ("Error flushing buffer to file:" , err )
373
446
return
374
447
}
448
+ updateProgressBar ("Writing" , totalNgrams , totalNgrams ) // final update to write progress bar
375
449
376
- // calculate unique words and n-grams
377
- uniqueWords := len (uniqueStrings (strings .Join (ngramSlice , " " )))
378
- uniqueNgrams := len (ngramSlice )
450
+ // stop RAM monitoring
451
+ stopMonitor <- true
379
452
380
453
// print statistics
381
- runtime := time .Since (start )
382
- fmt .Fprintf (os .Stderr , "\n Unique words:\t %d\n " , uniqueWords )
383
- fmt .Fprintf (os .Stderr , "Unique ngrams:\t %d\n " , uniqueNgrams )
384
- fmt .Fprintf (os .Stderr , "Saved to:\t %s\n " , * oFlag )
385
- fmt .Fprintf (os .Stderr , "Runtime:\t %.3fs\n " , runtime .Seconds ())
454
+ fmt .Fprintf (os .Stderr , "\n Output file:\t %s\n " , * oFlag )
455
+ fmt .Fprintf (os .Stderr , "RAM used:\t %.2f GB\n " , maxRAMUsage )
456
+ runTime := time .Since (start )
457
+ fmt .Fprintf (os .Stderr , "Runtime:\t %.3fs\n " , runTime .Seconds ())
386
458
}
387
459
388
- // end code
460
+ // end code
0 commit comments