@@ -332,7 +332,7 @@ async def launch_headless_explorer(
                     continue
             else:
                 try:
-                    response = await crawler.async_send(request)
+                    response = await crawler.async_send(request, timeout=crawler.timeout.connect)
                 except httpx.RequestError as exception:
                     logging.error(f"{request} generated an exception: {exception.__class__.__name__}")
                     continue
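The new argument forwards the crawler's connect timeout to async_send(). As a rough illustration of what that value is, here is a minimal standalone httpx sketch; it only uses the public httpx API, and the URL and helper name are made up for the example:

    # Minimal sketch: build an httpx.Timeout and reuse its connect component per request.
    import asyncio
    import httpx

    async def fetch_with_connect_timeout(url: str) -> int:
        # 10s overall budget, but give up after 5s if the TCP/TLS handshake stalls.
        timeout = httpx.Timeout(10.0, connect=5.0)
        async with httpx.AsyncClient(timeout=timeout) as client:
            # timeout.connect is the kind of value the diff passes as async_send(..., timeout=...).
            response = await client.get(url, timeout=timeout.connect)
            return response.status_code

    if __name__ == "__main__":
        print(asyncio.run(fetch_with_connect_timeout("https://example.com")))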
@@ -389,13 +389,52 @@ def __init__(
         self._final_cookies = None
         self._cookies = cookies or CookieJar()
         self._wait_time = wait_time
+        self._headless_task = None
+
+    async def process_requests(self, excluded_requests, exclusion_regexes):
+        while True:
+            try:
+                request, response = self._queue.get_nowait()
+            except asyncio.QueueEmpty:
+                await asyncio.sleep(.1)
+            except KeyboardInterrupt:
+                break
+            else:
+                self._queue.task_done()
+
+                # Scope check and deduplication are made here
+                if not self._scope.check(request) or request in self._processed_requests:
+                    continue
+
+                # Check for exclusion here because we don't have full control over the headless browser
+                if request in excluded_requests or any(regex.match(request.url) for regex in exclusion_regexes):
+                    continue
+
+                dir_name = request.dir_name
+                if self._max_files_per_dir and self._file_counts[dir_name] >= self._max_files_per_dir:
+                    continue
+
+                self._file_counts[dir_name] += 1
+
+                if self.has_too_many_parameters(request):
+                    continue
+
+                if self._qs_limit and request.parameters_count:
+                    self._pattern_counts[request.pattern] += 1
+
+                yield request, response
+                self._processed_requests.append(request)
+                log_verbose(f"[+] {request}")
+
+            if self._stopped.is_set():
+                break
 
     async def async_explore(
             self,
             to_explore: Deque[Request],
             excluded_urls: list = None
     ) -> AsyncIterator[Tuple[Request, Response]]:
-        queue = asyncio.Queue()
+        self._queue = asyncio.Queue()
 
         exclusion_regexes = []
         excluded_requests = []
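The new process_requests() generator drains the shared queue with get_nowait(), sleeping briefly when it is empty so the other tasks (the mitm proxy, the headless browser) keep running on the event loop. A stripped-down sketch of that same polling producer/consumer pattern, independent of Wapiti (all names below are invented for the example):

    import asyncio

    async def producer(queue: asyncio.Queue) -> None:
        for item in range(3):
            await queue.put(item)
            await asyncio.sleep(0.05)

    async def consumer(queue: asyncio.Queue, stopped: asyncio.Event) -> None:
        while True:
            try:
                item = queue.get_nowait()
            except asyncio.QueueEmpty:
                # Nothing queued yet: yield control instead of blocking on queue.get()
                await asyncio.sleep(0.1)
            else:
                queue.task_done()
                print("processed", item)

            if stopped.is_set() and queue.empty():
                break

    async def main() -> None:
        queue: asyncio.Queue = asyncio.Queue()
        stopped = asyncio.Event()
        consumer_task = asyncio.create_task(consumer(queue, stopped))
        await producer(queue)
        await queue.join()  # returns once every item received task_done()
        stopped.set()
        await consumer_task

    if __name__ == "__main__":
        asyncio.run(main())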
@@ -408,10 +447,10 @@ async def async_explore(
             excluded_requests.append(bad_request)
 
         # Launch proxy as asyncio task
-        mitm_task = asyncio.create_task(
+        self._mitm_task = asyncio.create_task(
             launch_proxy(
                 self._mitm_port,
-                queue,
+                self._queue,
                 self._crawler.headers,
                 self._cookies,
                 self._scope,
@@ -420,12 +459,12 @@ async def async_explore(
             )
         )
 
-        headless_task = None
+
         if self._headless == "no":
             # No headless crawler, just intercepting mode so no starting URLs
             to_explore.clear()
         else:
-            headless_task = asyncio.create_task(
+            self._headless_task = asyncio.create_task(
                 launch_headless_explorer(
                     self._stopped,
                     self._crawler,
@@ -440,52 +479,23 @@ async def async_explore(
             )
         )
 
-        while True:
-            try:
-                request, response = queue.get_nowait()
-            except asyncio.QueueEmpty:
-                await asyncio.sleep(.1)
-            except KeyboardInterrupt:
-                break
-            else:
-                queue.task_done()
-
-                # Scope check and deduplication are made here
-                if not self._scope.check(request) or request in self._processed_requests:
-                    continue
-
-                # Check for exclusion here because we don't have full control over the headless browser
-                if request in excluded_requests or any(regex.match(request.url) for regex in exclusion_regexes):
-                    continue
-
-                dir_name = request.dir_name
-                if self._max_files_per_dir and self._file_counts[dir_name] >= self._max_files_per_dir:
-                    continue
-
-                self._file_counts[dir_name] += 1
-
-                if self.has_too_many_parameters(request):
-                    continue
-
-                if self._qs_limit and request.parameters_count:
-                    self._pattern_counts[request.pattern] += 1
-
-                yield request, response
-                self._processed_requests.append(request)
-                log_verbose(f"[+] {request}")
-
+        async for request, response in self.process_requests(excluded_requests, exclusion_regexes):
+            yield request, response
             if self._stopped.is_set():
                 break
 
-        await queue.join()
+    async def clean(self):
+        if not self._queue.empty():
+            await self._queue.join()
+
         # The headless crawler must stop when the stop event is set, let's just wait for it
-        if headless_task:
-            await headless_task
+        if self._headless_task:
+            await self._headless_task
 
         # We are canceling the mitm proxy, but we could have used a special request to shut down the master too.
         # https://docs.mitmproxy.org/stable/addons-examples/#shutdown
-        mitm_task.cancel()
-        self._final_cookies = await mitm_task
+        self._mitm_task.cancel()
+        self._final_cookies = await self._mitm_task
         await self._crawler.close()
 
     @property
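With the queue, proxy task and headless task now stored on the instance, cleanup moves out of async_explore() into the new clean() coroutine, so a caller is presumably expected to drain the generator and then call clean(). A hedged sketch of that calling sequence; the explorer instance, its constructor and the driver function are assumptions, not part of this diff:

    # Hypothetical driver, assuming an explorer object exposing async_explore() and clean()
    # as introduced by this change.
    import asyncio
    from collections import deque

    async def run_exploration(explorer, start_requests):
        to_explore = deque(start_requests)
        try:
            async for request, response in explorer.async_explore(to_explore):
                print(f"crawled {request}")
        finally:
            # Cleanup no longer happens inside async_explore(): wait for the queue,
            # the headless task and the mitmproxy task, then close the crawler.
            await explorer.clean()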