Skip to content

Commit 1b58447

Browse files
committed
Parent Tags added for check
1 parent ebe3cb3 commit 1b58447

File tree

1 file changed

+13
-3
lines changed

1 file changed

+13
-3
lines changed

main.py

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
from urllib import request
22
from bs4 import BeautifulSoup
33
from multiprocessing import Process
4+
from collections import deque
45
import json
56
import os
67
import sys
@@ -140,9 +141,18 @@ def getNext(self, content):
140141
# if 'href' in line:
141142
# # print(line)
142143
# continue
144+
ret = deque([])
145+
retP = deque([])
143146
for l in soup.find_all(href=True):
144147
if self.nextRe.match(str(l)) and ( 'class' in str(l) or 'id' in str(l)):
145-
return str(l["href"])
148+
ret.appendleft(str(l["href"]))
149+
elif self.nextRe.match(str(l.parent)) and ( 'class' in str(l.parent) or 'id' in str(l.parent)):
150+
retP.appendleft(str(l["href"]))
151+
if ret:
152+
return ret[0]
153+
if retP:
154+
return ret[0]
155+
print(ret, retP)
146156
return False
147157
except Exception as E:
148158
print(E)
@@ -193,7 +203,7 @@ def util(self, URL, iterations = 1, filename = 'out'):
193203
print('.')
194204

195205
nextURL = self.getNext(content)
196-
print('.')
206+
print(nextURL)
197207
if not nextURL:
198208
self.util(URL, iterations = 0, filename = filename)
199209
else:
@@ -208,4 +218,4 @@ def main(self):
208218
argsList = sys.argv
209219
# TPExtractor(argsList).main()
210220
# Generic(argsList).main()
211-
# getPDF('wer')
221+
# getPDF('out')

0 commit comments

Comments
 (0)