forked from UniCourt/DataEngineering-Workshop1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_scraping_sample.py
31 lines (27 loc) · 937 Bytes
/
web_scraping_sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import requests
from bs4 import BeautifulSoup
import re
import psycopg2
# Scrape the question/answer panes from lipsum.com and store them in the
# `qn_ans` table of the LipsumGenerator PostgreSQL database.

# Create connection to database
# NOTE(review): host and credentials are hard-coded; consider reading them
# from the environment or a config file before using this outside a workshop.
conn = psycopg2.connect(
    host="postgres_service",
    database="LipsumGenerator",
    user="postgres",
    password="admin")
try:
    cursor = conn.cursor()

    # A timeout keeps the script from hanging forever on a stalled connection,
    # and raise_for_status() stops us from parsing an HTTP error page.
    res = requests.get('https://www.lipsum.com/', timeout=30)
    res.raise_for_status()

    # If this line causes an error, run 'pip install html5lib'.
    soup = BeautifulSoup(res.content, 'html5lib')
    data = soup.find(re.compile(r'div'), attrs={'id': "Panes"})
    if data is None:
        raise RuntimeError("Could not find the element with id 'Panes' on the page")
    print(data.find("lorem"))

    question_list = []
    answer_list = []
    for row in data.find_all("div"):
        # A <div> without an <h2> heading has no question to record; skip it
        # instead of failing with AttributeError on `None.text`.
        if row.h2 is None:
            continue
        question_list.append(row.h2.text)
        # Each paragraph of the answer is prefixed with a newline.
        answer_list.append("".join("\n" + para.text for para in row.find_all("p")))

    # The original script created/truncated this file but never wrote to it or
    # closed it. Keep the side effect, but release the handle immediately.
    with open("qn_ans_ans", "w"):
        pass

    for question, answer in zip(question_list, answer_list):
        cursor.execute("insert into qn_ans values(%s,%s)", (question, answer))

    # psycopg2 opens a transaction implicitly; without an explicit commit the
    # inserted rows are discarded when the connection goes away.
    conn.commit()
finally:
    # Closing the connection also rolls back any uncommitted work on failure.
    conn.close()