-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
83 lines (67 loc) · 2.52 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from flask import Flask, request, jsonify
from bs4 import BeautifulSoup
import requests
from flask_pymongo import PyMongo
import os
from dotenv import load_dotenv
from flask_cors import CORS
from bson import ObjectId
# Load variables from a local .env file (when one exists) into the process
# environment so that os.getenv() below can see them.
load_dotenv()

# SECURITY NOTE(review): this fallback embeds a database username and password
# in source control. It is kept only so deployments that do not define the
# URI environment variable keep working; the credentials should be rotated
# and this literal removed once every environment sets URI.
_FALLBACK_MONGO_URI = "mongodb+srv://kiran123:kiran123@cluster0.7nxcr9a.mongodb.net/flaskdb?retryWrites=true&w=majority"

# Prefer the connection string supplied through the URI environment variable;
# an unset or empty value falls back to the legacy hard-coded URI.
mongoURI = os.getenv("URI") or _FALLBACK_MONGO_URI

app = Flask(__name__)
# Enable cross-origin requests for the whole app using flask_cors defaults.
CORS(app)

# Configure MongoDB using Flask-PyMongo
app.config['MONGO_URI'] = mongoURI
mongo = PyMongo(app)
# Collection the scraping routes read from and write to.
collection = mongo.db.scraped_data
# POST
@app.route('/scrapeurl', methods=['POST'])
def scrape_wikipedia():
    """Scrape a web page and store its links and paragraph text in MongoDB.

    Expects a JSON request body of the form ``{"url": "<page url>"}``. The
    page is downloaded, every ``<a href>`` value starting with ``https`` and
    the text of every ``<p>`` element are extracted, and one document holding
    ``url``, ``scraped_urls`` and ``paradata`` is inserted into the
    module-level ``collection``.

    Returns:
        A JSON response: ``{"message": ..., "id": "<inserted id>"}`` on
        success, or ``{"error": "<description>"}`` on any failure.

    NOTE(review): failures are still reported with HTTP 200 and an ``error``
    key, exactly as before, so existing clients keep working; consider moving
    to 4xx/5xx status codes once clients are confirmed to handle them.
    """
    try:
        # silent=True returns None instead of raising when the body is
        # missing, has the wrong content type, or is not valid JSON.
        payload = request.get_json(silent=True)
        url = payload.get('url') if isinstance(payload, dict) else None
        if not isinstance(url, str) or not url.startswith(('http://', 'https://')):
            return jsonify({"error": "Request body must be JSON with an http(s) 'url' field."})

        # NOTE(review): the URL is caller-controlled, so this endpoint can be
        # pointed at internal hosts (SSRF). Only the scheme is checked here;
        # add a host allow-list if the service is exposed publicly.
        # The timeout stops a slow or unresponsive host from hanging the
        # request handler indefinitely.
        response = requests.get(url, timeout=10)
        # Do not scrape and store upstream error pages (4xx/5xx).
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract all URLs from the page starting with "https"
        scraped_urls = [
            a['href']
            for a in soup.find_all('a', href=True)
            if a['href'].startswith('https')
        ]

        # Extract the text of every paragraph on the page
        paradata = [p.get_text() for p in soup.find_all('p')]

        # Store the scraped data in MongoDB
        inserted_data = collection.insert_one(
            {"url": url, "scraped_urls": scraped_urls, "paradata": paradata}
        )
        return jsonify({
            "message": "Scraping and storing complete!",
            "id": str(inserted_data.inserted_id),
        })
    except Exception as e:
        # Top-level request boundary: record the traceback server-side, then
        # report the failure in the same JSON shape clients already expect.
        app.logger.exception("Scraping failed")
        return jsonify({"error": str(e)})
# GET by url
@app.route('/getbyurl', methods=['GET'])
def get_scraped_urls_single():
    """Return the stored ``scraped_urls`` list for one previously scraped URL.

    The page URL is read from the ``url`` query-string parameter and matched
    against the ``url`` field of documents in the module-level ``collection``.

    Returns:
        A JSON response: ``{"scraped_urls": [...]}`` when a document matches,
        ``{"message": ...}`` when none does, or ``{"error": ...}`` if the
        lookup raises.
    """
    try:
        record = collection.find_one({"url": request.args.get('url')})
        if not record:
            return jsonify({"message": "No scraped URLs found for the specified URL."})
        # A matching document without the field yields an empty list.
        return jsonify({"scraped_urls": record.get("scraped_urls", [])})
    except Exception as exc:
        return jsonify({"error": str(exc)})
# Get All
@app.route('/getalldata', methods=['GET'])
def get_all_data_reverse():
    """Return every stored document, sorted by ``_id`` in descending order.

    Each document's ``_id`` is converted to a string so the result can be
    serialised to JSON.

    Returns:
        A JSON response: ``{"data": [...]}`` when at least one document
        exists, ``{"message": ...}`` when the collection is empty, or
        ``{"error": ...}`` if the query raises.
    """
    try:
        cursor = collection.find().sort("_id", -1)
        # Rebuild each document with its _id replaced by a string form.
        records = [{**doc, "_id": str(doc["_id"])} for doc in cursor]
        if not records:
            return jsonify({"message": "No data found in the collection."})
        return jsonify({"data": records})
    except Exception as exc:
        return jsonify({"error": str(exc)})
@app.route("/")
def hello_world():
    """Serve the landing page for the API.

    Every request also inserts a ``{"b": "Hello World"}`` document into the
    ``flaskdb`` collection before the HTML snippet is returned.

    NOTE(review): the insert looks like a leftover connectivity check; it
    adds a document on each hit of this route -- confirm whether it is
    still wanted.
    """
    greeting_doc = {"b": "Hello World"}
    mongo.db.flaskdb.insert_one(greeting_doc)
    return "<p>Wiki Scraper API</p>"