-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwiki_scraper.py
106 lines (89 loc) · 3.76 KB
/
wiki_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import wikipedia
import nltk
from collections import defaultdict
import os
# Make sure the per-user NLTK data directory (~/nltk_data) is on NLTK's
# resource search path before the NLTK submodules below are imported.
nltk_data_dir = os.path.expanduser('~/nltk_data')
# Append only when missing so the search path does not collect duplicates.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)
# Now import NLTK modules
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet
class WikiKnowledgeBase:
    """Fetch Wikipedia articles and reduce them to simple concept data.

    ``knowledge_graph`` and ``concept_embeddings`` are created empty; none of
    the methods in this class write to them.
    """

    # Only the first few links of a page get a short summary attached.
    _MAX_LINKS = 5
    # POS-tag prefixes kept as concepts (noun, verb and adjective tags).
    _CONCEPT_TAG_PREFIXES = ('NN', 'VB', 'JJ')

    def __init__(self):
        self.knowledge_graph = defaultdict(dict)
        self.concept_embeddings = {}

    def fetch_topic(self, topic, depth=2):
        """Look up *topic* on Wikipedia and return the extracted knowledge.

        Args:
            topic: Search phrase or article title.
            depth: Accepted for backward compatibility; currently unused.

        Returns:
            A dict with the keys ``'summary'``, ``'content'``, ``'links'``
            (link title -> two-sentence summary), ``'categories'``,
            ``'title'`` and ``'processed'`` (output of ``_process_text``),
            or ``None`` when no page could be retrieved. Failures are
            reported with ``print`` rather than raised.
        """
        # Titles already attempted (lower-cased). Lets the disambiguation
        # loop below move forward instead of retrying the same title forever.
        tried = set()
        current = topic
        while True:
            try:
                tried.add(current.lower())
                page = self._find_page(current)
                if page is None:
                    return None

                # Extract knowledge
                knowledge = {
                    'summary': page.summary,
                    'content': page.content,
                    'links': self._summarize_links(page),
                    'categories': page.categories,
                    'title': page.title,  # Actual page title, may differ from topic
                }
                # Process text content
                knowledge['processed'] = self._process_text(knowledge['content'])
                print(f"Retrieved article: {page.title}")
                return knowledge
            except wikipedia.exceptions.DisambiguationError as e:
                print(f"Disambiguation for {current}. Trying most relevant option...")
                best_match = self._pick_disambiguation_option(current, e.options, tried)
                if best_match is None:
                    print(f"No relevant disambiguation option found for: {current}")
                    return None
                current = best_match
            except Exception as e:
                print(f"Error fetching topic {current}: {str(e)}")
                return None

    def _find_page(self, topic):
        """Return the best page for *topic*, or ``None`` if none is reachable.

        An exact (case-insensitive) title match among the search results is
        loaded directly, and any error it raises propagates to the caller.
        Otherwise the results are tried in order and the first one that
        loads is returned.
        """
        search_results = wikipedia.search(topic, results=5)
        if not search_results:
            print(f"No results found for: {topic}")
            return None

        wanted = topic.lower()
        for result in search_results:
            if result.lower() == wanted:
                return wikipedia.page(result, auto_suggest=False)

        # No exact match: fall back to the first result that can be loaded.
        for result in search_results:
            try:
                return wikipedia.page(result, auto_suggest=False)
            except Exception:
                # Best effort: a broken or ambiguous result must not stop
                # the remaining candidates from being tried.
                continue

        print(f"Could not access any pages for: {topic}")
        return None

    def _summarize_links(self, page):
        """Return ``{link title: two-sentence summary}`` for the first links.

        Links whose title starts with ``'List of'`` are skipped. A link whose
        summary cannot be fetched is skipped as well, so that one bad link
        does not abort the whole topic or get mistaken for a disambiguation
        of the topic itself.
        """
        summaries = {}
        for link in page.links[:self._MAX_LINKS]:
            if link.startswith('List of'):
                continue
            try:
                # Link titles are exact, so do not let the library
                # substitute a "suggested" title.
                summaries[link] = wikipedia.summary(
                    link, sentences=2, auto_suggest=False
                )
            except Exception as e:
                print(f"Skipping link {link}: {str(e)}")
        return summaries

    @staticmethod
    def _pick_disambiguation_option(topic, options, tried):
        """Return the first option containing *topic* that was not tried yet.

        Args:
            topic: Title that turned out to be ambiguous.
            options: Candidate titles offered by the disambiguation error.
            tried: Lower-cased titles that have already been attempted.

        Returns:
            The chosen option, or ``None`` when nothing suitable is left.
        """
        needle = topic.lower()
        for option in options:
            lowered = option.lower()
            if needle in lowered and lowered not in tried:
                return option
        return None

    def _process_text(self, text):
        """Process text content for learning.

        Tokenizes and POS-tags *text*, keeps every token whose tag starts
        with one of ``_CONCEPT_TAG_PREFIXES`` as a concept, and records each
        such concept under the token that immediately precedes it.

        Returns:
            ``{'concepts': list, 'relationships': defaultdict(list)}``; both
            are empty when *text* is empty or ``None``.
        """
        concepts = []
        relationships = defaultdict(list)

        if text:
            tagged = pos_tag(word_tokenize(text))
            for i, (word, tag) in enumerate(tagged):
                if tag.startswith(self._CONCEPT_TAG_PREFIXES):
                    concepts.append(word)
                    if i > 0:  # The first token has no predecessor
                        relationships[tagged[i - 1][0]].append(word)

        return {
            'concepts': concepts,
            'relationships': relationships
        }