-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathIndex_creator.py
More file actions
136 lines (115 loc) · 3.38 KB
/
Index_creator.py
File metadata and controls
136 lines (115 loc) · 3.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#Index,graph and rank creator
import urllib2
import operator
import json
def get_next_target(page):
    """Return (url, end_position) for the first '<a href=' anchor in page.

    The URL is the text between the first pair of double quotes after the
    anchor marker. Returns (None, 0) when no anchor is present.
    """
    anchor_pos = page.find('<a href=')
    if anchor_pos == -1:
        return None, 0
    open_quote = page.find('"', anchor_pos)
    close_quote = page.find('"', open_quote + 1)
    return page[open_quote + 1:close_quote], close_quote
def get_all_links(page):
    """Collect every href URL found in page, in order of appearance."""
    found = []
    url, end_pos = get_next_target(page)
    while url:
        found.append(url)
        # Resume scanning just past the closing quote of the last match.
        page = page[end_pos:]
        url, end_pos = get_next_target(page)
    return found
def get_page(link):
    """Fetch the HTML body of link; return '' for mailto links or on error.

    Uses Python 2's urllib2 with a browser-like User-Agent (some sites
    reject the default Python agent). Network/URL failures are treated as
    an empty page so the crawl can continue.
    """
    if link.find('mailto') != -1:
        return ''
    try:
        request = urllib2.Request(
            link,
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'})
        return urllib2.urlopen(request).read()
    except (urllib2.URLError, urllib2.HTTPError, ValueError):
        return ''
def split_string(source):
    """Split source into words, treating any char in the delimiter set
    (",!-.;/?@ #") as a separator. Runs of delimiters yield no empty words.
    """
    delimiters = ",!-.;/?@ #"
    words = []
    current = ''
    for ch in source:
        if ch in delimiters:
            # Delimiter closes the word in progress, if any.
            if current:
                words.append(current)
                current = ''
        else:
            current += ch
    if current:
        words.append(current)
    return words
def lookup(index, keyword):
    """Return the list of URLs stored under keyword, or None if absent."""
    return index.get(keyword)
def add_to_index(index, keyword, url):
    """Record url under keyword in index, skipping duplicate URLs."""
    urls = index.setdefault(keyword, [])
    if url not in urls:
        urls.append(url)
def add_page_to_index(index, url, content):
    """Index every word appearing in content under the given url."""
    for token in split_string(content):
        add_to_index(index, token, url)
def union(p, q):
    """Extend list p in place with the elements of q not already in p,
    preserving q's order. Duplicates within q are added only once.
    """
    p.extend(element for element in q if element not in p)
def compute_ranks(graph):
    """Compute PageRank-style scores for every page in a link graph.

    Args:
        graph: dict mapping each page to the list of pages it links to.

    Returns:
        dict mapping each page to its float rank after a fixed number of
        iterations. An empty graph returns {} (the original code raised
        ZeroDivisionError on `1.0 / npages` in that case).
    """
    damping = 0.8       # probability mass passed along links vs. random jump
    iterations = 10     # fixed-point iterations; plenty for small graphs
    npages = len(graph)
    if npages == 0:
        # Guard: avoid division by zero for an empty crawl.
        return {}
    # Start from a uniform distribution over all pages.
    ranks = {page: 1.0 / npages for page in graph}
    for _ in range(iterations):
        newranks = {}
        for page in graph:
            newrank = (1 - damping) / npages
            # Each page linking here donates an equal share of its rank.
            for node in graph:
                if page in graph[node]:
                    # graph[node] contains `page`, so len(graph[node]) >= 1.
                    newrank += damping * (ranks[node] / len(graph[node]))
            newranks[page] = newrank
        ranks = newranks
    return ranks
def crawl_web(seed, max_depth):
    """Breadth-first crawl starting at seed, down to max_depth link levels.

    Builds a keyword index from every crawled page and writes it to
    'index.txt' as JSON (side effect). Returns the link graph: a dict
    mapping each crawled page URL to its list of outgoing links.

    Note: removed the unused locals (sum_depth, sum_each, i) from the
    original; they were never read.
    """
    tocrawl = [seed]
    crawled = []        # pages already fetched, to avoid revisits
    next_depth = []     # frontier for the next level, deduplicated via union()
    index = {}
    graph = {}
    depth = 0
    while tocrawl and depth <= max_depth:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(next_depth, outlinks)
            crawled.append(page)
        if not tocrawl:
            # Current level exhausted: promote the next level's frontier.
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
    with open('index.txt', 'w') as file:
        file.write(json.dumps(index))
    return graph
# --- Script entry point: crawl Wikipedia, then write rank scores. ---
seed = 'https://en.wikipedia.org/wiki/Main_Page'
# int() is required on Python 3, where input() returns a string (the
# original relied on Python 2's input() evaluating to an int); without it
# `depth <= max_depth` inside crawl_web raises TypeError. Harmless on Py2.
d = int(input("Enter the depth of the index you want to create:"))
graph = crawl_web(seed, d)
ranks = compute_ranks(graph)
with open('ranks.txt', 'w') as file:
    file.write(json.dumps(ranks))