crawlMarket.py
#!/usr/bin/env python
"""
Google Android Market Crawler
For the sake of research.

Args:
    1: database file name
    2 through n: all the category names we want to explore
"""
import sys
import re
import urllib2
import urlparse
import sqlite3 as sqlite
import threading
import logging
from BeautifulSoup import BeautifulSoup

__author__ = "Sergio Bernales"

logging.basicConfig(level=logging.DEBUG)

if len(sys.argv) < 2:
    sys.exit("Not enough arguments!")
else:
    dbfilename = sys.argv[1]
    argLen = len(sys.argv) - 1
    categories = [x.upper() for x in sys.argv[2:]]

#DB connection: create it and/or just open it
connection = sqlite.connect(dbfilename)
cursor = connection.cursor()
#table that will contain all the permissions of an app of a certain category
cursor.execute('CREATE TABLE IF NOT EXISTS app_permissions (id INTEGER PRIMARY KEY, appname VARCHAR(256), category VARCHAR(256), permission VARCHAR(256), url VARCHAR(256))')
#cursor.execute('CREATE TABLE IF NOT EXISTS urls_to_crawl (category VARCHAR(256), url VARCHAR(256))')
connection.commit()
connection.close()
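# After a crawl, the collected data can be inspected with a query along these
# lines (sketch only; the column names come from the CREATE TABLE above):
#   SELECT appname, permission FROM app_permissions WHERE category = 'GAME';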
class MarketCrawler(threading.Thread):
    mainURL = "https://market.android.com"
    topfreeURL = "https://market.android.com/details?id=apps_topselling_free&num=24&cat="
    toppaidURL = "https://market.android.com/details?id=apps_topselling_paid&num=24&cat="
    pageIncrements = 24

    def run(self):
        """
        Entry point for the thread: loop through every category provided
        by the user and crawl it.
        """
        logging.debug("Running new crawler thread")
        for cat in categories:
            print cat
            self.crawlAppsForCategory(cat)
    def crawlAppsForCategory(self, cat):
        pageIndex = 0
        curl = self.topfreeURL + cat + "&start="
        logging.debug("curl: " + curl)
        currentURL = curl + str(pageIndex)
        logging.debug("current URL: " + currentURL)
        while True:
            try:
                request = urllib2.Request(currentURL)
                request.add_header("User-Agent", "PermissionCrawler")
                handle = urllib2.build_opener()
                content = handle.open(request).read()
                soup = BeautifulSoup(content)
                appURLS = self.extractAppUrls(soup)
                extractor = PermissionExtractor(appURLS, cat)
                extractor.start()
                logging.debug("Running thread")
                #self.extractPermissionsIntoDB(appURLS, cat)
                pageIndex += self.pageIncrements
                currentURL = curl + str(pageIndex)
            except urllib2.HTTPError, error:
                if error.code == 404:
                    print >> sys.stderr, "404 ERROR: %s -> %s" % (error, error.url)
                elif error.code == 403:
                    print >> sys.stderr, "403 ERROR (no more app pages for this category): %s -> %s" % (error, error.url)
                else:
                    print >> sys.stderr, "ERROR: %s" % error
                break
            except Exception, e:
                print >> sys.stderr, "ERROR: %s" % e
"""
From the page the lists a page of 24 apps of the particular category,
extract the links to those apps
"""
def extractAppUrls(self, soup):
tags = soup('a')
#to get rid of duplicates since the href get returns links twice
skip = False
appURLS = []
for tag in tags:
href = tag.get("href")
if skip:
skip = False
continue
if href is not None and re.match('/details', href) and not re.search('apps_editors_choice', href):
#print href
appURLS.append(self.mainURL+href)
skip = True
return appURLS
"""
Fetch all the URLS in appURLS and extract the permissions.
Put these permission into the DB
"""
class PermissionExtractor(threading.Thread):
def __init__(self, appURLS, cat):
threading.Thread.__init__(self)
self.sites = appURLS
self.category = cat
logging.debug("Created PermissionExtractor")
def run(self):
self.conn = sqlite.connect(dbfilename)
self.curs = self.conn.cursor()
#we can put this URL stuff into its own object /code repetition
for site in self.sites:
request = urllib2.Request(site)
request.add_header("User-Agent", "PyCrawler")
handle = urllib2.build_opener()
content = handle.open(request).read()
soup = BeautifulSoup(content)
appName = soup.find('h1','doc-banner-title').contents[0]
permissions = soup.findAll('div','doc-permission-description')
self.pushToDB(appName, permissions, site)
"""
Pushes permissions of a certain app into the DB
cursor.execute('CREATE TABLE IF NOT EXISTS app_permissions (id INTEGER, appname VARCHAR(256), category VARCHAR(256), permission VARCHAR(256), url VARCHAR(256))')
"""
def pushToDB(self, appName, permissions, site):
logging.debug("Pushing to DB app: " + appName)
for p in permissions:
#print appName, cat, p.contents[0], url
self.curs.execute("INSERT INTO app_permissions VALUES ((?), (?), (?), (?), (?))", (None, appName, self.category, p.contents[0], site ) )
self.conn.commit()
if __name__ == "__main__":
    logging.debug("Started!")
    #run the crawler in the current thread; start() would spawn a separate thread
    MarketCrawler().run()
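# Sketch of a threaded variant (assumption: a single crawler still handles all
# categories, as run() does above). start() returns immediately, so the main
# thread should join() before exiting:
#
#   crawler = MarketCrawler()
#   crawler.start()
#   crawler.join()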