-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path4_getAllIndustryDataThroughQIDList.py
41 lines (31 loc) · 1.43 KB
/
4_getAllIndustryDataThroughQIDList.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import os
import pandas as pd
import re
from wikidata.api_queries.queryIndustry import request_queryIndustry
# 17 of 196 failed
# Look up each company's Wikidata entry by its QID and extract the company's industry sector,
# then write the companies together with the query results to a new JSON file.
# Matches the bare Wikidata identifier inside an entity URL,
# e.g. "Q81965" in "http://www.wikidata.org/entity/Q81965".
QID_PATTERN = re.compile(r'Q\d+')

# Both paths are resolved relative to this script's directory (see main()).
INPUT_FILE = "./data_sp500/uniqueCompaniesWithQidsAndWithLocationData.json"
OUTPUT_FILE = './data_sp500/uniqueCompaniesWithQidsLocationAndWithIndustry.json'


def _extract_qid(qid_link):
    """Return the first ``Q<digits>`` token found in *qid_link*, or ``None``.

    ``None`` is returned for any non-string value (missing entries load from
    JSON as ``None``/``NaN``) and for strings that contain no QID at all.
    """
    if not isinstance(qid_link, str):
        return None
    match = QID_PATTERN.search(qid_link)
    return match.group() if match else None


def _industry_label(response):
    """Return ``response['industryLabel']['value']``, or ``None`` if unavailable."""
    if response is None:
        return None
    try:
        return response['industryLabel']['value']
    except (KeyError, TypeError):
        # A response without an industry label is treated as "not found"
        # rather than aborting the run and losing every result gathered so far.
        return None


def fill_industries(df, query_industry=None):
    """Query the industry for every row of *df* and store it in ``df['industry']``.

    The ``industry`` column is (re)created empty first, so every row is
    queried on each call. *df* is modified in place.

    Args:
        df: DataFrame with a ``qid`` column holding Wikidata entity links.
        query_industry: Callable taking a bare QID string and returning the
            query response (or ``None``). Defaults to ``request_queryIndustry``.

    Returns:
        The number of rows for which no industry could be determined
        (missing or malformed link, no response, or no industry label).
    """
    if query_industry is None:
        query_industry = request_queryIndustry

    df['industry'] = None
    # Strip once up front; non-string entries are passed through untouched so
    # the comparison below simply never matches them.
    stripped_links = df['qid'].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )

    not_found_count = 0
    for qid_link in df['qid']:
        qid = _extract_qid(qid_link)
        if qid is None:
            not_found_count += 1
            continue

        industry = _industry_label(query_industry(qid))
        if industry is None:
            not_found_count += 1
            continue

        print(industry)
        # Compare stripped against stripped: comparing the stripped column
        # with the raw link would never match a link carrying whitespace.
        df.loc[stripped_links == qid_link.strip(), 'industry'] = industry

    return not_found_count


def main():
    """Load the company list, add the industry column and write the result."""
    current_dir = os.path.dirname(__file__)
    df = pd.read_json(os.path.join(current_dir, INPUT_FILE), orient='columns')

    total_queries = len(df)
    print('total queries: ', total_queries)

    not_found_count = fill_industries(df)

    # Write next to the input file instead of relative to the working
    # directory, so the script behaves the same wherever it is launched from.
    df.to_json(os.path.join(current_dir, OUTPUT_FILE), orient='records', indent=4)
    print('total number of no company qids found:', not_found_count, ' of ', total_queries)


if __name__ == "__main__":
    main()