-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcronned.py
108 lines (95 loc) · 2.89 KB
/
cronned.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import requests, json, glob, os
import dotenv
from queries import getReq, getOne
import pandas as pd
# Let python-dotenv populate the process environment before any of the
# settings below are looked up.
dotenv.load_dotenv()

# API credentials and endpoint. Each lookup yields None when the variable
# is not set.
username = os.environ.get("API_username")
password = os.environ.get("API_password")
URL = os.environ.get("URL")

# NOTE(review): not read anywhere in the visible part of this script.
regenerate_answers = True
# Search phrases to query. They are sent to the API exactly as written
# (spaces included); only the cache file names use a hyphenated form.
KW = [
    "digital",
    "data",
    "data engineering",
    "software developer",
    "machine learning",
    "artificial intelligence",
    "data management",
    "information management",
    "digital consultancy",
]

# For every keyword, request two result pages and store each raw response
# body on disk as list_<keyword>_<page>.json.
for kw in KW:
    # File-name-safe variant of the keyword. It is kept in its own variable so
    # that `kw` is never modified: rebinding `kw` inside the page loop would
    # make the second page query the hyphenated text instead of the phrase.
    kwn = kw.replace(" ", "-")
    for k in range(2):
        FILENAME = "/home/kelu/projets/mm_jobs/data/lists/list_" + kwn + "_" + str(k) + ".json"
        r = requests.post(
            URL,
            auth=(username, password),  # HTTP basic auth credentials
            # NOTE(review): the 2nd/3rd arguments look like a page size and an
            # offset - confirm against queries.getReq.
            json=getReq(kw, 20, k * 20),  # requests serialises the body to JSON
        )
        # The API answers with the literal body "Unauthorized" when the
        # credentials are rejected; such a response is not worth caching.
        if r.text != "Unauthorized":
            with open(FILENAME, "w") as f:
                f.write(r.text)
# Gather the pageVersionId of every hit found in the cached list files.
# Duplicates are kept; the summary line below shows total vs. distinct counts.
IDs = []
for list_path in glob.glob("/home/kelu/projets/mm_jobs/data/lists/*.json"):
    with open(list_path, "r") as handle:
        payload = handle.read()
    # A file holding only the text "Unauthorized" is not JSON; skip it.
    if payload == "Unauthorized":
        continue
    hits = json.loads(payload)["hits"]["hits"]
    IDs.extend(hit["_source"]["pageVersionId"] for hit in hits)
print(len(IDs), len(set(IDs)))
# Download each job document that is not already cached on disk.
for job_id in IDs:
    # The request body is built up front, before the cache check.
    job_request = getOne(job_id)
    job_path = "/home/kelu/projets/mm_jobs/data/jobs/job_" + str(job_id) + ".json"
    if os.path.exists(job_path):
        # Already fetched (possibly by an earlier duplicate ID); nothing to do.
        continue
    print("Processing ", job_id)
    response = requests.post(
        URL,
        auth=(username, password),
        json=job_request,
    )
    # Do not cache a response whose body is the literal "Unauthorized".
    if response.text == "Unauthorized":
        continue
    with open(job_path, "w") as out:
        out.write(response.text)
# Rebuild IDs from scratch: from here on it holds the "_source" record of
# every hit found in the cached job files (no longer the page version ids).
IDs = []
job_files = glob.glob("/home/kelu/projets/mm_jobs/data/jobs/*.json")
print(len(job_files))
for job_file in job_files:
    with open(job_file, "r") as handle:
        payload = handle.read()
    # A file holding only the text "Unauthorized" is not JSON; skip it.
    if payload == "Unauthorized":
        continue
    for hit in json.loads(payload)["hits"]["hits"]:
        IDs.append(hit["_source"])
def getSkill(x):
    """Classify a job title into a coarse skill label.

    The title is lower-cased and checked for keywords in a fixed priority
    order, so a title that matches several keywords gets the first label:
    "data", then "information management", then "digital".

    Args:
        x: Job title. Any non-string value (for example ``None`` or the NaN
            that pandas uses for a missing cell) is treated as matching
            nothing instead of raising ``AttributeError``.

    Returns:
        "Data", "Information Management", "Digital", or "" when no keyword
        is found.
    """
    if not isinstance(x, str):
        return ""
    title = x.lower()
    # Order matters: earlier entries win when several keywords are present.
    for keyword, label in (
        ("data", "Data"),
        ("information management", "Information Management"),
        ("digital", "Digital"),
    ):
        if keyword in title:
            return label
    return ""
# Fields of each job record that are carried over into the export.
export_columns = [
    "jobRef",
    "pageVersionId",
    "contentPageId",
    "title",
    "pageText",
    "publishedDate",
    "sector",
    "discipline",
    "jobSector",
]
df = pd.DataFrame(IDs)[export_columns]

# Flag rows whose sector contains "Digital".
# NOTE(review): this is a plain `in` test - substring match if sector is a
# string, element match if it is a list; the record schema is not visible here.
df["Digital"] = df["sector"].apply(lambda sector: "Digital" in sector)

# Coarse skill label derived from the job title.
df["skill"] = df["title"].apply(getSkill)

# Write the same table in both formats.
df.to_excel("/home/kelu/projets/mm_jobs/outputs/digital_jds.xlsx")
df.to_parquet("/home/kelu/projets/mm_jobs/outputs/digital_jds.parquet.gzip", compression="gzip")

# Bare expression: has no effect when the file is run as a script.
df