Skip to content

Commit 732fe4d

Browse files
committed
Revamped external manual data, split sitemap
1 parent 0549b87 commit 732fe4d

File tree

9 files changed

+85
-38
lines changed

9 files changed

+85
-38
lines changed

operators/deploy/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def operator(*_):
6161
source = DF.Flow(
6262
load_from_airtable(settings.AIRTABLE_ALTERNATE_BASE, spec.table, settings.AIRTABLE_VIEW, settings.AIRTABLE_API_KEY),
6363
DF.select_fields([spec.id_field] + select_fields),
64-
).results()[0][0]
64+
).results(onerror=None)[0][0]
6565
source = dict((row[spec.id_field], row) for row in source)
6666

6767
DF.Flow(

operators/derive/autocomplete.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def prepare_locations():
5050
tmpfile.close()
5151
all_places = DF.Flow(
5252
DF.load(tmpfile.name, format='datapackage'),
53-
).results()[0][0]
53+
).results(onerror=None)[0][0]
5454
keys = [n for rec in all_places for n in rec['name']]
5555
mapping = dict((n ,rec['bounds']) for rec in all_places for n in rec['name'])
5656
return keys, mapping

operators/derive/manual_fixes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def fetch_aux_table(self, var, table):
2525
var = DF.Flow(
2626
load_from_airtable(settings.AIRTABLE_BASE, table, settings.AIRTABLE_VIEW, settings.AIRTABLE_API_KEY),
2727
DF.select_fields([AIRTABLE_ID_FIELD, 'id']),
28-
).results()[0][0]
28+
).results(onerror=None)[0][0]
2929
logger.info(f'Got {len(var)} {table} records')
3030
var = dict((r['id'], r) for r in var)
3131
return var

operators/derive/to_dp.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -564,7 +564,7 @@ def card_data_flow():
564564
resources=['situations'],
565565
),
566566
DF.select_fields(['key', 'id', 'name', 'synonyms']),
567-
).results()[0][0]
567+
).results(onerror=None)[0][0]
568568
situations = dict(
569569
(s.pop('key'), s) for s in situations
570570
) | dict(
@@ -576,7 +576,7 @@ def card_data_flow():
576576
resources=['responses'],
577577
),
578578
DF.select_fields(['key', 'id', 'name', 'synonyms']),
579-
).results()[0][0]
579+
).results(onerror=None)[0][0]
580580
responses = dict(
581581
(r.pop('key'), r) for r in responses
582582
) | dict(

operators/derive/to_sitemap.py

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,30 +21,42 @@ def data_api_sitemap_flow():
2121
DF.concatenate(dict(path=[]), target=dict(name='sitemap', path='sitemap.csv')),
2222
DF.set_type('path', transform=lambda v: v.replace("'", '&apos;').replace('"', '&quot;')),
2323
DF.printer()
24-
).results()[0][0]
24+
).results(onerror=None)[0][0]
2525
today = datetime.date.today().isoformat()
26-
with tempfile.NamedTemporaryFile(mode='w') as buff:
27-
buff.write('<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
28-
for row in urls:
29-
buff.write('<url><loc>https://www.kolsherut.org.il{}</loc><lastmod>{}</lastmod></url>\n'.format(row['path'], today))
30-
buff.write('</urlset>')
31-
dumper = dump_to_ckan(settings.CKAN_HOST, settings.CKAN_API_KEY, settings.CKAN_OWNER_ORG, force_format=False)
32-
datapackage = dict(
33-
name='sitemap',
34-
resources=[dict(
35-
name='sitemap',
36-
path='sitemap.xml',
26+
with tempfile.TemporaryDirectory() as tmpdir:
27+
idx = 0
28+
resources = []
29+
while len(urls) > 0:
30+
res_name = f'sitemap_{idx}' if idx > 0 else 'sitemap'
31+
base_filename = f'{res_name}.xml'
32+
filename = f'{tmpdir}/{base_filename}'
33+
with open(filename, 'w') as buff:
34+
buff.write('<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
35+
_urls = urls[:50000]
36+
for row in _urls:
37+
buff.write('<url><loc>https://www.kolsherut.org.il{}</loc><lastmod>{}</lastmod></url>\n'.format(row['path'], today))
38+
buff.write('</urlset>')
39+
urls = urls[50000:]
40+
resources.append(dict(
41+
name=res_name,
42+
path=base_filename,
3743
format='xml',
3844
schema=dict(
3945
fields=[dict(name='path', type='string')]
4046
)
41-
)],
47+
))
48+
idx += 1
49+
50+
dumper = dump_to_ckan(settings.CKAN_HOST, settings.CKAN_API_KEY, settings.CKAN_OWNER_ORG, force_format=False)
51+
datapackage = dict(
52+
name='sitemap',
53+
resources=resources,
4254
)
4355
dumper.datapackage = Package(datapackage)
4456
dumper.write_ckan_dataset(dumper.datapackage)
45-
buff.flush()
4657
print(dumper.datapackage.resources[0].descriptor)
47-
dumper.write_file_to_output(buff.name, 'sitemap.xml')
58+
for resource in resources:
59+
dumper.write_file_to_output(f'{tmpdir}/{resource.path}', resource.path)
4860

4961

5062
def operator(*_):

operators/manual_data_entry/external.py

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,27 @@ def func(rows):
5555
emit['Org Email'] = row.get('Org Email')
5656
emit['Org Website'] = row.get('Org Website')
5757
emit['Data Source'] = row['Source Name']
58-
emit['taxonomies'] = [service['קטגוריה'], service['אוכלוסיית יעד'], service['שפה'], service.get('שפה-2'), service.get('שפה-3'), service.get('שפה-4'), service.get('שפה-5')]
58+
try:
59+
emit['taxonomies'] = [service['קטגוריה'], service['אוכלוסיית יעד'], service['שפה'], service.get('שפה-2'), service.get('שפה-3'), service.get('שפה-4'), service.get('שפה-5')]
60+
except KeyError:
61+
pass
62+
try:
63+
emit['target_audiences'] = service['אוכלוסיות יעד']
64+
except KeyError:
65+
emit['target_audiences'] = None
66+
emergency_service_msg = 'יש לתייג כשירות חירום'
67+
emergency_service = False
68+
try:
69+
emergency_service = service['שירות למצב החירום'] is True
70+
except KeyError:
71+
pass
72+
try:
73+
notes = service['הערות חופשיות'] or ''
74+
notes = (notes + '\n\n' + emergency_service_msg) if emergency_service else notes
75+
notes = notes.strip()
76+
emit['notes'] = notes
77+
except KeyError:
78+
emit['notes'] = None if not emergency_service else emergency_service_msg
5979
yield emit
6080

6181
return DF.Flow(
@@ -69,27 +89,30 @@ def func(rows):
6989
DF.add_field('Service Website', 'string'),
7090
DF.add_field('Service Email', 'string'),
7191
DF.add_field('Data Source', 'string'),
72-
DF.add_field('taxonomies', 'array', []),
92+
DF.add_field('taxonomies', 'array'),
93+
DF.add_field('target_audiences', 'string'),
94+
DF.add_field('notes', 'string'),
7395
func
7496
)
7597

7698
def handle_taxonomies(taxonomies):
7799
def func(row):
78100
responses = set()
79101
situations = set()
80-
for t in row['taxonomies']:
81-
if not t:
82-
continue
83-
t = t.strip()
84-
if t in taxonomies:
85-
responses.update(taxonomies[t]['response_ids'] or [])
86-
situations.update(taxonomies[t]['situation_ids'] or [])
87-
row['responses_ids'] = list(responses)
88-
row['situations_ids'] = list(situations)
102+
if row.get('taxonomies'):
103+
for t in row['taxonomies']:
104+
if not t:
105+
continue
106+
t = t.strip()
107+
if t in taxonomies:
108+
responses.update(taxonomies[t]['response_ids'] or [])
109+
situations.update(taxonomies[t]['situation_ids'] or [])
110+
row['responses_ids'] = list(responses)
111+
row['situations_ids'] = list(situations)
89112

90113
return DF.Flow(
91-
DF.add_field('responses_ids', 'array', []),
92-
DF.add_field('situations_ids', 'array', []),
114+
DF.add_field('responses_ids', 'array'),
115+
DF.add_field('situations_ids', 'array'),
93116
func,
94117
DF.delete_fields(['taxonomies'])
95118
)

operators/manual_data_entry/mde_utils.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,15 @@ def func(row):
3434
return func
3535

3636

37+
def handle_no_taxonomies():
38+
def func(row):
39+
data = row['data']
40+
if not data.get('responses'):
41+
data.pop('responses', None)
42+
if not data.get('situations'):
43+
data.pop('situations', None)
44+
return func
45+
3746
# ORGS
3847
def org_updater():
3948
def func(row):
@@ -206,7 +215,7 @@ def mde_service_flow(data_sources, source_id):
206215
DF.update_resource(-1, name='services'),
207216
DF.select_fields(['Org Id', 'Branch Address', 'Branch Geocode', 'Data Source',
208217
'Service Name', 'Service Description', 'Service Conditions', 'Service Phone Number', 'Service Email', 'Service Website',
209-
'responses_ids', 'situations_ids']),
218+
'responses_ids', 'situations_ids', 'target_audiences', 'notes']),
210219
DF.rename_fields({
211220
'Org Id': 'organization',
212221
'Data Source': 'data_source',
@@ -231,14 +240,17 @@ def mde_service_flow(data_sources, source_id):
231240
phone_numbers=r.get('phone_numbers'),
232241
data_source=r['data_source'],
233242
email_address=r.get('email_address'),
243+
target_audiences=r.get('target_audiences'),
244+
notes=r.get('notes'),
234245
)),
246+
handle_no_taxonomies(),
235247
DF.add_field('id', 'string', lambda r: mde_id(r['branch_id'], r['name'])),
236248
DF.select_fields(['id', 'data']),
237249
).results()[0][0]
238250

239251
print('COLLECTED {} relevant services'.format(len(services)))
240252
airtable_updater(settings.AIRTABLE_SERVICE_TABLE, source_id,
241-
['id', 'name', 'description', 'payment_details', 'phone_numbers', 'email_address', 'urls', 'situations', 'responses', 'branches', 'data_sources'],
253+
['id', 'name', 'description', 'payment_details', 'phone_numbers', 'email_address', 'urls', 'situations', 'responses', 'branches', 'data_sources', 'target_audiences', 'notes'],
242254
services,
243255
service_updater(data_sources),
244256
airtable_base=settings.AIRTABLE_DATA_IMPORT_BASE

operators/soproc/click_scraper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,8 +155,8 @@ def scrape_click():
155155
DF.update_resource(-1, name='click'),
156156
decode_and_clean(),
157157
filter_results(),
158-
DF.add_field('data_sources', 'string', lambda r: 'https://clickrevaha.molsa.gov.il/product-page/{product_id}#השירות בהרחבה ב״קליק לרווחה״'.format(**r)),
159-
DF.add_field('urls', 'string', lambda r: r['data_sources']),
158+
DF.add_field('data_sources', 'string', None),
159+
DF.add_field('urls', 'string', None),
160160
DF.select_fields(list(SELECT_FIELDS.keys())),
161161
DF.rename_fields(SELECT_FIELDS),
162162
DF.set_type('details',

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
dataflows>=0.5.4
1+
dataflows>=0.5.5
22
dataflows-airtable>=0.2.3
33
dataflows-elasticsearch>=0.1.1
44
dataflows-ckan>=0.3.8

0 commit comments

Comments
 (0)