Skip to content

Commit

Permalink
Fixes to sitemap
Browse files (browse the repository at this point in the history)
  • Loading branch information
akariv committed Apr 2, 2024
1 parent 1f61166 commit 0894f54
Showing 1 changed file with 12 additions and 8 deletions.
20 changes: 12 additions & 8 deletions operators/derive/to_sitemap.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,33 +10,37 @@

def data_api_sitemap_flow():
urls = DF.Flow(
[dict(path='/')],
DF.load(f'{settings.DATA_DUMP_DIR}/autocomplete/datapackage.json'),
# DF.load(f'{settings.DATA_DUMP_DIR}/place_data/datapackage.json'),
DF.filter_rows(lambda r: r['visible'] and not r['low'] and r['score'] > 1, resources='autocomplete'),
DF.load(f'{settings.DATA_DUMP_DIR}/card_data/datapackage.json'),
DF.add_field('path', 'string', lambda r: '/s/{id}'.format(**r), resources='autocomplete'),
# DF.add_field('path', 'string', lambda r: '/p/{key}'.format(**r), resources='places'),
DF.add_field('path', 'string', lambda r: '/c/{card_id}'.format(**r), resources='card_data'),
DF.concatenate(dict(path=[]), target=dict(name='sitemap', path='sitemap.csv')),
DF.set_type('path', transform=lambda v: v.replace("'", '&apos;').replace('"', '&quot;')),
DF.printer()
).results(on_error=None)[0][0]
).results(on_error=None)[0]
today = datetime.date.today().isoformat()
urls[0].insert(0, dict(path='/about/contact'))
urls[0].insert(0, dict(path='/about/partners'))
urls[0].insert(0, dict(path='/about/kolsherut'))
urls[0].insert(0, dict(path='/'))
_urls = []
with tempfile.TemporaryDirectory() as tmpdir:
idx = 0
resources = []
while len(urls) > 0:
while len(urls) > 0 or len(_urls) > 0:
if len(_urls) == 0:
_urls = urls.pop(0)
res_name = f'sitemap_{idx}' if idx > 0 else 'sitemap'
base_filename = f'{res_name}.xml'
filename = f'{tmpdir}/{base_filename}'
with open(filename, 'w') as buff:
buff.write('<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
_urls = urls[:50000]
for row in _urls:
for row in _urls[:50000]:
buff.write('<url><loc>https://www.kolsherut.org.il{}</loc><lastmod>{}</lastmod></url>\n'.format(row['path'], today))
buff.write('</urlset>')
urls = urls[50000:]
_urls = _urls[50000:]
resources.append(dict(
name=res_name,
path=base_filename,
Expand All @@ -46,7 +50,7 @@ def data_api_sitemap_flow():
)
))
idx += 1

assert len(resources) == 2
dumper = dump_to_ckan(settings.CKAN_HOST, settings.CKAN_API_KEY, settings.CKAN_OWNER_ORG, force_format=False)
datapackage = dict(
name='sitemap',
Expand Down

0 comments on commit 0894f54

Please sign in to comment.