diff --git a/operators/derive/to_sitemap.py b/operators/derive/to_sitemap.py index b6321f6..f961eb3 100644 --- a/operators/derive/to_sitemap.py +++ b/operators/derive/to_sitemap.py @@ -10,7 +10,6 @@ def data_api_sitemap_flow(): urls = DF.Flow( - [dict(path='/')], DF.load(f'{settings.DATA_DUMP_DIR}/autocomplete/datapackage.json'), # DF.load(f'{settings.DATA_DUMP_DIR}/place_data/datapackage.json'), DF.filter_rows(lambda r: r['visible'] and not r['low'] and r['score'] > 1, resources='autocomplete'), @@ -18,25 +17,30 @@ def data_api_sitemap_flow(): DF.add_field('path', 'string', lambda r: '/s/{id}'.format(**r), resources='autocomplete'), # DF.add_field('path', 'string', lambda r: '/p/{key}'.format(**r), resources='places'), DF.add_field('path', 'string', lambda r: '/c/{card_id}'.format(**r), resources='card_data'), - DF.concatenate(dict(path=[]), target=dict(name='sitemap', path='sitemap.csv')), DF.set_type('path', transform=lambda v: v.replace("'", ''').replace('"', '"')), DF.printer() - ).results(on_error=None)[0][0] + ).results(on_error=None)[0] today = datetime.date.today().isoformat() + urls[0].insert(0, dict(path='/about/contact')) + urls[0].insert(0, dict(path='/about/partners')) + urls[0].insert(0, dict(path='/about/kolsherut')) + urls[0].insert(0, dict(path='/')) + _urls = [] with tempfile.TemporaryDirectory() as tmpdir: idx = 0 resources = [] - while len(urls) > 0: + while len(urls) > 0 or len(_urls) > 0: + if len(_urls) == 0: + _urls = urls.pop(0) res_name = f'sitemap_{idx}' if idx > 0 else 'sitemap' base_filename = f'{res_name}.xml' filename = f'{tmpdir}/{base_filename}' with open(filename, 'w') as buff: buff.write('') - _urls = urls[:50000] - for row in _urls: + for row in _urls[:50000]: buff.write('https://www.kolsherut.org.il{}{}\n'.format(row['path'], today)) buff.write('') - urls = urls[50000:] + _urls = _urls[50000:] resources.append(dict( name=res_name, path=base_filename, @@ -46,7 +50,7 @@ def data_api_sitemap_flow(): ) )) idx += 1 - + assert len(resources) == 2 dumper = dump_to_ckan(settings.CKAN_HOST, settings.CKAN_API_KEY, settings.CKAN_OWNER_ORG, force_format=False) datapackage = dict( name='sitemap',