#! python3
# Preprocess markdown files in a directory before passing to Hugo
import os
import shutil
# pip install python-frontmatter
import frontmatter
from io import BytesIO
import re
import yaml

print("Obsigo v0.2 - Preprocess Obsidian markdown files for Hugo")


# Process the frontmatter of a markdown file.
# Note: uses the module-level unimportant_frontmatter_keys loaded from the config in __main__.
# TODO: convert tags with spaces to tags with hyphens
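# Illustrative sketch (hypothetical frontmatter; 'lastmod' stands in for a key
# listed in unimportant_frontmatter_keys): for posts/my-trip/index.md with
#   title: My Trip
#   visibility: published
#   lastmod: 2023-01-02
#   tags: ["travel log"]
# this function drops 'visibility' and 'lastmod', adds slug: my-trip (from the
# parent directory), rewrites the tag to travel-log, and registers the alias
# 'my-trip' -> /posts/my-trip/ for the site-wide redirects.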
def process_frontmatter(src_metadata, rel_src_filepath, rel_dest_filepath, site_aliases_dict, stats_dict):
    source_changed = False  # Source data has not changed yet
    print(" FRONTMATTER:")
    # Cleanup/Remove unimportant keys:
    for key in unimportant_frontmatter_keys:
        if key in src_metadata:
            print(f" CLEANUP: Removing unimportant key from source '{key}: {src_metadata[key]}'")
            del src_metadata[key]
            source_changed = True
            stats_dict['frontmatter_source_cleanups'] += 1
    # Remove the 'visibility' key if its value is 'published'
    if 'visibility' in src_metadata and src_metadata['visibility'] == 'published':
        print(f" CLEANUP: Removing key 'visibility': {src_metadata['visibility']}")
        del src_metadata['visibility']
        source_changed = True
        stats_dict['frontmatter_source_cleanups'] += 1
    if 'draft' in src_metadata:
        draft = src_metadata['draft']
    else:
        draft = False
    # ---
    # Collect aliases:
    if 'aliases' in src_metadata:
        # Make a copy of the aliases
        post_collected_aliases = src_metadata['aliases'].copy()
        # Make sure it is a list
        if not isinstance(post_collected_aliases, list):
            post_collected_aliases = [post_collected_aliases]
        stats_dict['aliases_collected'] += len(post_collected_aliases)
    else:
        post_collected_aliases = []
    # print(f" Collected Aliases: {post_collected_aliases}")
    # Also add the last directory of the file path (not the filename) as an alias
    split_path = rel_src_filepath.split('/')
    # print(f" Split path: {split_path} {len(split_path)}")
    if split_path[-1] == 'index.md' or split_path[-1] == '_index.md':
        # We are in a subdirectory if we have a parent:
        if len(split_path) >= 2:
            # We have a parent: add the last directory of the file path (not the filename) as an alias
            main_slug = split_path[-2]
            post_collected_aliases.append(main_slug)
            stats_dict['slugs_collected'] += 1
            print(f" CONTENT SUBDIR - Adding last directory '{main_slug}' as an alias.")
        else:
            # No parent
            main_slug = None
    else:
        # We have a file name, not an `index.md` file.
        # Use the last part of the file path (with the trailing .md removed) as the main slug.
        # Can be an article, can also be search.md
        main_slug = re.sub(r'\.md$', '', split_path[-1])
        post_collected_aliases.append(main_slug)
        stats_dict['slugs_collected'] += 1
        print(f" NAMED MD - Adding file name '{main_slug}' as an alias.")
    print(f" SLUG: Main slug: {main_slug}")
    print(f" ALIASES: Collected Aliases: {post_collected_aliases}")
    # --
    # Fix 'slug:' in source if necessary:
    if main_slug is None:
        # Root _index.md: do nothing
        print(" Home page, no slug needed.")
    elif 'slug' not in src_metadata:
        # No 'slug', let's add it:
        print(f" Adding missing slug to source: '{main_slug}'")
        src_metadata['slug'] = main_slug
        source_changed = True
        stats_dict['missing_slugs_fixed'] += 1
    elif str(src_metadata['slug']) != main_slug:
        # DIVERGENT slug found! The slug in the frontmatter differs from the one
        # derived from the file path, so we need to roll the slugs:
        print(f" Divergent slugs found: '{src_metadata['slug']}' != '{main_slug}'")
        # The old slug must become an alias:
        post_collected_aliases.append(src_metadata['slug'])
        stats_dict['divergent_slugs_fixed'] += 1  # Old slug becomes an alias and filename becomes new slug
        # Add it to the original aliases if not already there:
        if 'aliases' not in src_metadata:
            src_metadata['aliases'] = []
        if src_metadata['slug'] not in src_metadata['aliases']:
            src_metadata['aliases'].append(src_metadata['slug'])
        # The new slug must become the canonical slug:
        src_metadata['slug'] = main_slug
        source_changed = True
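    # Illustrative case (hypothetical names): posts/new-name/index.md carrying
    # 'slug: old-name' ends up with 'slug: new-name', while 'old-name' is added
    # to its aliases so old URLs keep redirecting.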
    # ---
    # Build the page URI:
    # Remove a trailing /index.md or /_index.md from the canonical URI:
    canonical_uri = re.sub(r'/_?index\.md$', '/', '/' + rel_dest_filepath)
    # If there is still a trailing .md, remove it (can happen for search.md for example)
    canonical_uri = re.sub(r'\.md$', '/', canonical_uri)
    # Build redirects (site aliases):
    for alias in post_collected_aliases:
        # Make sure the alias is a string
        alias = str(alias)
        # Keep only the last part of the alias --> NO, we actually want to handle aliases like STMag/index.html
        # alias = alias.split('/')[-1]
        if alias in site_aliases_dict:
            print(f"!!!WARNING!!! Alias '{alias}' already exists in the dictionary.")
            stats_dict['foreverlinks_conflicts_detected'] += 1
        else:
            if draft:
                print(f" NOT ADDING alias {alias}->{canonical_uri} because it's a draft.")
            else:
                site_aliases_dict[alias] = canonical_uri
                print(f" Added alias {alias}->{canonical_uri} to the dictionary.")
                stats_dict['foreverlinks_collected'] += 1
    # ---
    # Clean up tags:
    if 'tags' in src_metadata:
        tags = src_metadata['tags']
        if tags is not None:
            for i, tag in enumerate(tags):
                # Convert tags with spaces to tags with hyphens
                if ' ' in tag:
                    print(f" TAGS: Converting tag '{tag}' to '{tag.replace(' ', '-')}'")
                    tags[i] = tag.replace(' ', '-')
                    source_changed = True
                # Remove the `#` character from tags.
                # Work on tags[i] so a space fix made just above is not lost:
                if '#' in tag:
                    print(f" TAGS: Removing '#' from tag '{tag}'")
                    tags[i] = tags[i].replace('#', '')
                    source_changed = True
            src_metadata['tags'] = tags
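            # Illustrative cleanup (hypothetical tags):
            #   ['retro computing', '#hardware'] -> ['retro-computing', 'hardware']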
    # ---
    # If the cover image is a HEIC file, point the frontmatter at its JPEG equivalent
    if 'cover_img' in src_metadata:
        cover_img = src_metadata['cover_img']
        if cover_img is not None:
            if cover_img.endswith('.heic'):
                print(f" COVER: Converting cover_img:'{cover_img}' to JPEG")
                src_metadata['cover_img'] = re.sub(r'\.heic$', '.jpeg', cover_img)
                source_changed = True
    if 'cover' in src_metadata:
        cover = src_metadata['cover']['image']
        if cover is not None:
            if cover.endswith('.heic'):
                print(f" COVER: Converting cover.image:'{cover}' to JPEG")
                src_metadata['cover']['image'] = re.sub(r'\.heic$', '.jpeg', cover)
                source_changed = True
    return source_changed
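

# Illustrative sketch of what process_links() below does to the Hugo copy
# (the URLs and link texts are hypothetical):
#   ![demo](https://www.youtube.com/watch?v=abc123XYZ)  ->  {{< youtube abc123XYZ >}}
#   [the article](/posts/my-article/index.md)           ->  [the article](/posts/my-article/)
#   [this one](/posts/other/other.md)                   ->  [this one](/posts/other/)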
# Extract and print all links from the markdown content.
# Returns (content, hugo_content): the source content (currently returned unchanged)
# and the rewritten Hugo content.
# Note: uses the module-level stats_dict defined in __main__.
def process_links(content, file_path):
    # Initialize the Hugo content with the original content
    hugo_content = content
    markdown_links = re.findall(r'(!?\[(.*?)\]\((.*?)\))', content)
    if markdown_links:
        print(f" MD Links found in {file_path}:")
        for match in markdown_links:
            full_md_link, link_text, link_url = match
            print(f" - {full_md_link}")
            # Check if the link is a YouTube embed and convert it to a Hugo shortcode
            if full_md_link.startswith('!') and ('youtube.com/watch' in link_url or 'youtu.be/' in link_url):
                youtube_id = re.findall(r'(?:https?://(?:www\.)?youtube\.com/watch\?v=|https?://youtu\.be/)([\w-]+)', link_url)
                if youtube_id:
                    # Convert the YouTube link to a Hugo shortcode https://gohugo.io/content-management/shortcodes/#youtube
                    hugo_tag = f'{{{{< youtube {youtube_id[0]} >}}}}'
                    print(f" - Converting YouTube link to Hugo shortcode: {hugo_tag}")
                    hugo_content = hugo_content.replace(full_md_link, hugo_tag)
                    stats_dict['youtube_links_converted'] += 1
            # Check if the link ends in index.md and remove it
            elif link_url.endswith('/index.md'):
                print(f" - Removing 'index.md' from the link")
                hugo_content = hugo_content.replace(full_md_link, f"[{link_text}]({link_url.replace('/index.md', '/')})")
                stats_dict['links_removed_index.md'] += 1
            else:
                # Check if the link repeats the filename like /xyz/filename/filename.md
                parts = link_url.split('/')
                if len(parts) >= 2 and parts[-1] == parts[-2] + ".md":
                    print(f" - Repeated filename found: {parts[-1]}")
                    # Remove the last part
                    new_link_url = '/'.join(parts[:-1]) + '/'
                    print(f" - New link: {new_link_url}")
                    hugo_content = hugo_content.replace(full_md_link, f"[{link_text}]({new_link_url})")
                    stats_dict['links_removed_duplicate_filename.md'] += 1
    # Find HTML links (report only; rewriting is not enabled yet)
    # html_links = re.findall(r'(<a\s+(?:[^>]*?\s+)?href=(["\'])(.*?)\2>(.*?)<\/a>)', content)
    html_links = re.findall(r'(<a\s+(?:[^>]*?\s+)?href=(["\'])(.*?)\2.*?>(.*?)<\/a>)', content)
    if html_links:
        print(f" HTML Links found in {file_path}:")
        for match in html_links:
            print(f" - {match[0]}")
            markdown_link = f"[{match[3]}]({match[2]})"
            print(f" - MD equiv: {markdown_link}")
            # Replacement in the content is not enabled yet:
            # content = content.replace(match[0], markdown_link)
    # Find tags and make them links, BUT only outside code spans and code blocks
    # TODO: generalize this protection to all replacements
    # Regex pattern to match fenced code blocks and inline code spans
    code_pattern = r'(```[\s\S]*?```|`[\s\S]*?`)'
    # Find and store all code blocks and inline code spans
    code_parts = re.findall(code_pattern, hugo_content, re.DOTALL)
    # print(f" Found {len(code_parts)} code blocks or inline code spans.")
    # Temporarily replace code blocks and inline code spans with placeholders
    placeholder = "\0CODE\0"
    hugo_content_protected = re.sub(code_pattern, placeholder, hugo_content)
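    # Illustrative note (hypothetical content): in a body like
    #   Use `git commit -m "#42"` and see #changelog
    # the backticked span is swapped for the placeholder first, so only the
    # bare #changelog below is turned into a tag link.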
    # print(hugo_content_protected)
    # Now replace #tag outside code spans and code blocks
    tags = re.findall(r'\s#(\w(\w|-)*)', hugo_content_protected)
    if tags:
        print(f" Tags found in {file_path}:")
        for tag, dummy in tags:
            print(f" - #{tag}")
            # Make a link to the tag page
            tag_link = f"[#{tag}](/tags/{tag}/)"
            print(f" - MD link: {tag_link}")
            # Replace in the content. The lookahead keeps a shorter tag like #foo
            # from clobbering a longer one like #foobar:
            hugo_content_protected = re.sub(rf"(\s)#{tag}(?![\w-])", r"\1" + tag_link, hugo_content_protected)
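            # Illustrative example (hypothetical tag): " ... about #retro-computing ..."
            # becomes " ... about [#retro-computing](/tags/retro-computing/) ..."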
    # Now find markdown images like ![alt text](image.jpg "title") "caption" and replace the title with the caption:
    # TODO: https://gohugo.io/content-management/shortcodes/#figure
    markdown_images = re.findall(r'(!\[(.*?)\]\((.*?)(\s*"(.*?)"\s*)?\)( *?\s *?"(.*?)")?)', hugo_content_protected)
    if markdown_images:
        print(f" MD Images found in {file_path}:")
        for match in markdown_images:
            full_md_image, alt_text, image_url, extension, title_text, dummy, caption_text = match
            print(f" - Image URL: {image_url}")
            print(f" - Alt text: {alt_text}")
            print(f" - Title text: {title_text}")
            print(f" - Caption text: {caption_text}")
            # If no caption text is present, use the title text
            if caption_text == '':
                caption_text = title_text
            # Check if it is a HEIC image and point to the JPEG conversion instead
            if image_url.endswith('.heic'):
                image_url = re.sub(r'\.heic$', '.jpeg', image_url)
                print(f" - Converting heic to jpeg: {image_url}")
            # Replace in the content; only write a caption if we have one:
            if caption_text == '':
                hugo_content_protected = hugo_content_protected.replace(full_md_image, f"![{alt_text}]({image_url})")
            else:
                hugo_content_protected = hugo_content_protected.replace(full_md_image, f"![{alt_text}]({image_url} \"{caption_text}\")")
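            # Illustrative example (hypothetical file names): the Obsidian-style
            #   ![A sunset](sunset.heic "working title") "Sunset over the bay"
            # is rewritten for Hugo as
            #   ![A sunset](sunset.jpeg "Sunset over the bay")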
    # Now find ==highlighted text== and replace it with <mark>highlighted text</mark> (non-greedy).
    # Goldmark in Hugo could actually do this, but we do it here for consistency with Obsidian,
    # which accepts a space before the closing ==
    hugo_content_protected = re.sub(r'==(\w.*?)==', r'<mark>\1</mark>', hugo_content_protected)
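    # For example (illustrative): "a ==very important== point" -> "a <mark>very important</mark> point"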
    # Restore the code blocks and inline code spans
    if len(code_parts) > 0:
        # print(f" Restoring {len(code_parts)} code blocks or inline code spans...")
        for code_part in code_parts:
            # print(" Restoring code block or inline code span...: " + code_part)
            hugo_content_protected = hugo_content_protected.replace(placeholder, code_part, 1)
    hugo_content = hugo_content_protected
    return content, hugo_content


# Process a single markdown file:
def process_file(file_path, rel_src_filepath, dest_root, site_aliases_dict, stats_dict):
    stats_dict['source_md_files'] += 1
    # Load the file with the frontmatter lib:
    post = frontmatter.load(file_path)
    if "title" in post.metadata:
        print(f" Title: {post['title']}")
    # DESTINATION PATH:
    # Check if the filename ends in _?index.md or search.md
    if re.search(r'(/|^)(_?index|search)\.md$', rel_src_filepath):
        rel_dest_filepath = rel_src_filepath
        print(f" ALREADY INDEX - keep as is: {rel_dest_filepath}")
    else:
        print(" NOT INDEX.MD...")
        # Get the filename without the extension
        filename_base = re.sub(r'\.md$', '', os.path.basename(rel_src_filepath))
        # Get the last directory from the path
        lastlevel_dir = os.path.basename(os.path.dirname(rel_src_filepath))
        if filename_base == lastlevel_dir:
            rel_dest_filepath = os.path.join(os.path.dirname(rel_src_filepath), 'index.md')
            print(f" FILE == DIR - keep only one: {rel_dest_filepath}")
        else:
            # Need to make a leaf directory with index.md
            rel_dest_filepath = re.sub(r'\.md$', '/index.md', rel_src_filepath)
            print(f" FILE != DIR - Make new leaf directory: {rel_dest_filepath}")
    # Process the frontmatter
    source_changed = process_frontmatter(post.metadata, rel_src_filepath, rel_dest_filepath, site_aliases_dict, stats_dict)
    # Extract and print links from the content and update the content
    new_src_content, new_hugo_content = process_links(post.content, rel_src_filepath)
    # Replace ' # ' with ' \# ' in the destination content only (otherwise Hugo will try to render h1s
    # in case of bullet list entries like '- # of sectors per track')
    new_hugo_content = re.sub(r' # ', r' \# ', new_hugo_content)
    # Check if the source content has changed
    if new_src_content != post.content:
        post.content = new_src_content
        source_changed = True
    # Check if the filename is a bland index.md
    if re.search(r'/index\.md$', rel_src_filepath):
        print(f" BLAND index.md - Renaming to slug: {post.metadata['slug']}.md")
        # Rename the source file to the slug
        new_filename = str(post.metadata['slug']) + '.md'
        new_file_path = os.path.join(os.path.dirname(file_path), new_filename)
        print(f" Renaming file: {file_path} -> {new_file_path}")
        os.rename(file_path, new_file_path)
        # Make sure a modified source is saved to the renamed path, not the old one:
        file_path = new_file_path
        stats_dict['index_md_files_renamed'] += 1
    # Save the modified source file only if changes were made
    if not source_changed:
        print(" SOURCE unchanged.")
    else:
        print(f" Saving modified SOURCE file: {file_path} ...")
        # Create an in-memory bytes buffer
        f = BytesIO()
        # Dump the front matter into the bytes buffer
        frontmatter.dump(post, f)
        with open(file_path, 'wb') as output_file:
            output_file.write(f.getvalue())
    # -----------------------
    # Save the new Hugo content to the destination path
    post.content = new_hugo_content
    dest_file_path = os.path.join(dest_root, rel_dest_filepath)
    # Check if we need to create the directory:
    dest_dir = os.path.dirname(dest_file_path)
    if not os.path.exists(dest_dir):
        print(f" Creating destination directory: {dest_dir}")
        os.makedirs(dest_dir)
    print(f" Saving Hugo file: {dest_file_path} ...")
    # Create an in-memory bytes buffer for the Hugo content
    f = BytesIO()
    # Dump the front matter into the bytes buffer
    frontmatter.dump(post, f)
    with open(dest_file_path, 'wb') as output_file:
        output_file.write(f.getvalue())


# Recursively process all markdown files in a directory:
def process_directory(source_directory, destination_directory, site_aliases_dict, stats_dict):
    for root, dirs, files in os.walk(source_directory):
        # Prune hidden directories so os.walk does not descend into them
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        cur_dirname = os.path.basename(root)
        if cur_dirname.startswith('.'):
            # print(f"Skipping hidden directory: {root}")
            continue
        print()
        print()
        # print(f"Processing directory: {root} ... files:{files} ... subdirs: {dirs} ...")
        print(f"Processing directory: {root} ... ")
        if cur_dirname == '_assets':
            # Copy the _assets (images) directory to the destination
            source_assets_path = root
            relative_assets_path = os.path.relpath(source_assets_path, source_directory)
            dest_assets_path = os.path.join(destination_directory, relative_assets_path)
            # CHECK: when would dest_assets_path already exist, and should we skip it then?
            print()
            print(f" Copying _assets directory from {source_assets_path} to {dest_assets_path}")
            # A plain shutil.copytree(..., dirs_exist_ok=True) would overwrite everything;
            # instead, copy any new or more recent files but don't overwrite files that
            # are newer in the destination:
            os.makedirs(dest_assets_path, exist_ok=True)
            for filename in os.listdir(source_assets_path):
                print(f" - File: {filename} :", end="")
                src_file = os.path.join(source_assets_path, filename)
                dst_file = os.path.join(dest_assets_path, filename)
                dst_check_file = dst_file
                # If it's a .heic file, the destination check must be on the .jpeg file
                if filename.endswith('.heic'):
                    print(" HEIC file, checked against JPEG equivalent:", end="")
                    dst_check_file = re.sub(r'\.heic$', '.jpeg', dst_file)
                # Copy if the destination doesn't exist or the source is newer
                if not os.path.exists(dst_check_file):
                    print(" New file.")
                    shutil.copy2(src_file, dst_file)
                elif os.path.getmtime(src_file) > os.path.getmtime(dst_check_file):
                    print(" Updated file.")
                    os.remove(dst_check_file)  # Remove first to clear old xattrs
                    shutil.copy2(src_file, dst_file)
                else:
                    print(" Unchanged file.")
            continue
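        # Illustrative run (hypothetical files): photo.heic is copied only when
        # _assets/photo.heic is newer than the already-converted photo.jpeg in the
        # destination; banner.png is compared against banner.png directly.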
        # Else this is a regular directory: process the markdown files
        for file in files:
            if file.endswith('.md'):
                file_path = os.path.join(root, file)
                relative_file_path = os.path.relpath(file_path, source_directory)
                print()
                print(f" Processing file: {relative_file_path} ...")
                process_file(file_path, relative_file_path, destination_directory, site_aliases_dict, stats_dict)


if __name__ == "__main__":
    # Parse the command line options
    import argparse
    parser = argparse.ArgumentParser(description='Preprocess Obsidian markdown files for Hugo')
    parser.add_argument('-k', '--keep', action='store_true', help='Keep the destination directory')
    parser.add_argument('-i', action='store_true', help='Process images in the destination directory')
    parser.add_argument('-lhs', action='store_true', help='Start a Local Hugo Server after processing')
    args = parser.parse_args()
    # Load the config
    config_file = "./obsigo.yaml"
    if not os.path.exists(config_file):
        print(f"Configuration file '{config_file}' does not exist.")
        exit(1)
    with open(config_file, 'r') as file:
        config = yaml.safe_load(file)
    source_directory = config['source_directory']
    destination_directory = config['destination_directory']
    src_redirects_base_file = config['src_redirects_base_file']
    dest_redirects_file = config['dest_redirects_file']
    unimportant_frontmatter_keys = config['unimportant_frontmatter_keys']
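    # A minimal obsigo.yaml sketch (illustrative values only; the five keys are the
    # ones read above, but the paths and the key names in the list are hypothetical):
    #
    #   source_directory: /Users/me/ObsidianVault/blog
    #   destination_directory: ../my-hugo-site/content
    #   src_redirects_base_file: ./static/_redirects_base.txt
    #   dest_redirects_file: ../my-hugo-site/static/_redirects
    #   unimportant_frontmatter_keys:
    #     - created
    #     - modified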
with open("./obsigo_dest.txt", "w") as f:
f.write(destination_directory)
site_aliases_dict = {}
# Check if teh source directory exists
if not os.path.exists(source_directory):
print(f"Source directory '{source_directory}' does not exist.")
exit(1)
# Ensure the destination directory is empty
if os.path.exists(destination_directory):
# Check if we got the -k or --keep argument
if args.keep:
print(f"Keeping destination directory: '{destination_directory}'.")
else:
print(f"Emptying destination directory: '{destination_directory}'.")
shutil.rmtree(destination_directory)
else:
print(f"Creating destination directory: '{destination_directory}'.")
os.makedirs(destination_directory)
    stats_dict = {
        'source_md_files': 0,
        'frontmatter_source_cleanups': 0,
        'youtube_links_converted': 0,
        'aliases_collected': 0,
        'slugs_collected': 0,
        'index_md_files_renamed': 0,    # index.md files renamed to slug.md
        'divergent_slugs_fixed': 0,     # Old slug becomes an alias and filename becomes new slug
        'missing_slugs_fixed': 0,       # Missing slug added to source
        'foreverlinks_collected': 0,
        'foreverlinks_conflicts_detected': 0,
        'links_removed_index.md': 0,
        'links_removed_duplicate_filename.md': 0
    }
    process_directory(source_directory, destination_directory, site_aliases_dict, stats_dict)
    # Display the aliases dictionary in alphabetical order:
    site_aliases_dict = dict(sorted(site_aliases_dict.items()))
    print("\nAliases dictionary:")
    # for alias, uri in site_aliases_dict.items():
    #     print(f" {alias} -> {uri}")
    print(f" Total aliases: {len(site_aliases_dict)}")
    # Write the aliases to a Netlify _redirects file:
    try:
        with open(dest_redirects_file, 'w') as f:
            for alias, uri in site_aliases_dict.items():
                f.write(f"*/{alias} {uri} 301\n")
            # Append the contents of the base redirects file (e.g. ./static/_redirects_base.txt):
            if os.path.exists(src_redirects_base_file):
                print(f" Adding contents of {src_redirects_base_file} to {dest_redirects_file}")
                with open(src_redirects_base_file, 'r') as base_file:
                    f.write(base_file.read())
    except Exception as e:
        print(f"ERROR writing redirects file: {e}")
    print("\nDone.")
    # Display the stats:
    print("\nStats:")
    for key, value in stats_dict.items():
        print(f" {key}: {value}")