get_article_info.py
import json
import os
import re
article_nodes = {}  # holds all the information about all the articles
names_dict = {}     # mapping of article ID to article name
ignored_dirs = []   # folders that we want to ignore in this analysis

# the directory that holds the articles, relative to the location of the code
directory = "2022-09-15T19_17-export/worlds/Telluriam/articles"
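# For reference, each article file is assumed to look roughly like the
# hypothetical shape below (inferred from the fields accessed in
# get_articles(); the real export likely contains many more keys):
#   {
#     "id": "abc123",
#     "title": "Some Article",
#     "wordcount": 512,
#     "content": "... <span data-article-id=\"def456\">...</span> ..."
#   }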
def get_articles(name):
    for article in os.scandir(name):
        article_path = article.path
        if article.is_dir():  # if the current entry is another folder
            if article.name not in ignored_dirs:  # filter out ignored folders
                get_articles(article_path)  # recurse on the sub-folder
        elif article_path.endswith(".json") and not article_path.endswith("metadata.json"):  # filter non-articles
            with open(article_path, "r", encoding="utf8") as f:
                current_article = json.load(f)  # parse the JSON into a dictionary
            current_article_txt = str(current_article)  # flatten to a string so we can regex-scan every field
            article_name = current_article["title"]  # extract the name of the article
            article_id = current_article["id"]  # extract the id of the article
            linked_ids = re.findall(r'data-article-id="(.*?)"', current_article_txt)  # scan text for links
            rel_path = os.path.relpath(name, directory)  # current folder relative to the articles root
            article_nodes[article_id] = {
                "name": article_name,
                "word_count": current_article["wordcount"],
                "links": linked_ids,
                "directory": "" if rel_path == "." else rel_path.split(os.sep)[0],
            }  # save all the information we need
            names_dict[article_id] = article_name  # save the name of the article
get_articles(directory) # call the function on the desired directory
# save the dictionaries to text files
with open("article_nodes.txt", "w") as f:
    json.dump(article_nodes, f)
with open("names_dict.txt", "w") as f:
    json.dump(names_dict, f)
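
# A minimal sketch of consuming the dumped data afterwards (assumes the
# script above has already run and produced "article_nodes.txt"):
with open("article_nodes.txt", "r") as f:
    reloaded = json.load(f)
for node_id, info in reloaded.items():
    print(f'{info["name"]}: {info["word_count"]} words, {len(info["links"])} links')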