Skip to content

Commit

Permalink
Check for an arXiv ID before trying to get the local title
Browse files (browse the repository at this point in the history)
  • Loading branch information
WissamAntoun committed Jun 30, 2020
1 parent ed3a534 commit d9fd308
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 30 deletions.
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"python.pythonPath": "C:\\Users\\WISSAM-PC\\.conda\\envs\\thesis\\python.exe"
}
60 changes: 30 additions & 30 deletions rename_pdf_files_from_content_title_or_arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import re
import glob
import pdftitle
from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm
import arxiv
import random
#%%
Expand All @@ -25,7 +25,7 @@ def get_valid_filename(s):
return re.sub(r'(?u)[^-\w.]', '', s)

#%%
all_files_names = glob.glob("C:\\Users\\WISSAM-PC\\Downloads\\Documents - copy\\*.pdf")
all_files_names = glob.glob("C:\\Users\\WISSAM-PC\\Downloads\\Documents\\*.pdf")
#%%
# research_files = []
# for f in tqdm(all_files_names):
Expand All @@ -39,35 +39,36 @@ def get_valid_filename(s):
# %%
research_files = []
for f in tqdm(all_files_names):
basename = os.path.basename(f)
if "__OLDNAME__" in basename:
if "__OLDNAME__" in f or '__XX__' in f:
continue
basename = os.path.basename(f)
dirname = os.path.dirname(f)
try:
title = pdftitle.get_title_from_file(f)
if len(title) < 3:
continue
title = re.sub(r'[\\/:"*?<>|]+'," ",title)
title = re.sub(r'\s'," ",title)
title = get_valid_filename(title)
new_basename = title + '__OLDNAME__' + basename
new_path = os.path.join(dirname, new_basename)
except:
title = None

if len(re.findall(r'\b\d{4}\.\d{5}',basename)) > 0 :
article_id = basename.strip('.pdf')
entry = arxiv.query(id_list=[article_id])
if len(entry) > 0:
title = entry[0]['title'].replace(":","")
title = re.sub(r'\s'," ",title)
title = get_valid_filename(title)
new_basename = title + '__OLDNAME__' + basename
new_path = os.path.join(dirname, new_basename)

if title==None:
if len(re.findall(r'\b\d{4}\.\d{5}',basename)) > 0 :
article_id = basename.strip('.pdf')
entry = arxiv.query(id_list=[article_id])
if len(entry) > 0:
title = entry[0]['title'].replace(":","")
title = re.sub(r'\s'," ",title)
title = get_valid_filename(title)
new_basename = title + '__OLDNAME__' + basename
new_path = os.path.join(dirname, new_basename)
else:
else:
try:
title = pdftitle.get_title_from_file(f)
if len(title) < 3:
continue
title = re.sub(r'[\\/:"*?<>|]+'," ",title)
title = re.sub(r'\s'," ",title)
title = get_valid_filename(title)
new_basename = title + '__OLDNAME__' + basename
new_path = os.path.join(dirname, new_basename)
except:
title = None
os.rename(f,f[:-4]+'__XX__'+'.pdf')
continue

try:
print(new_path)
os.rename(f, new_path)
Expand All @@ -76,9 +77,8 @@ def get_valid_filename(s):
dirname = os.path.dirname(new_path)
os.rename(f,os.path.join(dirname, new_basename[:-4]+str(random.randint(0,999))+'.pdf'))
except:
os.rename(f,f[:-4]+'__XX__'+'.pdf')
print('skipping')




# %%
# %%
1 change: 1 addition & 0 deletions run_pdf_renamer.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"C:\Users\WISSAM-PC\.conda\envs\thesis\python.exe" rename_pdf_files_from_content_title_or_arxiv.py

0 comments on commit d9fd308

Please sign in to comment.