Skip to content

Commit

Permalink
Multiple updates to version update 0.0.8 #54
Browse files Browse the repository at this point in the history
Associated issue number and/or pull request reference
Fixes #50, #51, #52

Proposed solution to the issue
Multiple updates were made together because the issues could not be resolved independently of one another. This resulted in more changes than originally expected.

More information and references
Version updated to 0.0.8
  • Loading branch information
jshinm authored Dec 31, 2022
2 parents 0460c98 + 38a3a4e commit 809c0bd
Show file tree
Hide file tree
Showing 8 changed files with 557 additions and 501 deletions.
513 changes: 262 additions & 251 deletions examples/bib_example.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pandarize/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.7"
__version__ = "0.0.8"
266 changes: 34 additions & 232 deletions pandarize/_util.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,5 @@
import requests
import pandas as pd
from datetime import datetime
from pylatexenc.latex2text import LatexNodes2Text
import re
import os

def source_loader(source, savefile):
    '''Loads raw text from a URL or a local file path.

    Args:
        source (str): URL or local file path of the bib source
        savefile (str): optional path to write a copy of the raw text

    Returns:
        str: decoded raw text of the source

    Raises:
        Exception: when the source cannot be read or decoded
    '''
    r = None  # keep defined even if the local read below fails

    if check_url(string=source):
        r = requests.get(url=source)
        r = r.content
    else:
        try:
            with open(source, 'r', encoding='UTF-8', newline='') as f:
                r = f.read()
        except Exception as e:
            # previously `r` stayed undefined here, causing a confusing
            # NameError below instead of the parse error
            print('Error while reading from local file')

    if isinstance(r, bytes):
        raw = r.decode('utf-8')
    elif isinstance(r, str):
        raw = r
    else:
        raise Exception('The source cannot be parsed')

    if savefile:
        folder, files = os.path.split(savefile)
        # guard `folder` — os.mkdir('') raises when savefile has no directory part
        if folder and not os.path.exists(path=folder):
            os.mkdir(path=folder)

        with open(savefile, 'w', encoding='UTF-8', newline='') as f:
            f.write(raw)

    return raw

def rfindall(string, pattern):
'''Find index of all occurrence of the pattern'''
Expand Down Expand Up @@ -64,93 +32,9 @@ def rfindall_matched(string, pattern, key):
match_index.append(match.start() + match.group().rfind(key))
return match_index

def bib_preprocessing(raw):
    '''Pre-processes a raw bib string for parsing.

    Strips CR/LF characters and collapses runs of spaces to a single
    space so the parser can scan the text as one continuous line.

    Args:
        raw (str): raw bib file contents

    Returns:
        str: flattened, whitespace-normalized string
    '''
    flattened = raw.replace('\n', '').replace('\r', '')
    return re.sub(' +', ' ', flattened)

def bib_parser(raw, idxkey):
    '''Main bib parsing logic.

    Scans the pre-processed (single-line) bib string character by character,
    splitting each entry into 'header' and 'key = value' fragments, which are
    itemized into dicts and assembled into a DataFrame.

    Args:
        raw (str): pre-processed bib string (no linebreaks, contracted spaces)
        idxkey: indices of '@' characters that start true entries; '@' at
            other positions (e.g. inside email addresses) is skipped

    Returns:
        pandas.DataFrame: one row per bib entry, post-processed
    '''
    all_lst = []    # accumulated per-entry dicts
    lst = []        # fragments of the entry currently being scanned
    start = None    # True right after an entry header '@type{alias,' opens
    standby = None  # True after '}' — the next ',' may close a key:value pair

    for i, c in enumerate(raw):
        if c == '@':
            if not i in idxkey: #skip if not true start
                continue

            if lst:
                # fixes cases when extra comma is added to the last key:value item
                fix = raw[curr_idx:last_pair-2] + raw[last_pair-2:last_pair+1].replace(',', '')
                lst.append(fix) #edge case for last key:value pair
                all_lst.append(_itemize_bib(lst))
                lst = []
            curr_idx = i
            start = True
        elif c == ',' and start:
            # end of the '@type{alias,' header fragment
            lst.append(raw[curr_idx:i+1])
            start = False
            curr_idx = i+1
        elif c == '}' and i != len(raw)-1:
            last_pair = i #catches last pair and saves position as index
            standby = True
        elif c == ',' and standby:
            # second check to account for misused bracket edge cases
            # e.g., author = {A and B and C and {D} and F}
            standby = False

            # look ahead: a '=' before the next '}' means this ',' really
            # ended a key:value pair rather than a nested brace group
            for check_i in raw[i+1:]:
                if check_i == '}':
                    break
                elif check_i == '=':
                    if raw[curr_idx:i+1]:
                        lst.append(raw[curr_idx:i+1]) #remove linebreak
                    curr_idx = i+1
                else:
                    break
        elif i == len(raw)-1:
            # end of input: flush the final fragment and entry
            lst.append(raw[curr_idx:i+1])
            all_lst.append(_itemize_bib(lst))
        elif c == ' ':
            pass
        else:
            standby = False

    df = pd.DataFrame(all_lst)
    df = postprocessing(df)

    return df

def _itemize_bib(lst):
    '''Itemizes bib structured string fragments into a json-like dict.

    Args:
        lst (list[str]): fragments of a single bib entry; the first element
            holds the '@type{alias,' header, the rest are 'key = value,' pairs

    Returns:
        dict: entry fields keyed by bib key, plus 'type' and 'alias'
    '''
    dic = {}

    for i, s in enumerate(lst):
        if i == 0:
            # header fragment: '@type{alias,'
            ii = s.rfind('@')
            jj = s.rfind('{')
            kk = s.rfind(',')
            dic['type'] = s[ii:jj].replace('@', '')
            dic['alias'] = s[jj:kk].replace('{', '')
        else:
            if s:
                # split at the first '=' into key and LaTeX-encoded value
                ii = sorted(rfindall(s, '='))[0]
                if s[-1] == ',':
                    s = s[:-1]
                out = LatexNodes2Text().latex_to_text(s[ii+1:]).strip()
                dic[s[:ii].strip()] = out

    # NOTE: removed a trailing loop that built `new_lst` from latex-converted
    # fragments — its result was never used or returned (dead code)
    return dic
def truncate_names(srs):
    '''Truncates names in Pandas series'''
    # placeholder, not yet implemented — presumably the hook for the
    # `truncate_author_list` config option; TODO confirm and implement
    pass

def check_string(string):
'''Screens for misinterpreted strings that interferes parsing (deprecated)'''
Expand Down Expand Up @@ -197,132 +81,50 @@ def manual_drop(raw, keys):

return raw

def postprocessing(df):
    '''Post-process of constructed pandas DataFrame. Runs multiple checks.

    Args:
        df (pandas.DataFrame): parsed bib entries

    Returns:
        pandas.DataFrame: the same frame with author names normalized
    '''
    # Author Name Check for Biber: convert 'First MI Last' to 'Last, First MI'.
    # Guarded so a source with no author field no longer raises KeyError.
    if 'author' in df.columns:
        df['author'] = df['author'].apply(lambda x: convert_names(x))

    return df

def bib_parser_old(raw):
    '''Old bib parsing logic (deprecated and replaced by the new logic).

    Single-pass character state machine over the raw bib text.

    NOTE(review): the state flags (`get_type`, `get_alias`, `get_item`,
    `get_name`, `curr_name`) are only assigned inside branches; input that
    does not begin with '@' can hit a branch before its flag exists and
    raise NameError. Kept as-is since this path is deprecated.

    Args:
        raw (str): raw bib text

    Returns:
        pandas.DataFrame: one row per parsed entry
    '''
    df_out = pd.DataFrame()
    raw = manual_drop(raw, keys=['\n'])
    raw = check_string(raw)
    is_newRow = True

    for i, char in enumerate(raw[:]):

        if char == '@' and is_newRow:
            # start of a new entry: '@type{'
            new_row = {}
            get_type = i+1
        elif char == '{':
            if get_type:
                new_row['type'] = raw[get_type:i].strip()
                get_type = None
                get_alias = i+1 #get the alias
            elif curr_name != None:
                # opening brace of a field value
                get_item = i+1
            else:
                pass
        elif char == '}':
            if get_item:
                # closing brace of a field value
                new_row[curr_name] = raw[get_item:i]
                get_item = None
                curr_name = None
            else:
                # closing brace of the whole entry: flush the row
                df_row = pd.DataFrame.from_dict(new_row, orient='index').T
                df_out = pd.concat([df_out, df_row])
                is_newRow = True
        elif char == '=' and get_name:
            curr_name = raw[get_name:i].strip()
            new_row[curr_name] = None
            get_name = None
        elif char == ',':
            if get_alias:
                new_row['alias'] = raw[get_alias:i]
                get_alias = None
                is_newRow = False
            elif curr_name:
                continue #edge case to handle comma (,) in the content
            get_name = i+1
        else:
            pass

    df_out.reset_index(drop=True, inplace=True)

    return df_out

def check_names(string, sep, connector):
    '''Checks for valid author names.

    Args:
        string (str): author name string to inspect
        sep (str): separator between unconverted names
        connector (str): padded connector used by converted names

    Returns:
        bool: True when the string looks already converted (contains the
            connector) or contains the separator, which signals a prior
            conversion or a formatting issue; False otherwise
    '''
    # (removed a stale duplicate of the pre-update signature left by the diff)
    if connector in string:
        return True

    # skip in case at least one name is already converted
    # or there's misformatting issue
    if sep in string:
        return True

    return False

def convert_names(string, sep=',', connector='and'):
    """Convert First MI Last names to Last, First MI format.

    Args:
        string (str): parsed string that contains names with (name)(sep)(name) format
        sep (str, optional): original string separator between names. Defaults to ','.
        connector (str, optional): new name connector that will connect converted names. Defaults to 'and'.

    Returns:
        str: converted names connected by `connector`
    """
    # (reconstructed: the diff left the old docstring and old loop body
    # interleaved with the new try/except version — only the new version kept)

    padded_connector = f' {connector} '

    # strings that look already converted (or malformed) pass through untouched
    if check_names(string, sep=sep, connector=padded_connector):
        return string

    names = ''
    lst = string.split(sep)

    for i, nms in enumerate(lst):
        try:
            nm = nms.strip().split(' ')
            names += f'{nm[-1]}, {nm[0]}'
            if len(nm) > 2:
                # middle names are abbreviated to upper-case initials
                for mname in nm[1:-1]:
                    names += f' {mname[0].upper()}.'
            if i+1 != len(lst):
                names += f'{padded_connector}'
        except Exception as e:
            # best-effort: report the bad name but keep converting the rest
            print(f'{e} for {nms} at {i}th index')

    # conditional here for truncate author list

    return names

def bib_writer(df, types, alias, dirs):
    '''bib writer and formatter that converts pandas
    dataframe into a bib file

    Args:
        df (pandas.DataFrame): parsed bib entries, one row per entry
        types (str): name of the column holding the entry type
        alias (str): name of the column holding the citation key
        dirs (str): output directory prefix (expects trailing separator)
            where `output.bib` is written
    '''

    def parse(row, types=types, alias=alias):
        '''Formats a single row as one bib entry string.'''
        items = []
        header = ''     # defined up-front so missing columns can't NameError
        alias_out = ''

        for i, (idx, item) in enumerate(zip(row.index, row)):
            if pd.isnull(item) or item == '':
                continue
            item = str(item)
            if idx == types:
                header = f'@{item}' + '{'
            elif idx == alias:
                # FIX: previously this rebound the `alias` parameter itself,
                # so later `idx == alias` comparisons used the wrong value
                alias_out = item + ',\n'
            else:
                item_i = f'\t{idx} = ' + '{' + f'{item}' + '},\n'
                items.append(item_i)

        out_text = header + alias_out
        for entry in items:
            out_text += entry
        out_text = out_text[:-2] #remove last comma
        out_text += '\n},\n'

        return out_text

    N = df.shape[0]

    # Add stamper before the first header
    out = stamper(target='bib')

    for i in range(N):
        if i == N-1: #remove the very last comma
            out += parse(df.iloc[i,:])[:-3] + parse(df.iloc[i,:])[-3:].replace(',', '') + '\n'
        else:
            out += parse(df.iloc[i,:]) + '\n'

    if not os.path.exists(path=dirs):
        os.mkdir(path=dirs)

    with open(f'{dirs}output.bib', 'w', encoding='utf-8') as f:
        f.write(out)
5 changes: 5 additions & 0 deletions pandarize/config/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Default parser/writer options for bib sources; loaded by initialize()
bib:
  - convert_names: True #change name format to Last, First MI
  - remove_html: False #TODO: remove html tags
  - remove_empty_entries: True #empty entries are removed after `transform`
  - truncate_author_list: False #shorten names in the list by only stating the first person followed by optional `suffix`
42 changes: 30 additions & 12 deletions pandarize/frame.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,45 @@
import pandas as pd
from pandarize._util import *
from ._util import *
from .loader import Loader
from .parser import Parser

class Pandarizer(Loader, Parser):
    '''High-level interface that loads, parses, and transforms bib sources.

    Composes the Loader and Parser mixins; state such as the raw text,
    the parsed DataFrame (`self.df`), and the config presumably lives on
    those bases — the old explicit `__init__` was removed in this update.
    (Reconstructed: the diff left the pre-update class header, `__init__`,
    and method bodies interleaved with the new ones.)
    '''

    def initialize(self, yaml=False, path=None):
        '''Initializes the setting either for the first time by
        loading a default yaml config file in system dir or
        from a user-specified existing file in `path`
        '''
        self.load_config(yaml=yaml, path=path)

    def load(self, source=None, savefile=None):
        '''Loads raw data from either local file or the url
        '''
        self.source_loader(source=source, savefile=savefile)
        self.bib_preprocessing()

    def fit(self, kind='bib', postprocess=False):
        '''Method that infers data structure (in the future)
        '''
        if kind == 'bib':
            self.bib_parser(postprocess=postprocess)

    def transform(self, formats='bib', types=None, alias=None, dirs=None):
        '''Transform loaded data into a specified data type
        '''
        if formats == 'bib':
            self.bib_writer(types=types, alias=alias, dirs=dirs)

    def describe(self):
        '''Generates basic metadata'''

        if self.df is None:
            print('No file is loaded. Please load() and fit() to create metadata.')
            return

        if self.df.shape[0] == 0 or self.df.shape[1] == 0:
            print('The file has not been loaded successfully. Please check the file path and/or make sure that file is not corrupted.')
            return

        print(f'''The loaded file has {self.df.shape[0]} rows and {self.df.shape[1]} columns.\n
''')
Loading

0 comments on commit 809c0bd

Please sign in to comment.