Skip to content

Commit

Permalink
Multiple updates to version update 0.0.8 #54
Browse files Browse the repository at this point in the history
Associated issue number and/or pull request reference
Fixes #50, #51, #52

Proposed solution to the issue
Multiple updates were made together because the issues could not be resolved independently of one another. This resulted in more changes than originally expected.

More information and references
Version updated to 0.0.8
  • Loading branch information
jshinm authored Dec 31, 2022
2 parents 0460c98 + 38a3a4e commit 809c0bd
Show file tree
Hide file tree
Showing 8 changed files with 557 additions and 501 deletions.
513 changes: 262 additions & 251 deletions examples/bib_example.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pandarize/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.7"
__version__ = "0.0.8"
266 changes: 34 additions & 232 deletions pandarize/_util.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,5 @@
import requests
import pandas as pd
from datetime import datetime
from pylatexenc.latex2text import LatexNodes2Text
import re
import os

def source_loader(source, savefile):
    '''Loads raw text from a URL or a local file path.

    Args:
        source (str): URL or local file path of the bib source
        savefile (str): optional path to write a copy of the raw text

    Returns:
        str: decoded raw text of the source

    Raises:
        Exception: when the source cannot be read or decoded
    '''
    r = None  # keep defined even if the local read below fails

    if check_url(string=source):
        r = requests.get(url=source)
        r = r.content
    else:
        try:
            with open(source, 'r', encoding='UTF-8', newline='') as f:
                r = f.read()
        except Exception as e:
            # previously `r` stayed undefined here, causing a confusing
            # NameError below instead of the parse error
            print('Error while reading from local file')

    if isinstance(r, bytes):
        raw = r.decode('utf-8')
    elif isinstance(r, str):
        raw = r
    else:
        raise Exception('The source cannot be parsed')

    if savefile:
        folder, files = os.path.split(savefile)
        # guard `folder` — os.mkdir('') raises when savefile has no directory part
        if folder and not os.path.exists(path=folder):
            os.mkdir(path=folder)

        with open(savefile, 'w', encoding='UTF-8', newline='') as f:
            f.write(raw)

    return raw

def rfindall(string, pattern):
'''Find index of all occurrence of the pattern'''
Expand Down Expand Up @@ -64,93 +32,9 @@ def rfindall_matched(string, pattern, key):
match_index.append(match.start() + match.group().rfind(key))
return match_index

def bib_preprocessing(raw):
    '''Pre-processes a raw bib string for parsing.

    Strips CR/LF characters and collapses runs of spaces to a single
    space so the parser can scan the text as one continuous line.

    Args:
        raw (str): raw bib file contents

    Returns:
        str: flattened, whitespace-normalized string
    '''
    flattened = raw.replace('\n', '').replace('\r', '')
    return re.sub(' +', ' ', flattened)

def bib_parser(raw, idxkey):
    '''Main bib parsing logic.

    Scans the pre-processed (single-line) bib string character by character,
    splitting each entry into 'header' and 'key = value' fragments, which are
    itemized into dicts and assembled into a DataFrame.

    Args:
        raw (str): pre-processed bib string (no linebreaks, contracted spaces)
        idxkey: indices of '@' characters that start true entries; '@' at
            other positions (e.g. inside email addresses) is skipped

    Returns:
        pandas.DataFrame: one row per bib entry, post-processed
    '''
    all_lst = []    # accumulated per-entry dicts
    lst = []        # fragments of the entry currently being scanned
    start = None    # True right after an entry header '@type{alias,' opens
    standby = None  # True after '}' — the next ',' may close a key:value pair

    for i, c in enumerate(raw):
        if c == '@':
            if not i in idxkey: #skip if not true start
                continue

            if lst:
                # fixes cases when extra comma is added to the last key:value item
                fix = raw[curr_idx:last_pair-2] + raw[last_pair-2:last_pair+1].replace(',', '')
                lst.append(fix) #edge case for last key:value pair
                all_lst.append(_itemize_bib(lst))
                lst = []
            curr_idx = i
            start = True
        elif c == ',' and start:
            # end of the '@type{alias,' header fragment
            lst.append(raw[curr_idx:i+1])
            start = False
            curr_idx = i+1
        elif c == '}' and i != len(raw)-1:
            last_pair = i #catches last pair and saves position as index
            standby = True
        elif c == ',' and standby:
            # second check to account for misused bracket edge cases
            # e.g., author = {A and B and C and {D} and F}
            standby = False

            # look ahead: a '=' before the next '}' means this ',' really
            # ended a key:value pair rather than a nested brace group
            for check_i in raw[i+1:]:
                if check_i == '}':
                    break
                elif check_i == '=':
                    if raw[curr_idx:i+1]:
                        lst.append(raw[curr_idx:i+1]) #remove linebreak
                    curr_idx = i+1
                else:
                    break
        elif i == len(raw)-1:
            # end of input: flush the final fragment and entry
            lst.append(raw[curr_idx:i+1])
            all_lst.append(_itemize_bib(lst))
        elif c == ' ':
            pass
        else:
            standby = False

    df = pd.DataFrame(all_lst)
    df = postprocessing(df)

    return df

def _itemize_bib(lst):
    '''Itemizes bib structured string fragments into a json-like dict.

    Args:
        lst (list[str]): fragments of a single bib entry; the first element
            holds the '@type{alias,' header, the rest are 'key = value,' pairs

    Returns:
        dict: entry fields keyed by bib key, plus 'type' and 'alias'
    '''
    dic = {}

    for i, s in enumerate(lst):
        if i == 0:
            # header fragment: '@type{alias,'
            ii = s.rfind('@')
            jj = s.rfind('{')
            kk = s.rfind(',')
            dic['type'] = s[ii:jj].replace('@', '')
            dic['alias'] = s[jj:kk].replace('{', '')
        else:
            if s:
                # split at the first '=' into key and LaTeX-encoded value
                ii = sorted(rfindall(s, '='))[0]
                if s[-1] == ',':
                    s = s[:-1]
                out = LatexNodes2Text().latex_to_text(s[ii+1:]).strip()
                dic[s[:ii].strip()] = out

    # NOTE: removed a trailing loop that built `new_lst` from latex-converted
    # fragments — its result was never used or returned (dead code)
    return dic
def truncate_names(srs):
    '''Truncates names in Pandas series'''
    # placeholder, not yet implemented — presumably the hook for the
    # `truncate_author_list` config option; TODO confirm and implement
    pass

def check_string(string):
'''Screens for misinterpreted strings that interferes parsing (deprecated)'''
Expand Down Expand Up @@ -197,132 +81,50 @@ def manual_drop(raw, keys):

return raw

def postprocessing(df):
    '''Post-process of constructed pandas DataFrame. Runs multiple checks.

    Args:
        df (pandas.DataFrame): parsed bib entries

    Returns:
        pandas.DataFrame: the same frame with author names normalized
    '''
    # Author Name Check for Biber: convert 'First MI Last' to 'Last, First MI'.
    # Guarded so a source with no author field no longer raises KeyError.
    if 'author' in df.columns:
        df['author'] = df['author'].apply(lambda x: convert_names(x))

    return df

def bib_parser_old(raw):
    '''Old bib parsing logic (deprecated and replaced by the new logic).

    Single-pass character state machine over the raw bib text.

    NOTE(review): the state flags (`get_type`, `get_alias`, `get_item`,
    `get_name`, `curr_name`) are only assigned inside branches; input that
    does not begin with '@' can hit a branch before its flag exists and
    raise NameError. Kept as-is since this path is deprecated.

    Args:
        raw (str): raw bib text

    Returns:
        pandas.DataFrame: one row per parsed entry
    '''
    df_out = pd.DataFrame()
    raw = manual_drop(raw, keys=['\n'])
    raw = check_string(raw)
    is_newRow = True

    for i, char in enumerate(raw[:]):

        if char == '@' and is_newRow:
            # start of a new entry: '@type{'
            new_row = {}
            get_type = i+1
        elif char == '{':
            if get_type:
                new_row['type'] = raw[get_type:i].strip()
                get_type = None
                get_alias = i+1 #get the alias
            elif curr_name != None:
                # opening brace of a field value
                get_item = i+1
            else:
                pass
        elif char == '}':
            if get_item:
                # closing brace of a field value
                new_row[curr_name] = raw[get_item:i]
                get_item = None
                curr_name = None
            else:
                # closing brace of the whole entry: flush the row
                df_row = pd.DataFrame.from_dict(new_row, orient='index').T
                df_out = pd.concat([df_out, df_row])
                is_newRow = True
        elif char == '=' and get_name:
            curr_name = raw[get_name:i].strip()
            new_row[curr_name] = None
            get_name = None
        elif char == ',':
            if get_alias:
                new_row['alias'] = raw[get_alias:i]
                get_alias = None
                is_newRow = False
            elif curr_name:
                continue #edge case to handle comma (,) in the content
            get_name = i+1
        else:
            pass

    df_out.reset_index(drop=True, inplace=True)

    return df_out

def check_names(string, sep, connector):
    '''Checks for valid author names.

    Args:
        string (str): author name string to inspect
        sep (str): separator between unconverted names
        connector (str): padded connector used by converted names

    Returns:
        bool: True when the string looks already converted (contains the
            connector) or contains the separator, which signals a prior
            conversion or a formatting issue; False otherwise
    '''
    # (removed a stale duplicate of the pre-update signature left by the diff)
    if connector in string:
        return True

    # skip in case at least one name is already converted
    # or there's misformatting issue
    if sep in string:
        return True

    return False

def convert_names(string, sep=',', connector='and'):
    """Convert First MI Last names to Last, First MI format.

    Args:
        string (str): parsed string that contains names with (name)(sep)(name) format
        sep (str, optional): original string separator between names. Defaults to ','.
        connector (str, optional): new name connector that will connect converted names. Defaults to 'and'.

    Returns:
        str: converted names connected by `connector`
    """
    # (reconstructed: the diff left the old docstring and old loop body
    # interleaved with the new try/except version — only the new version kept)

    padded_connector = f' {connector} '

    # strings that look already converted (or malformed) pass through untouched
    if check_names(string, sep=sep, connector=padded_connector):
        return string

    names = ''
    lst = string.split(sep)

    for i, nms in enumerate(lst):
        try:
            nm = nms.strip().split(' ')
            names += f'{nm[-1]}, {nm[0]}'
            if len(nm) > 2:
                # middle names are abbreviated to upper-case initials
                for mname in nm[1:-1]:
                    names += f' {mname[0].upper()}.'
            if i+1 != len(lst):
                names += f'{padded_connector}'
        except Exception as e:
            # best-effort: report the bad name but keep converting the rest
            print(f'{e} for {nms} at {i}th index')

    # conditional here for truncate author list

    return names

def bib_writer(df, types, alias, dirs):
    '''bib writer and formatter that converts pandas
    dataframe into a bib file

    Args:
        df (pandas.DataFrame): parsed bib entries, one row per entry
        types (str): name of the column holding the entry type
        alias (str): name of the column holding the citation key
        dirs (str): output directory prefix (expects trailing separator)
            where `output.bib` is written
    '''

    def parse(row, types=types, alias=alias):
        '''Formats a single row as one bib entry string.'''
        items = []
        header = ''     # defined up-front so missing columns can't NameError
        alias_out = ''

        for i, (idx, item) in enumerate(zip(row.index, row)):
            if pd.isnull(item) or item == '':
                continue
            item = str(item)
            if idx == types:
                header = f'@{item}' + '{'
            elif idx == alias:
                # FIX: previously this rebound the `alias` parameter itself,
                # so later `idx == alias` comparisons used the wrong value
                alias_out = item + ',\n'
            else:
                item_i = f'\t{idx} = ' + '{' + f'{item}' + '},\n'
                items.append(item_i)

        out_text = header + alias_out
        for entry in items:
            out_text += entry
        out_text = out_text[:-2] #remove last comma
        out_text += '\n},\n'

        return out_text

    N = df.shape[0]

    # Add stamper before the first header
    out = stamper(target='bib')

    for i in range(N):
        if i == N-1: #remove the very last comma
            out += parse(df.iloc[i,:])[:-3] + parse(df.iloc[i,:])[-3:].replace(',', '') + '\n'
        else:
            out += parse(df.iloc[i,:]) + '\n'

    if not os.path.exists(path=dirs):
        os.mkdir(path=dirs)

    with open(f'{dirs}output.bib', 'w', encoding='utf-8') as f:
        f.write(out)
5 changes: 5 additions & 0 deletions pandarize/config/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Default parser/writer options for bib sources; loaded by initialize()
bib:
  - convert_names: True #change name format to Last, First MI
  - remove_html: False #TODO: remove html tags
  - remove_empty_entries: True #empty entries are removed after `transform`
  - truncate_author_list: False #shorten names in the list by only stating the first person followed by optional `suffix`
42 changes: 30 additions & 12 deletions pandarize/frame.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,45 @@
import pandas as pd
from pandarize._util import *
from ._util import *
from .loader import Loader
from .parser import Parser

class Pandarizer(Loader, Parser):
    '''High-level interface that loads, parses, and transforms bib sources.

    Composes the Loader and Parser mixins; state such as the raw text,
    the parsed DataFrame (`self.df`), and the config presumably lives on
    those bases — the old explicit `__init__` was removed in this update.
    (Reconstructed: the diff left the pre-update class header, `__init__`,
    and method bodies interleaved with the new ones.)
    '''

    def initialize(self, yaml=False, path=None):
        '''Initializes the setting either for the first time by
        loading a default yaml config file in system dir or
        from a user-specified existing file in `path`
        '''
        self.load_config(yaml=yaml, path=path)

    def load(self, source=None, savefile=None):
        '''Loads raw data from either local file or the url
        '''
        self.source_loader(source=source, savefile=savefile)
        self.bib_preprocessing()

    def fit(self, kind='bib', postprocess=False):
        '''Method that infers data structure (in the future)
        '''
        if kind == 'bib':
            self.bib_parser(postprocess=postprocess)

    def transform(self, formats='bib', types=None, alias=None, dirs=None):
        '''Transform loaded data into a specified data type
        '''
        if formats == 'bib':
            self.bib_writer(types=types, alias=alias, dirs=dirs)

    def describe(self):
        '''Generates basic metadata'''

        if self.df is None:
            print('No file is loaded. Please load() and fit() to create metadata.')
            return

        if self.df.shape[0] == 0 or self.df.shape[1] == 0:
            print('The file has not been loaded successfully. Please check the file path and/or make sure that file is not corrupted.')
            return

        print(f'''The loaded file has {self.df.shape[0]} rows and {self.df.shape[1]} columns.\n
''')
Loading

0 comments on commit 809c0bd

Please sign in to comment.