From 0ab4e236da8c7c63198c9e5a87d2e5e44c18b856 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sun, 25 Dec 2022 06:29:07 -0500 Subject: [PATCH 01/31] Re-run using `0.0.7` (#46) --- examples/bib_example.ipynb | 444 ++++++++++++++++++------------------- 1 file changed, 211 insertions(+), 233 deletions(-) diff --git a/examples/bib_example.ipynb b/examples/bib_example.ipynb index b86222a..d72479e 100644 --- a/examples/bib_example.ipynb +++ b/examples/bib_example.ipynb @@ -22,7 +22,7 @@ { "data": { "text/plain": [ - "'0.0.4'" + "'0.0.7'" ] }, "execution_count": 1, @@ -49,7 +49,7 @@ "metadata": {}, "outputs": [], "source": [ - "url = 'https://raw.githubusercontent.com/neurodata/neurodata.io/deploy/content/bibs/people.bib'" + "url = 'https://raw.githubusercontent.com/Qinka/random-bib/master/references-1a.bib'" ] }, { @@ -91,175 +91,218 @@ " \n", " type\n", " alias\n", - " author\n", - " usera\n", - " userw\n", - " month\n", + " title\n", " year\n", + " journal\n", + " author\n", " number\n", - " series\n", - " abstract\n", - " userb\n", - " userc\n", - " userd\n", - " keywords\n", - " doi\n", - " note\n", + " volume\n", " url\n", - " usere\n", - " file\n", + " doi\n", + " ...\n", + " arxivId\n", + " editor\n", + " publisher\n", + " address\n", + " month\n", + " pmid\n", + " institution\n", + " series\n", + " translator\n", + " edition\n", " \n", " \n", " \n", " \n", " 0\n", - " incollection\n", - " jovo\n", - " Joshua Vogelstein\n", - " Director\n", - " Chief Unity Ninja\n", - " \n", - " \n", - " --\n", - " \n", - " \n", - " \n", - " BME, JHU\n", - " director\n", - " director\n", - " jovo\n", - " jovo@jhu.edu\n", - " neurodata.io/about/jovo\n", - " safe-zone\n", - " vogelstein_joshua.jpg\n", + " article\n", + " Klein2012\n", + " 101 Labeled Brain Images and a Consistent Huma...\n", + " 2012\n", + " Frontiers in Neuroscience\n", + " Klein, Arno and Tourville, Jason\n", + " DEC\n", + " 6\n", + " http://journal.frontiersin.org/article/10.3389...\n", + " 10.3389/fnins.2012.00171\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 1\n", - " incollection\n", - " tylertomita\n", - " Tyler Tomita\n", - " Postdoctoral Fellow\n", - " \n", - " 8\n", - " 2014\n", - " 08/14 --\n", - " \n", - " Developed Sparse Projection Oblique Randomer F...\n", - " MSE\n", - " BME, JHU\n", - " postdoc\n", - " postdoc\n", - " ttomita\n", - " \n", - " \n", - " \n", - " tomita_tyler.jpg\n", + " inproceedings\n", + " Zhong2017b\n", + " 3D alpha matting based co-segmentation of tumo...\n", + " 2017\n", + " NaN\n", + " Zhong, Zisha and Kim, Yusung and Buatti, John ...\n", + " NaN\n", + " 10555 LNCS\n", + " NaN\n", + " 10.1007/978-3-319-67564-0_4\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 2\n", - " incollection\n", - " jongshin\n", - " Jong Shin\n", - " Software Engineer\n", - " Tech Support\n", - " 9\n", - " 2020\n", - " 09/20 --\n", - " \n", - " Currently investigating the effect of inductiv...\n", - " MS\n", - " BME, JHU\n", - " staff\n", - " staffresearch\n", - " jshinm\n", - " jshin69@jhu.edu\n", - " \n", - " safe-zone\n", - " jong_shin.png\n", + " article\n", + " Khvostikov2018\n", + " 3D CNN-based classification using sMRI and MD-...\n", + " 2018\n", + " NaN\n", + " Khvostikov, Alexander and Aderghal, Karim and ...\n", + " NaN\n", + " NaN\n", + " http://arxiv.org/abs/1801.05968\n", + " NaN\n", + " ...\n", + " 
1801.05968\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 3\n", - " incollection\n", - " aligeisa\n", - " Ali Geisa\n", - " Research Assistant\n", - " \n", - " 3\n", - " 2020\n", - " 03/20 --\n", - " \n", - " Researching progressive and lifelong learning ...\n", - " MS\n", - " BME, JHU\n", - " faculty - research\n", - " staffresearch\n", - " \n", - " \n", - " \n", - " \n", - " ali_geisa.jpg\n", + " inproceedings\n", + " Dou2016\n", + " 3D deeply supervised network for automatic liv...\n", + " 2016\n", + " NaN\n", + " Dou, Qi and Chen, Hao and Jin, Yueming and Yu,...\n", + " NaN\n", + " 9901 LNCS\n", + " NaN\n", + " 10.1007/978-3-319-46723-8_18\n", + " ...\n", + " 1607.00582\n", + " Ourselin, Sebastien and Joskowicz, Leo and Sab...\n", + " Springer International Publishing\n", + " Cham\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 4\n", - " incollection\n", - " kareefullah\n", - " Kareef Ullah\n", - " Undergraduate Researcher\n", - " Cup Stacker\n", - " 9\n", - " 2021\n", - " 09/21 --\n", - " \n", - " Assisted with fixing issues in graspologic and...\n", - " \n", - " BME, JHU\n", - " undergrad\n", - " undergrad\n", - " \n", - " kullah2@jhu.edu\n", - " \n", - " \n", - " kareef_ullah.jpg\n", + " inproceedings\n", + " Zhong2018\n", + " 3D fully convolutional networks for co-segment...\n", + " 2018\n", + " NaN\n", + " Zhong, Z and Kim, Y and Zhou, L and Plichta, K...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 10.1109/ISBI.2018.8363561\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 4\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", "\n", + "

5 rows × 25 columns

\n", "" ], "text/plain": [ - " type alias author usera \\\n", - "0 incollection jovo Joshua Vogelstein Director \n", - "1 incollection tylertomita Tyler Tomita Postdoctoral Fellow \n", - "2 incollection jongshin Jong Shin Software Engineer \n", - "3 incollection aligeisa Ali Geisa Research Assistant \n", - "4 incollection kareefullah Kareef Ullah Undergraduate Researcher \n", + " type alias \\\n", + "0 article Klein2012 \n", + "1 inproceedings Zhong2017b \n", + "2 article Khvostikov2018 \n", + "3 inproceedings Dou2016 \n", + "4 inproceedings Zhong2018 \n", "\n", - " userw month year number series \\\n", - "0 Chief Unity Ninja -- \n", - "1 8 2014 08/14 -- \n", - "2 Tech Support 9 2020 09/20 -- \n", - "3 3 2020 03/20 -- \n", - "4 Cup Stacker 9 2021 09/21 -- \n", + " title year \\\n", + "0 101 Labeled Brain Images and a Consistent Huma... 2012 \n", + "1 3D alpha matting based co-segmentation of tumo... 2017 \n", + "2 3D CNN-based classification using sMRI and MD-... 2018 \n", + "3 3D deeply supervised network for automatic liv... 2016 \n", + "4 3D fully convolutional networks for co-segment... 2018 \n", "\n", - " abstract userb userc \\\n", - "0 BME, JHU \n", - "1 Developed Sparse Projection Oblique Randomer F... MSE BME, JHU \n", - "2 Currently investigating the effect of inductiv... MS BME, JHU \n", - "3 Researching progressive and lifelong learning ... MS BME, JHU \n", - "4 Assisted with fixing issues in graspologic and... BME, JHU \n", + " journal \\\n", + "0 Frontiers in Neuroscience \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", "\n", - " userd keywords doi note \\\n", - "0 director director jovo jovo@jhu.edu \n", - "1 postdoc postdoc ttomita \n", - "2 staff staffresearch jshinm jshin69@jhu.edu \n", - "3 faculty - research staffresearch \n", - "4 undergrad undergrad kullah2@jhu.edu \n", + " author number volume \\\n", + "0 Klein, Arno and Tourville, Jason DEC 6 \n", + "1 Zhong, Zisha and Kim, Yusung and Buatti, John ... NaN 10555 LNCS \n", + "2 Khvostikov, Alexander and Aderghal, Karim and ... NaN NaN \n", + "3 Dou, Qi and Chen, Hao and Jin, Yueming and Yu,... NaN 9901 LNCS \n", + "4 Zhong, Z and Kim, Y and Zhou, L and Plichta, K... NaN NaN \n", "\n", - " url usere file \n", - "0 neurodata.io/about/jovo safe-zone vogelstein_joshua.jpg \n", - "1 tomita_tyler.jpg \n", - "2 safe-zone jong_shin.png \n", - "3 ali_geisa.jpg \n", - "4 kareef_ullah.jpg " + " url \\\n", + "0 http://journal.frontiersin.org/article/10.3389... \n", + "1 NaN \n", + "2 http://arxiv.org/abs/1801.05968 \n", + "3 NaN \n", + "4 NaN \n", + "\n", + " doi ... arxivId \\\n", + "0 10.3389/fnins.2012.00171 ... NaN \n", + "1 10.1007/978-3-319-67564-0_4 ... NaN \n", + "2 NaN ... 1801.05968 \n", + "3 10.1007/978-3-319-46723-8_18 ... 1607.00582 \n", + "4 10.1109/ISBI.2018.8363561 ... NaN \n", + "\n", + " editor \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 Ourselin, Sebastien and Joskowicz, Leo and Sab... 
\n", + "4 NaN \n", + "\n", + " publisher address month pmid institution series \\\n", + "0 NaN NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN NaN \n", + "3 Springer International Publishing Cham NaN NaN NaN NaN \n", + "4 NaN NaN 4 NaN NaN NaN \n", + "\n", + " translator edition \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + "[5 rows x 25 columns]" ] }, "execution_count": 4, @@ -284,7 +327,6 @@ "metadata": {}, "outputs": [], "source": [ - "# url_beta = 'https://raw.githubusercontent.com/neurodata/neurodata.io/deploy/content/bibs/talks.bib'\n", "url_beta = 'https://raw.githubusercontent.com/neurodata/neurodata.io/deploy/content/bibs/press.bib'" ] }, @@ -305,7 +347,7 @@ { "data": { "text/plain": [ - "'@misc{Prazosin2020,\\n year={2020},\\n title={Prazosin Might Be A Treatment for COVID-19. More Data Is'" + "'@misc{Prazosin2020, year={2020}, title={Prazosin Might Be A Treatment for COVID-19. More Data Is Urg'" ] }, "execution_count": 7, @@ -341,7 +383,7 @@ { "data": { "text/plain": [ - "'@misc{Prazosin2020,\\n year={2020},\\n title={Prazosin Might Be A Treatment for COVID-19. More Data Is'" + "'@misc{Prazosin2020, year={2020}, title={Prazosin Might Be A Treatment for COVID-19. More Data Is Urg'" ] }, "execution_count": 9, @@ -530,7 +572,7 @@ { "data": { "text/plain": [ - "\"% ****************************************\\r\\n% * File: SAMPLE.BIB *\\r\\n% ****************************************\\r\\n% * An invented bib file *\\r\\n% * For the sample texts *\\r\\n% * The order is unimportant and there *\\r\\n% * may be more entries than references *\\r\\n% * in the text *\\r\\n% ****************************************\\r\\n% \\r\\n@ARTICLE{smit54,\\r\\n\\tAUTHOR = {J. G. Smith and H. K. Weston},\\r\\n\\tTITLE = {Nothing Particular in this Year's History},\\r\\n\\tYEAR = {1954},\\r\\n\\tJOURNAL = {J. Geophys. Res.},\\r\\n\\tVOLUME = {2},\\r\\n\\tPAGES = {14-15}\\r\\n}\\r\\n@BOOK{colu92,\\r\\n\\tAUTHOR = {Christopher Columbus},\\r\\n\\tTITLE = {How {I} Discovered {America}},\\r\\n\\tYEAR = {1492},\\r\\n\\tPUBLISHER = {Hispanic Press},\\r\\n\\tADDRESS = {Barcelona}\\r\\n}\\r\\n@ARTICLE{gree00,\\r\\n\\tAUTHOR = {R. J. Green and U. P. Fred and W. P. Norbert},\\r\\n\\tTITLE = {Things that Go Bump in the Night},\\r\\n\\tYEAR = {1900},\\r\\n\\tJOURNAL = {Psych. Today},\\r\\n\\tVOLUME = {46},\\r\\n\\tPAGES = {345-678}\\r\\n}\\r\\n@ART\"" + "\"% ****************************************% * File: SAMPLE.BIB *% ****************************************% * An invented bib file *% * For the sample texts *% * The order is unimportant and there *% * may be more entries than references *% * in the text *% ****************************************% @ARTICLE{smit54,\\tAUTHOR = {J. G. Smith and H. K. Weston},\\tTITLE = {Nothing Particular in this Year's History},\\tYEAR = {1954},\\tJOURNAL = {J. Geophys. Res.},\\tVOLUME = {2},\\tPAGES = {14-15}}@BOOK{colu92,\\tAUTHOR = {Christopher Columbus},\\tTITLE = {How {I} Discovered {America}},\\tYEAR = {1492},\\tPUBLISHER = {Hispanic Press},\\tADDRESS = {Barcelona}}@ARTICLE{gree00,\\tAUTHOR = {R. J. Green and U. P. Fred and W. P. Norbert},\\tTITLE = {Things that Go Bump in the Night},\\tYEAR = {1900},\\tJOURNAL = {Psych. Today},\\tVOLUME = {46},\\tPAGES = {345-678}}@ARTICLE{phil99,\\tAUTHOR = {T. P. Phillips},\\tTITLE = {Possible Influence of the Magnetosphere on {American} History},\\tYEAR = {1999},\\tJOURNAL = {J. 
Oddball Res.},\\tVOLUME \"" ] }, "execution_count": 14, @@ -608,33 +650,7 @@ " BOOK\n", " colu92\n", " Christopher Columbus\n", - " I\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 2\n", - " BOOK\n", - " colu92\n", - " Christopher Columbus\n", - " I\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 3\n", - " BOOK\n", - " colu92\n", - " Christopher Columbus\n", - " I\n", + " How I Discovered America\n", " 1492\n", " NaN\n", " NaN\n", @@ -643,7 +659,7 @@ " Barcelona\n", " \n", " \n", - " 4\n", + " 2\n", " ARTICLE\n", " gree00\n", " R. J. Green and U. P. Fred and W. P. Norbert\n", @@ -656,24 +672,11 @@ " NaN\n", " \n", " \n", - " 5\n", - " ARTICLE\n", - " phil99\n", - " T. P. Phillips\n", - " American\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 6\n", + " 3\n", " ARTICLE\n", " phil99\n", " T. P. Phillips\n", - " American\n", + " Possible Influence of the Magnetosphere on Ame...\n", " 1999\n", " J. Oddball Res.\n", " 98\n", @@ -682,24 +685,11 @@ " NaN\n", " \n", " \n", - " 7\n", - " ARTICLE\n", - " jame76\n", - " Kelly James and Harris, Jr., George and Wilby ...\n", - " American\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 8\n", + " 4\n", " ARTICLE\n", " jame76\n", " Kelly James and Harris, Jr., George and Wilby ...\n", - " American\n", + " American Independence and Magnetism\n", " 1776\n", " Revol. Tracts\n", " 32\n", @@ -715,35 +705,23 @@ " type alias AUTHOR \\\n", "0 ARTICLE smit54 J. G. Smith and H. K. Weston \n", "1 BOOK colu92 Christopher Columbus \n", - "2 BOOK colu92 Christopher Columbus \n", - "3 BOOK colu92 Christopher Columbus \n", - "4 ARTICLE gree00 R. J. Green and U. P. Fred and W. P. Norbert \n", - "5 ARTICLE phil99 T. P. Phillips \n", - "6 ARTICLE phil99 T. P. Phillips \n", - "7 ARTICLE jame76 Kelly James and Harris, Jr., George and Wilby ... \n", - "8 ARTICLE jame76 Kelly James and Harris, Jr., George and Wilby ... \n", + "2 ARTICLE gree00 R. J. Green and U. P. Fred and W. P. Norbert \n", + "3 ARTICLE phil99 T. P. Phillips \n", + "4 ARTICLE jame76 Kelly James and Harris, Jr., George and Wilby ... \n", "\n", - " TITLE YEAR JOURNAL VOLUME \\\n", - "0 Nothing Particular in this Year's History 1954 J. Geophys. Res. 2 \n", - "1 I NaN NaN NaN \n", - "2 I NaN NaN NaN \n", - "3 I 1492 NaN NaN \n", - "4 Things that Go Bump in the Night 1900 Psych. Today 46 \n", - "5 American NaN NaN NaN \n", - "6 American 1999 J. Oddball Res. 98 \n", - "7 American NaN NaN NaN \n", - "8 American 1776 Revol. Tracts 32 \n", + " TITLE YEAR JOURNAL \\\n", + "0 Nothing Particular in this Year's History 1954 J. Geophys. Res. \n", + "1 How I Discovered America 1492 NaN \n", + "2 Things that Go Bump in the Night 1900 Psych. Today \n", + "3 Possible Influence of the Magnetosphere on Ame... 1999 J. Oddball Res. \n", + "4 American Independence and Magnetism 1776 Revol. 
Tracts \n", "\n", - " PAGES PUBLISHER ADDRESS \n", - "0 14-15 NaN NaN \n", - "1 NaN NaN NaN \n", - "2 NaN NaN NaN \n", - "3 NaN Hispanic Press Barcelona \n", - "4 345-678 NaN NaN \n", - "5 NaN NaN NaN \n", - "6 1000-1003 NaN NaN \n", - "7 NaN NaN NaN \n", - "8 34-55 NaN NaN " + " VOLUME PAGES PUBLISHER ADDRESS \n", + "0 2 14-15 NaN NaN \n", + "1 NaN NaN Hispanic Press Barcelona \n", + "2 46 345-678 NaN NaN \n", + "3 98 1000-1003 NaN NaN \n", + "4 32 34-55 NaN NaN " ] }, "execution_count": 16, @@ -765,7 +743,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.8.13 ('pdr')", + "display_name": "Python 3.9.15 ('pandarize')", "language": "python", "name": "python3" }, @@ -779,12 +757,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.15" }, "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "ec3ba36b413de325dd38d751e7a8a6eca65f333d1619da92dd7e369649a08d52" + "hash": "dad1adfa8d136b42bc671de9edd435b00b948a75d2c3d60fa318daadbc489ee6" } } }, From 0f22611dc3bb740978e92e1442093deba0e671e6 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sun, 25 Dec 2022 06:29:51 -0500 Subject: [PATCH 02/31] Add postprocessing switch (#46) --- pandarize/_util.py | 32 +++++++++++++++++++++----------- pandarize/frame.py | 4 ++-- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/pandarize/_util.py b/pandarize/_util.py index fa83f29..a392aa0 100644 --- a/pandarize/_util.py +++ b/pandarize/_util.py @@ -72,7 +72,7 @@ def bib_preprocessing(raw): return raw -def bib_parser(raw, idxkey): +def bib_parser(raw, idxkey, postprocess): '''Main bib parsing logic''' all_lst = [] lst = [] @@ -122,7 +122,8 @@ def bib_parser(raw, idxkey): standby = False df = pd.DataFrame(all_lst) - df = postprocessing(df) + if postprocess: + df = postprocessing(df) return df @@ -254,10 +255,16 @@ def bib_parser_old(raw): return df_out -def check_names(string, connector): +def check_names(string, sep, connector): '''Checks for valid author names''' if connector in string: return True + + # skip in case at least one name is already converted + # or there's misformatting issue + if sep in string: + return True + return False def convert_names(string, sep=',', connector='and'): @@ -265,20 +272,23 @@ def convert_names(string, sep=',', connector='and'): ''' padded_connector = f' {connector} ' - if check_names(string, connector=padded_connector): + if check_names(string, sep=sep, connector=padded_connector): return string names = '' lst = string.split(sep) for i, nms in enumerate(lst): - nm = nms.strip().split(' ') - names += f'{nm[-1]}, {nm[0]}' - if len(nm) > 2: - for mname in nm[1:-1]: - names += f' {mname[0].upper()}.' - if i+1 != len(lst): - names += f'{padded_connector}' + try: + nm = nms.strip().split(' ') + names += f'{nm[-1]}, {nm[0]}' + if len(nm) > 2: + for mname in nm[1:-1]: + names += f' {mname[0].upper()}.' 
+ if i+1 != len(lst): + names += f'{padded_connector}' + except Exception as e: + print(f'{e} for {nms} at {i}th index') return names diff --git a/pandarize/frame.py b/pandarize/frame.py index cfa42e8..00e9086 100644 --- a/pandarize/frame.py +++ b/pandarize/frame.py @@ -14,11 +14,11 @@ def load(self, source=None, savefile=None): self.raw = bib_preprocessing(raw=self.raw) self.idxkey = rfindall_matched(self.raw, r'[.*]?@[^}]*{*[,]', '@') - def fit(self, kind='bib'): + def fit(self, kind='bib', postprocess=False): '''Method that infers data structure (in the future) ''' if kind == 'bib': - self.df = bib_parser(raw=self.raw, idxkey=self.idxkey) + self.df = bib_parser(raw=self.raw, idxkey=self.idxkey, postprocess=postprocess) def transform(self, formats='bib', types=None, alias=None, dirs=None): '''Transform loaded data into a specified data type From b2aaf3afa2d7e6c94ffb76251be83fb3c8cd9935 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sun, 25 Dec 2022 07:36:37 -0500 Subject: [PATCH 03/31] Add basic stdout for metadata (#50) --- pandarize/frame.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pandarize/frame.py b/pandarize/frame.py index 00e9086..bd8b840 100644 --- a/pandarize/frame.py +++ b/pandarize/frame.py @@ -24,4 +24,20 @@ def transform(self, formats='bib', types=None, alias=None, dirs=None): '''Transform loaded data into a specified data type ''' if formats == 'bib': - bib_writer(df=self.df, types=types, alias=alias, dirs=dirs) \ No newline at end of file + bib_writer(df=self.df, types=types, alias=alias, dirs=dirs) + + def describe(self): + '''Reports basic metadata''' + + if not self.df: + print('No file is loaded. Please load() and fit() to retrieve metadata.') + return 0 + + if self.df.shape[0] == 0 or self.df.shape[1] == 0: + print('The file has not been loaded successfully. Please check the file path and/or make sure that file is not corrupted.') + return 0 + + print(f'''The loaded file has {df.shape[0]} rows and {df.shape[1]} columns.\n + ''') + + ## Config Setting \ No newline at end of file From 0bf0f9827256469bcae0f73c6128d244f0181171 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sun, 25 Dec 2022 07:39:19 -0500 Subject: [PATCH 04/31] Stop returning integers (#50) --- pandarize/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandarize/frame.py b/pandarize/frame.py index bd8b840..c8a33ac 100644 --- a/pandarize/frame.py +++ b/pandarize/frame.py @@ -30,12 +30,12 @@ def describe(self): '''Reports basic metadata''' if not self.df: - print('No file is loaded. Please load() and fit() to retrieve metadata.') - return 0 + print('No file is loaded. Please load() and fit() to create metadata.') + return if self.df.shape[0] == 0 or self.df.shape[1] == 0: print('The file has not been loaded successfully. 
Please check the file path and/or make sure that file is not corrupted.') - return 0 + return print(f'''The loaded file has {df.shape[0]} rows and {df.shape[1]} columns.\n ''') From 8969ccfa0766c765cca577ce47f6f0a2f5d8ebd6 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sun, 25 Dec 2022 14:22:04 -0500 Subject: [PATCH 05/31] Fix conditional for empty df (#50) --- pandarize/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandarize/frame.py b/pandarize/frame.py index c8a33ac..86ea48e 100644 --- a/pandarize/frame.py +++ b/pandarize/frame.py @@ -27,9 +27,9 @@ def transform(self, formats='bib', types=None, alias=None, dirs=None): bib_writer(df=self.df, types=types, alias=alias, dirs=dirs) def describe(self): - '''Reports basic metadata''' + '''Generates basic metadata''' - if not self.df: + if self.df is None: print('No file is loaded. Please load() and fit() to create metadata.') return @@ -37,7 +37,7 @@ def describe(self): print('The file has not been loaded successfully. Please check the file path and/or make sure that file is not corrupted.') return - print(f'''The loaded file has {df.shape[0]} rows and {df.shape[1]} columns.\n + print(f'''The loaded file has {self.df.shape[0]} rows and {self.df.shape[1]} columns.\n ''') ## Config Setting \ No newline at end of file From 2c1fb900916077c9cb0fa79498dcf7b9810cd228 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sun, 25 Dec 2022 14:22:30 -0500 Subject: [PATCH 06/31] Display `describe` (#50) --- examples/bib_example.ipynb | 69 ++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/examples/bib_example.ipynb b/examples/bib_example.ipynb index d72479e..0c556cc 100644 --- a/examples/bib_example.ipynb +++ b/examples/bib_example.ipynb @@ -63,10 +63,43 @@ "pdr.fit()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Add basic description of loaded and fitted DataFrame" + ] + }, { "cell_type": "code", "execution_count": 4, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The loaded file has 374 rows and 25 columns.\n", + "\n", + " \n" + ] + } + ], + "source": [ + "pdr.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Head view of the DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, "outputs": [ { "data": { @@ -305,7 +338,7 @@ "[5 rows x 25 columns]" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -323,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -332,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -341,7 +374,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -350,7 +383,7 @@ "'@misc{Prazosin2020, year={2020}, title={Prazosin Might Be A Treatment for COVID-19. 
More Data Is Urg'" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -368,7 +401,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -377,7 +410,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -386,7 +419,7 @@ "'@misc{Prazosin2020, year={2020}, title={Prazosin Might Be A Treatment for COVID-19. More Data Is Urg'" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -397,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -406,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -530,7 +563,7 @@ "4 Mone, Amy and Mehl, Valerie press " ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -548,7 +581,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -557,7 +590,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -566,7 +599,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -575,7 +608,7 @@ "\"% ****************************************% * File: SAMPLE.BIB *% ****************************************% * An invented bib file *% * For the sample texts *% * The order is unimportant and there *% * may be more entries than references *% * in the text *% ****************************************% @ARTICLE{smit54,\\tAUTHOR = {J. G. Smith and H. K. Weston},\\tTITLE = {Nothing Particular in this Year's History},\\tYEAR = {1954},\\tJOURNAL = {J. Geophys. Res.},\\tVOLUME = {2},\\tPAGES = {14-15}}@BOOK{colu92,\\tAUTHOR = {Christopher Columbus},\\tTITLE = {How {I} Discovered {America}},\\tYEAR = {1492},\\tPUBLISHER = {Hispanic Press},\\tADDRESS = {Barcelona}}@ARTICLE{gree00,\\tAUTHOR = {R. J. Green and U. P. Fred and W. P. Norbert},\\tTITLE = {Things that Go Bump in the Night},\\tYEAR = {1900},\\tJOURNAL = {Psych. Today},\\tVOLUME = {46},\\tPAGES = {345-678}}@ARTICLE{phil99,\\tAUTHOR = {T. P. Phillips},\\tTITLE = {Possible Influence of the Magnetosphere on {American} History},\\tYEAR = {1999},\\tJOURNAL = {J. 
Oddball Res.},\\tVOLUME \"" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -586,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -595,7 +628,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -724,7 +757,7 @@ "4 32 34-55 NaN NaN " ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } From af0b76b8ac82e6d23703d3703a68f1070f4f3458 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sun, 25 Dec 2022 21:52:23 -0500 Subject: [PATCH 07/31] Add config loader (#51) --- pandarize/_util.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/pandarize/_util.py b/pandarize/_util.py index a392aa0..7f5bb4a 100644 --- a/pandarize/_util.py +++ b/pandarize/_util.py @@ -4,6 +4,8 @@ from pylatexenc.latex2text import LatexNodes2Text import re import os +import yaml as pyyaml +import pkgutil def source_loader(source, savefile): if check_url(string=source): @@ -33,6 +35,32 @@ def source_loader(source, savefile): return raw +def validate_config(obj): + '''Validates yaml config files''' + pass + +def load_config(yaml, path, ftype='bib'): + '''Loads yaml config file and returns a yaml object''' + def load(data): + try: + dic = {} + for i in pyyaml.safe_load(data)[ftype]: + for key, val in i.items(): + dic[key] = val + + print('Configuration applied. Please change the setting via .settings as needed.') + return dic + + except: + print('The config file is either not found or corrupted.') + + if yaml and path: + with open(path) as f: + return load(f) + else: + data = pkgutil.get_data(__name__, "/config/config.yaml").decode('utf-8') + return load(data) + def rfindall(string, pattern): '''Find index of all occurrence of the pattern''' From 3a6c8c9040def4b3da87df54e59ad4567d4b5bbf Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sun, 25 Dec 2022 21:52:48 -0500 Subject: [PATCH 08/31] Load settings when initialize (#51) --- pandarize/frame.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandarize/frame.py b/pandarize/frame.py index 86ea48e..56b1f73 100644 --- a/pandarize/frame.py +++ b/pandarize/frame.py @@ -1,11 +1,21 @@ import pandas as pd from pandarize._util import * + class Pandarizer: def __init__(self): self.raw = None self.df = None self.idxkey = None + self.settings = None + + + def initialize(self, yaml=False, path=None): + '''Initializes the setting either for the first time by + loading a default yaml config file in system dir or + load from an user-specified existing the file in `path` + ''' + self.settings = load_config(yaml=yaml, path=path) def load(self, source=None, savefile=None): '''Loads raw data from either local file or the url From 0a5fc1e201f308210bd8bb01327443c2c4f620a9 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sun, 25 Dec 2022 21:53:37 -0500 Subject: [PATCH 09/31] Find package automatically and include all yaml (#51) --- setup.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 8fcbd26..44bb137 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ from distutils.core import setup +from setuptools import find_packages import pandarize VERSION = pandarize.__version__ @@ -10,18 +11,19 @@ version=VERSION, author='Jong M. 
Shin', author_email='jshinm@gmail.com', - packages=['pandarize'], + packages=find_packages(), + package_data = {"": ['*.yaml']}, url='https://github.com/jshinm/pandarize/', license='MIT', description='Turns data into panda dataframe', readme='README.md', long_description_content_type='text/markdown', long_description=README, - requires=['pandas', 'requests'], - install_requires=["pandas", 'requests'], + requires=['pandas', 'requests', 'pyyaml'], + install_requires=["pandas", 'requests', 'pyyaml'], classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", -] + "Operating System :: OS Independent" + ] ) \ No newline at end of file From 6de706475d8c6f994ec8a6ccac4eb527c3397e9b Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sun, 25 Dec 2022 21:53:55 -0500 Subject: [PATCH 10/31] Add yaml config file (#51) --- pandarize/config/config.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 pandarize/config/config.yaml diff --git a/pandarize/config/config.yaml b/pandarize/config/config.yaml new file mode 100644 index 0000000..12dde3b --- /dev/null +++ b/pandarize/config/config.yaml @@ -0,0 +1,4 @@ +- bib: + convert_names: True + remove_latex: True + remove_empty_entries: True \ No newline at end of file From 409ddf7c8fb7379a77a26ed90b76a81118c9dfb5 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Mon, 26 Dec 2022 15:43:00 -0500 Subject: [PATCH 11/31] Add an entry for `truncate_author_list` (#39) --- pandarize/config/config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandarize/config/config.yaml b/pandarize/config/config.yaml index 12dde3b..9671280 100644 --- a/pandarize/config/config.yaml +++ b/pandarize/config/config.yaml @@ -1,4 +1,5 @@ - bib: convert_names: True remove_latex: True - remove_empty_entries: True \ No newline at end of file + remove_empty_entries: True + truncate_author_list: False \ No newline at end of file From deed76260d0fe64a244e7f37f9feacb4ed10075e Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Tue, 27 Dec 2022 19:13:32 -0500 Subject: [PATCH 12/31] Fix yaml file structure (#51) --- pandarize/config/config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandarize/config/config.yaml b/pandarize/config/config.yaml index 9671280..d962b6d 100644 --- a/pandarize/config/config.yaml +++ b/pandarize/config/config.yaml @@ -1,5 +1,5 @@ -- bib: - convert_names: True - remove_latex: True - remove_empty_entries: True - truncate_author_list: False \ No newline at end of file +bib: + - convert_names: True + - remove_latex: True + - remove_empty_entries: True + - truncate_author_list: False #happens after `fit` \ No newline at end of file From 4146588d122d13ea2d3e79b160dee9578445547a Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Wed, 28 Dec 2022 11:23:54 -0500 Subject: [PATCH 13/31] Add comments (#51) --- pandarize/config/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandarize/config/config.yaml b/pandarize/config/config.yaml index d962b6d..0dfd0ba 100644 --- a/pandarize/config/config.yaml +++ b/pandarize/config/config.yaml @@ -2,4 +2,4 @@ bib: - convert_names: True - remove_latex: True - remove_empty_entries: True - - truncate_author_list: False #happens after `fit` \ No newline at end of file + - truncate_author_list: False #applied after `fit` \ No newline at end of file From 59d3aed4b843753e39929651643b075bb20dbed7 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Wed, 28 Dec 2022 11:50:12 -0500 Subject: [PATCH 
14/31] Add placeholder for `truncate_names` (#51) --- pandarize/_util.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandarize/_util.py b/pandarize/_util.py index 7f5bb4a..dd121b4 100644 --- a/pandarize/_util.py +++ b/pandarize/_util.py @@ -155,6 +155,10 @@ def bib_parser(raw, idxkey, postprocess): return df +def truncate_names(srs): + '''Truncates names in Pandas series''' + pass + def _itemize_bib(lst): '''Itemizes bib structured string into a json format''' new_lst = [] From 285898e97439adb2209e36bcc15b1e2452c612ce Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Thu, 29 Dec 2022 21:12:59 -0500 Subject: [PATCH 15/31] Add docstring for `convert_names` (#39) --- pandarize/_util.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pandarize/_util.py b/pandarize/_util.py index dd121b4..74fa4ce 100644 --- a/pandarize/_util.py +++ b/pandarize/_util.py @@ -300,8 +300,17 @@ def check_names(string, sep, connector): return False def convert_names(string, sep=',', connector='and'): - '''Convert First MI Last names to Last, First MI format. - ''' + """Convert First MI Last names to Last, First MI format. + + Args: + string (str): parsed string that contains names with (name)(sep)(name) format + sep (str, optional): original string separator between names. Defaults to ','. + connector (str, optional): new name connector that will connect converted names. Defaults to 'and'. + + Returns: + str: converted names connected by `connector` + """ + padded_connector = f' {connector} ' if check_names(string, sep=sep, connector=padded_connector): @@ -322,6 +331,9 @@ def convert_names(string, sep=',', connector='and'): except Exception as e: print(f'{e} for {nms} at {i}th index') + # conditional here for truncate author list + + return names def bib_writer(df, types, alias, dirs): From 076c5eefe20b91ea15c3b5decc9f0c5c0bbbbad0 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Fri, 30 Dec 2022 16:28:19 -0500 Subject: [PATCH 16/31] Remove old parser (#52) --- pandarize/_util.py | 49 ---------------------------------------------- 1 file changed, 49 deletions(-) diff --git a/pandarize/_util.py b/pandarize/_util.py index 74fa4ce..35563c9 100644 --- a/pandarize/_util.py +++ b/pandarize/_util.py @@ -238,55 +238,6 @@ def postprocessing(df): return df -def bib_parser_old(raw): - '''Old bib parsing logic (deprecated and replaced by the new logic)''' - df_out = pd.DataFrame() - raw = manual_drop(raw, keys=['\n']) - raw = check_string(raw) - is_newRow = True - - for i, char in enumerate(raw[:]): - - if char == '@' and is_newRow: - new_row = {} - get_type = i+1 - elif char == '{': - if get_type: - new_row['type'] = raw[get_type:i].strip() - get_type = None - get_alias = i+1 #get the alias - elif curr_name != None: - get_item = i+1 - else: - pass - elif char == '}': - if get_item: - new_row[curr_name] = raw[get_item:i] - get_item = None - curr_name = None - else: - df_row = pd.DataFrame.from_dict(new_row, orient='index').T - df_out = pd.concat([df_out, df_row]) - is_newRow = True - elif char == '=' and get_name: - curr_name = raw[get_name:i].strip() - new_row[curr_name] = None - get_name = None - elif char == ',': - if get_alias: - new_row['alias'] = raw[get_alias:i] - get_alias = None - is_newRow = False - elif curr_name: - continue #edge case to handle comma (,) in the content - get_name = i+1 - else: - pass - - df_out.reset_index(drop=True, inplace=True) - - return df_out - def check_names(string, sep, connector): '''Checks for valid author names''' if connector in string: From 
c7554b7aaa5666ee536a1b569c0701ad284e0cca Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Fri, 30 Dec 2022 22:47:37 -0500 Subject: [PATCH 17/31] Create a separate loader class for modularization (#52) --- pandarize/loader.py | 64 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 pandarize/loader.py diff --git a/pandarize/loader.py b/pandarize/loader.py new file mode 100644 index 0000000..38ef81e --- /dev/null +++ b/pandarize/loader.py @@ -0,0 +1,64 @@ +import os +import yaml as pyyaml +import requests +import pkgutil +from ._util import * + +class Loader: + def __init__(self): + self.settings = None + self.raw = None + + def source_loader(self, source, savefile): + if check_url(string=source): + r = requests.get(url=source) + r = r.content + else: + try: + with open(source, 'r', encoding='UTF-8', newline='') as f: + r = f.read() + except Exception as e: + print('Error while reading from local file') + + if isinstance(r, bytes): + raw = r.decode('utf-8') + elif isinstance(r, str): + raw = r + else: + raise Exception('The source cannot be parsed') + + if savefile: + folder, files = os.path.split(savefile) + if not os.path.exists(path=folder): + os.mkdir(path=folder) + + with open(savefile, 'w', encoding='UTF-8', newline='') as f: + f.write(raw) + + self.raw = raw + + def validate_config(self, obj): + '''Validates yaml config files''' + pass + + def load_config(self, yaml=None, path=None, ftype='bib'): + '''Loads yaml config file and returns a yaml object''' + def load(data): + try: + dic = {} + for i in pyyaml.safe_load(data)[ftype]: + for key, val in i.items(): + dic[key] = val + + print('Configuration applied. Please change the setting via .settings as needed.') + return dic + + except: + print('The config file is either not found or corrupted.') + + if yaml and path: + with open(path) as f: + return load(f) + else: + data = pkgutil.get_data(__name__, "/config/config.yaml").decode('utf-8') + self.settings = load(data) \ No newline at end of file From 57eda06a4c0fe9c0c00fa621340be68490ac82d7 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Fri, 30 Dec 2022 22:49:35 -0500 Subject: [PATCH 18/31] Create a parser class that parse out a bib format (#52) --- pandarize/parser.py | 151 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 pandarize/parser.py diff --git a/pandarize/parser.py b/pandarize/parser.py new file mode 100644 index 0000000..46671d1 --- /dev/null +++ b/pandarize/parser.py @@ -0,0 +1,151 @@ +import re +from ._util import * +from pylatexenc.latex2text import LatexNodes2Text + +class Parser: + def __init__(self): + self.df = None + + def bib_preprocessing(self): + '''Pre-processes raw bib file''' + + raw = self.raw.replace('\n', '').replace('\r', '') #remove linebreaks and linefeed + raw = re.sub(' +', ' ', raw) #contract whitespace + + self.raw = raw + # return raw + + def postprocessing(self, df): + '''Post-process of constructed pandas DataFrame. 
Runs multiple checks.''' + + # Author Name Check for Biber + df['author'] = df['author'].apply(lambda x: convert_names(x)) + + return df + + def bib_parser(self, raw, idxkey, postprocess): + '''Main bib parsing logic''' + all_lst = [] + lst = [] + start = None + standby = None + + for i, c in enumerate(raw): + if c == '@': + if not i in idxkey: #skip if not true start + continue + + if lst: + # fixes cases when extra comma is added to the last key:value item + fix = raw[curr_idx:last_pair-2] + raw[last_pair-2:last_pair+1].replace(',', '') + lst.append(fix) #edge case for last key:value pair + all_lst.append(self._itemize_bib(lst)) + lst = [] + curr_idx = i + start = True + elif c == ',' and start: + lst.append(raw[curr_idx:i+1]) + start = False + curr_idx = i+1 + elif c == '}' and i != len(raw)-1: + last_pair = i #catches last pair and saves position as index + standby = True + elif c == ',' and standby: + # second check to account for misused bracket edge cases + # e.g., author = {A and B and C and {D} and F} + standby = False + + for check_i in raw[i+1:]: + if check_i == '}': + break + elif check_i == '=': + if raw[curr_idx:i+1]: + lst.append(raw[curr_idx:i+1]) #remove linebreak + curr_idx = i+1 + else: + break + elif i == len(raw)-1: + lst.append(raw[curr_idx:i+1]) + all_lst.append(self._itemize_bib(lst)) + elif c == ' ': + pass + else: + standby = False + + df = pd.DataFrame(all_lst) + if postprocess: + df = postprocessing(df) + + self.df = df + + def bib_writer(self, df, types, alias, dirs): + '''bib writer and formatter that converts pandas + dataframe into a bib file + ''' + + def parse(row, types=types, alias=alias): + items = [] + + for i, (idx, item) in enumerate(zip(row.index, row)): + if pd.isnull(item) or item == '': + continue + item = str(item) + if idx == types: + header = f'@{item}' + '{' + elif idx == alias: + alias = item + ',\n' + else: + item_i = f'\t{idx} = ' + '{' + f'{item}' + '},\n' + items.append(item_i) + + out_text = header + alias + for i in items: + out_text += i + out_text = out_text[:-2] #remove last comma + out_text += '\n},\n' + + return out_text + + N = df.shape[0] + + # Add stamper before the first header + out = stamper(target='bib') + + for i in range(N): + if i == N-1: #remove the very last comma + out += parse(df.iloc[i,:])[:-3] + parse(df.iloc[i,:])[-3:].replace(',', '') + '\n' + else: + out += parse(df.iloc[i,:]) + '\n' + + if not os.path.exists(path=dirs): + os.mkdir(path=dirs) + + with open(f'{dirs}output.bib', 'w', encoding='utf-8') as f: + f.write(out) + + @staticmethod + def _itemize_bib(lst): + '''Itemizes bib structured string into a json format''' + new_lst = [] + dic = {} + + for i, s in enumerate(lst): + if i == 0: + ii = s.rfind('@') + jj = s.rfind('{') + kk = s.rfind(',') + dic['type'] = s[ii:jj].replace('@', '') + dic['alias'] = s[jj:kk].replace('{', '') + else: + if s: + # print(s, sorted(rfindall(s, '='))) + ii = sorted(rfindall(s, '='))[0] + if s[-1] == ',': + s = s[:-1] + out = LatexNodes2Text().latex_to_text(s[ii+1:]).strip() + dic[s[:ii].strip()] = out + + for i in lst: + new_lst.append(LatexNodes2Text().latex_to_text(i)) + + return dic \ No newline at end of file From e13e128c2fa44db17fb1bb114f9da126924c6872 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Fri, 30 Dec 2022 22:50:53 -0500 Subject: [PATCH 19/31] Refactor code to use other modules (#52) --- pandarize/frame.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/pandarize/frame.py b/pandarize/frame.py index 56b1f73..ea7c1f8 100644 
--- a/pandarize/frame.py +++ b/pandarize/frame.py @@ -1,40 +1,37 @@ import pandas as pd -from pandarize._util import * +from ._util import * +from .loader import Loader +from .parser import Parser - -class Pandarizer: +class Pandarizer(Loader, Parser): def __init__(self): - self.raw = None - self.df = None self.idxkey = None - self.settings = None - def initialize(self, yaml=False, path=None): '''Initializes the setting either for the first time by loading a default yaml config file in system dir or load from an user-specified existing the file in `path` ''' - self.settings = load_config(yaml=yaml, path=path) + self.load_config(yaml=yaml, path=path) def load(self, source=None, savefile=None): '''Loads raw data from either local file or the url ''' - self.raw = source_loader(source=source, savefile=savefile) - self.raw = bib_preprocessing(raw=self.raw) + self.source_loader(source=source, savefile=savefile) + self.bib_preprocessing() self.idxkey = rfindall_matched(self.raw, r'[.*]?@[^}]*{*[,]', '@') def fit(self, kind='bib', postprocess=False): '''Method that infers data structure (in the future) ''' if kind == 'bib': - self.df = bib_parser(raw=self.raw, idxkey=self.idxkey, postprocess=postprocess) + self.bib_parser(raw=self.raw, idxkey=self.idxkey, postprocess=postprocess) def transform(self, formats='bib', types=None, alias=None, dirs=None): '''Transform loaded data into a specified data type ''' if formats == 'bib': - bib_writer(df=self.df, types=types, alias=alias, dirs=dirs) + self.bib_writer(df=self.df, types=types, alias=alias, dirs=dirs) def describe(self): '''Generates basic metadata''' From a4a8f2dba68efb8a3cfd9b0d33a11c627da60c2e Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Fri, 30 Dec 2022 22:52:14 -0500 Subject: [PATCH 20/31] Move functions to their corresponding modules (#52) --- pandarize/_util.py | 203 --------------------------------------------- 1 file changed, 203 deletions(-) diff --git a/pandarize/_util.py b/pandarize/_util.py index 35563c9..dae8fcc 100644 --- a/pandarize/_util.py +++ b/pandarize/_util.py @@ -1,65 +1,5 @@ -import requests -import pandas as pd from datetime import datetime -from pylatexenc.latex2text import LatexNodes2Text import re -import os -import yaml as pyyaml -import pkgutil - -def source_loader(source, savefile): - if check_url(string=source): - r = requests.get(url=source) - r = r.content - else: - try: - with open(source, 'r', encoding='UTF-8', newline='') as f: - r = f.read() - except Exception as e: - print('Error while reading from local file') - - if isinstance(r, bytes): - raw = r.decode('utf-8') - elif isinstance(r, str): - raw = r - else: - raise Exception('The source cannot be parsed') - - if savefile: - folder, files = os.path.split(savefile) - if not os.path.exists(path=folder): - os.mkdir(path=folder) - - with open(savefile, 'w', encoding='UTF-8', newline='') as f: - f.write(raw) - - return raw - -def validate_config(obj): - '''Validates yaml config files''' - pass - -def load_config(yaml, path, ftype='bib'): - '''Loads yaml config file and returns a yaml object''' - def load(data): - try: - dic = {} - for i in pyyaml.safe_load(data)[ftype]: - for key, val in i.items(): - dic[key] = val - - print('Configuration applied. 
Please change the setting via .settings as needed.') - return dic - - except: - print('The config file is either not found or corrupted.') - - if yaml and path: - with open(path) as f: - return load(f) - else: - data = pkgutil.get_data(__name__, "/config/config.yaml").decode('utf-8') - return load(data) def rfindall(string, pattern): '''Find index of all occurrence of the pattern''' @@ -92,99 +32,10 @@ def rfindall_matched(string, pattern, key): match_index.append(match.start() + match.group().rfind(key)) return match_index -def bib_preprocessing(raw): - '''Pre-processes raw bib file''' - - raw = raw.replace('\n', '').replace('\r', '') #remove linebreaks and linefeed - raw = re.sub(' +', ' ', raw) #contract whitespace - - return raw - -def bib_parser(raw, idxkey, postprocess): - '''Main bib parsing logic''' - all_lst = [] - lst = [] - start = None - standby = None - - for i, c in enumerate(raw): - if c == '@': - if not i in idxkey: #skip if not true start - continue - - if lst: - # fixes cases when extra comma is added to the last key:value item - fix = raw[curr_idx:last_pair-2] + raw[last_pair-2:last_pair+1].replace(',', '') - lst.append(fix) #edge case for last key:value pair - all_lst.append(_itemize_bib(lst)) - lst = [] - curr_idx = i - start = True - elif c == ',' and start: - lst.append(raw[curr_idx:i+1]) - start = False - curr_idx = i+1 - elif c == '}' and i != len(raw)-1: - last_pair = i #catches last pair and saves position as index - standby = True - elif c == ',' and standby: - # second check to account for misused bracket edge cases - # e.g., author = {A and B and C and {D} and F} - standby = False - - for check_i in raw[i+1:]: - if check_i == '}': - break - elif check_i == '=': - if raw[curr_idx:i+1]: - lst.append(raw[curr_idx:i+1]) #remove linebreak - curr_idx = i+1 - else: - break - elif i == len(raw)-1: - lst.append(raw[curr_idx:i+1]) - all_lst.append(_itemize_bib(lst)) - elif c == ' ': - pass - else: - standby = False - - df = pd.DataFrame(all_lst) - if postprocess: - df = postprocessing(df) - - return df - def truncate_names(srs): '''Truncates names in Pandas series''' pass -def _itemize_bib(lst): - '''Itemizes bib structured string into a json format''' - new_lst = [] - dic = {} - - for i, s in enumerate(lst): - if i == 0: - ii = s.rfind('@') - jj = s.rfind('{') - kk = s.rfind(',') - dic['type'] = s[ii:jj].replace('@', '') - dic['alias'] = s[jj:kk].replace('{', '') - else: - if s: - # print(s, sorted(rfindall(s, '='))) - ii = sorted(rfindall(s, '='))[0] - if s[-1] == ',': - s = s[:-1] - out = LatexNodes2Text().latex_to_text(s[ii+1:]).strip() - dic[s[:ii].strip()] = out - - for i in lst: - new_lst.append(LatexNodes2Text().latex_to_text(i)) - - return dic - def check_string(string): '''Screens for misinterpreted strings that interferes parsing (deprecated)''' @@ -230,14 +81,6 @@ def manual_drop(raw, keys): return raw -def postprocessing(df): - '''Post-process of constructed pandas DataFrame. 
Runs multiple checks.''' - - # Author Name Check for Biber - df['author'] = df['author'].apply(lambda x: convert_names(x)) - - return df - def check_names(string, sep, connector): '''Checks for valid author names''' if connector in string: @@ -283,51 +126,5 @@ def convert_names(string, sep=',', connector='and'): print(f'{e} for {nms} at {i}th index') # conditional here for truncate author list - return names - -def bib_writer(df, types, alias, dirs): - '''bib writer and formatter that converts pandas - dataframe into a bib file - ''' - - def parse(row, types=types, alias=alias): - items = [] - - for i, (idx, item) in enumerate(zip(row.index, row)): - if pd.isnull(item) or item == '': - continue - item = str(item) - if idx == types: - header = f'@{item}' + '{' - elif idx == alias: - alias = item + ',\n' - else: - item_i = f'\t{idx} = ' + '{' + f'{item}' + '},\n' - items.append(item_i) - - out_text = header + alias - for i in items: - out_text += i - out_text = out_text[:-2] #remove last comma - out_text += '\n},\n' - - return out_text - - N = df.shape[0] - - # Add stamper before the first header - out = stamper(target='bib') - - for i in range(N): - if i == N-1: #remove the very last comma - out += parse(df.iloc[i,:])[:-3] + parse(df.iloc[i,:])[-3:].replace(',', '') + '\n' - else: - out += parse(df.iloc[i,:]) + '\n' - - if not os.path.exists(path=dirs): - os.mkdir(path=dirs) - - with open(f'{dirs}output.bib', 'w', encoding='utf-8') as f: - f.write(out) \ No newline at end of file From 8248ba5df2c18d28b53fe72a62b61a76da75a0c7 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Fri, 30 Dec 2022 22:54:26 -0500 Subject: [PATCH 21/31] Remove return statement and save as class variables (#52) --- pandarize/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandarize/loader.py b/pandarize/loader.py index 38ef81e..649222e 100644 --- a/pandarize/loader.py +++ b/pandarize/loader.py @@ -58,7 +58,7 @@ def load(data): if yaml and path: with open(path) as f: - return load(f) + self.settings = load(f) else: data = pkgutil.get_data(__name__, "/config/config.yaml").decode('utf-8') self.settings = load(data) \ No newline at end of file From 1a0a103f3f76c32adac3a764af9db6df726aa36e Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Fri, 30 Dec 2022 23:39:55 -0500 Subject: [PATCH 22/31] Refactor `bib_parser` to use class variable (#52) --- pandarize/frame.py | 4 ++-- pandarize/parser.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandarize/frame.py b/pandarize/frame.py index ea7c1f8..f3040e1 100644 --- a/pandarize/frame.py +++ b/pandarize/frame.py @@ -19,13 +19,13 @@ def load(self, source=None, savefile=None): ''' self.source_loader(source=source, savefile=savefile) self.bib_preprocessing() - self.idxkey = rfindall_matched(self.raw, r'[.*]?@[^}]*{*[,]', '@') + # self.idxkey = rfindall_matched(self.raw, r'[.*]?@[^}]*{*[,]', '@') def fit(self, kind='bib', postprocess=False): '''Method that infers data structure (in the future) ''' if kind == 'bib': - self.bib_parser(raw=self.raw, idxkey=self.idxkey, postprocess=postprocess) + self.bib_parser(postprocess=postprocess) def transform(self, formats='bib', types=None, alias=None, dirs=None): '''Transform loaded data into a specified data type diff --git a/pandarize/parser.py b/pandarize/parser.py index 46671d1..1cac45c 100644 --- a/pandarize/parser.py +++ b/pandarize/parser.py @@ -1,4 +1,6 @@ import re +import os +import pandas as pd from ._util import * from pylatexenc.latex2text import LatexNodes2Text @@ -23,12 +25,14 @@ 
def postprocessing(self, df): return df - def bib_parser(self, raw, idxkey, postprocess): + def bib_parser(self, postprocess): '''Main bib parsing logic''' all_lst = [] lst = [] start = None standby = None + raw = self.raw + idxkey = rfindall_matched(raw, r'[.*]?@[^}]*{*[,]', '@') for i, c in enumerate(raw): if c == '@': From 4bebdd61ed54acb13e688f739d3924bdc84bec7b Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Fri, 30 Dec 2022 23:41:50 -0500 Subject: [PATCH 23/31] Refactor `bib_writer` to use class variable (#52) --- pandarize/frame.py | 2 +- pandarize/parser.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandarize/frame.py b/pandarize/frame.py index f3040e1..4c14144 100644 --- a/pandarize/frame.py +++ b/pandarize/frame.py @@ -31,7 +31,7 @@ def transform(self, formats='bib', types=None, alias=None, dirs=None): '''Transform loaded data into a specified data type ''' if formats == 'bib': - self.bib_writer(df=self.df, types=types, alias=alias, dirs=dirs) + self.bib_writer(types=types, alias=alias, dirs=dirs) def describe(self): '''Generates basic metadata''' diff --git a/pandarize/parser.py b/pandarize/parser.py index 1cac45c..648ef4b 100644 --- a/pandarize/parser.py +++ b/pandarize/parser.py @@ -82,11 +82,13 @@ def bib_parser(self, postprocess): self.df = df - def bib_writer(self, df, types, alias, dirs): + def bib_writer(self, types, alias, dirs): '''bib writer and formatter that converts pandas dataframe into a bib file ''' + df = self.df + def parse(row, types=types, alias=alias): items = [] From 89ceef3be593b3eba9dfcbd8fcf40cc4e89b0e14 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Fri, 30 Dec 2022 23:43:54 -0500 Subject: [PATCH 24/31] Remove frame init (#52) --- pandarize/frame.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandarize/frame.py b/pandarize/frame.py index 4c14144..9201412 100644 --- a/pandarize/frame.py +++ b/pandarize/frame.py @@ -4,9 +4,7 @@ from .parser import Parser class Pandarizer(Loader, Parser): - def __init__(self): - self.idxkey = None - + def initialize(self, yaml=False, path=None): '''Initializes the setting either for the first time by loading a default yaml config file in system dir or @@ -19,7 +17,6 @@ def load(self, source=None, savefile=None): ''' self.source_loader(source=source, savefile=savefile) self.bib_preprocessing() - # self.idxkey = rfindall_matched(self.raw, r'[.*]?@[^}]*{*[,]', '@') def fit(self, kind='bib', postprocess=False): '''Method that infers data structure (in the future) @@ -46,5 +43,3 @@ def describe(self): print(f'''The loaded file has {self.df.shape[0]} rows and {self.df.shape[1]} columns.\n ''') - - ## Config Setting \ No newline at end of file From 2481cce731efb5353905db4ef3eaf6c80d1aa181 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Fri, 30 Dec 2022 23:47:38 -0500 Subject: [PATCH 25/31] Add comments for each switch (#51) --- pandarize/config/config.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandarize/config/config.yaml b/pandarize/config/config.yaml index 0dfd0ba..c1c191c 100644 --- a/pandarize/config/config.yaml +++ b/pandarize/config/config.yaml @@ -1,5 +1,6 @@ bib: - - convert_names: True - - remove_latex: True - - remove_empty_entries: True + - convert_names: True #change name format to Last, First MI + - remove_latex: True #remove latex syntax + - remove_html: False #TODO: remove html tags + - remove_empty_entries: True #empty entries are removed after `transform` - truncate_author_list: False #applied after `fit` \ No newline at end 
of file From ce55b3b9684e6daad74b52be7b952545733a066a Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Fri, 30 Dec 2022 23:53:03 -0500 Subject: [PATCH 26/31] Add conditional for `convert_names` option (#51) --- pandarize/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandarize/parser.py b/pandarize/parser.py index 648ef4b..0a11e14 100644 --- a/pandarize/parser.py +++ b/pandarize/parser.py @@ -15,13 +15,13 @@ def bib_preprocessing(self): raw = re.sub(' +', ' ', raw) #contract whitespace self.raw = raw - # return raw def postprocessing(self, df): '''Post-process of constructed pandas DataFrame. Runs multiple checks.''' # Author Name Check for Biber - df['author'] = df['author'].apply(lambda x: convert_names(x)) + if self.settings['convert_names']: + df['author'] = df['author'].apply(lambda x: convert_names(x)) return df From f9103bc9cafc4a4f6990154a41ed74375ebf2b35 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Fri, 30 Dec 2022 23:54:28 -0500 Subject: [PATCH 27/31] Add comments (#51) --- pandarize/config/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandarize/config/config.yaml b/pandarize/config/config.yaml index c1c191c..5d0dfb8 100644 --- a/pandarize/config/config.yaml +++ b/pandarize/config/config.yaml @@ -3,4 +3,4 @@ bib: - remove_latex: True #remove latex syntax - remove_html: False #TODO: remove html tags - remove_empty_entries: True #empty entries are removed after `transform` - - truncate_author_list: False #applied after `fit` \ No newline at end of file + - truncate_author_list: False #shorten names in the list by only stating the first person followed by optional `suffix` \ No newline at end of file From 26560686b1945746d4b0bb904ed7e892da6d8535 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sat, 31 Dec 2022 00:14:21 -0500 Subject: [PATCH 28/31] Removed latex option (#51) --- pandarize/config/config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/pandarize/config/config.yaml b/pandarize/config/config.yaml index 5d0dfb8..5ab03b8 100644 --- a/pandarize/config/config.yaml +++ b/pandarize/config/config.yaml @@ -1,6 +1,5 @@ bib: - convert_names: True #change name format to Last, First MI - - remove_latex: True #remove latex syntax - remove_html: False #TODO: remove html tags - remove_empty_entries: True #empty entries are removed after `transform` - truncate_author_list: False #shorten names in the list by only stating the first person followed by optional `suffix` \ No newline at end of file From bc8741399a1284ef4b49e3b9426126d88ec760a2 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sat, 31 Dec 2022 00:14:48 -0500 Subject: [PATCH 29/31] Removed outdated lines (#51) --- pandarize/parser.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandarize/parser.py b/pandarize/parser.py index 0a11e14..7e9aacc 100644 --- a/pandarize/parser.py +++ b/pandarize/parser.py @@ -144,14 +144,10 @@ def _itemize_bib(lst): dic['alias'] = s[jj:kk].replace('{', '') else: if s: - # print(s, sorted(rfindall(s, '='))) ii = sorted(rfindall(s, '='))[0] if s[-1] == ',': s = s[:-1] out = LatexNodes2Text().latex_to_text(s[ii+1:]).strip() dic[s[:ii].strip()] = out - - for i in lst: - new_lst.append(LatexNodes2Text().latex_to_text(i)) - + return dic \ No newline at end of file From e3a10f9ab296b407aa2b4305bf0cd0307c88d5d6 Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sat, 31 Dec 2022 01:09:41 -0500 Subject: [PATCH 30/31] Add `remove_empty_entries` option and fix bug (#51) --- pandarize/parser.py | 5 +++-- 1 file changed, 3 
insertions(+), 2 deletions(-) diff --git a/pandarize/parser.py b/pandarize/parser.py index 7e9aacc..b0c57e2 100644 --- a/pandarize/parser.py +++ b/pandarize/parser.py @@ -69,7 +69,8 @@ def bib_parser(self, postprocess): else: break elif i == len(raw)-1: - lst.append(raw[curr_idx:i+1]) + fix = raw[curr_idx:-3] + raw[-3:].replace(',', '') + lst.append(fix) all_lst.append(self._itemize_bib(lst)) elif c == ' ': pass @@ -93,7 +94,7 @@ def parse(row, types=types, alias=alias): items = [] for i, (idx, item) in enumerate(zip(row.index, row)): - if pd.isnull(item) or item == '': + if pd.isnull(item) or item == '' and self.settings['remove_empty_entries']: continue item = str(item) if idx == types: From 38a3a4ee5e42370698cdd0851b10a15d41ade2fb Mon Sep 17 00:00:00 2001 From: Jong Shin Date: Sat, 31 Dec 2022 01:11:36 -0500 Subject: [PATCH 31/31] Update version (#50, #51, #52) --- pandarize/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandarize/__init__.py b/pandarize/__init__.py index 7d53ea3..71eb32d 100644 --- a/pandarize/__init__.py +++ b/pandarize/__init__.py @@ -1 +1 @@ -__version__ = "0.0.7" \ No newline at end of file +__version__ = "0.0.8" \ No newline at end of file
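
Taken together, the series above leaves the Pandarizer class at version 0.0.8 split across the Loader and Parser mixins, with the yaml switches from config/config.yaml exposed as a plain settings dict. A minimal usage sketch of that API follows; the import path is the package layout shown in the diffs, while the input file name, the types/alias column arguments, and the output directory are illustrative assumptions rather than values taken from the patches.

    # Minimal usage sketch of the 0.0.8 API assembled by the patches above.
    # 'sample.bib', 'output/', and the types/alias arguments are placeholder
    # assumptions for illustration; only the method names come from the diffs.
    from pandarize.frame import Pandarizer

    pdr = Pandarizer()
    pdr.initialize()                 # load the bundled config/config.yaml into pdr.settings
    print(pdr.settings)              # plain dict of switches, e.g. {'convert_names': True, ...}
    pdr.load(source='sample.bib')    # local path or URL; pass savefile=... to keep a raw copy
    pdr.fit(kind='bib')              # parse the raw bib text into pdr.df (a pandas DataFrame)
    pdr.describe()                   # print basic row/column counts of the parsed frame
    pdr.transform(formats='bib',     # write the frame back out as output/output.bib
                  types='type', alias='alias', dirs='output/')

Because initialize() copies the packaged yaml into pdr.settings as a dict, individual options such as convert_names or remove_empty_entries can be flipped per run via pdr.settings without editing the packaged config file.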