From 3ff9702106850501ed61100917d2ba8682e760f1 Mon Sep 17 00:00:00 2001 From: Nicklas Reincke Date: Mon, 30 Jul 2018 16:42:06 +0200 Subject: [PATCH 01/15] Added Docker setup --- Dockerfile | 11 +++++++++++ docker-compose.yml | 8 ++++++++ requirements.txt | 0 3 files changed, 19 insertions(+) create mode 100644 Dockerfile create mode 100644 docker-compose.yml create mode 100644 requirements.txt diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..67169d0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +ARG PYTHON_VERSION=3.7 +FROM python:${PYTHON_VERSION}-alpine + +WORKDIR /usr/src/app + +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +CMD [ "python", "./gedcom/__init__.py" ] diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..4c35a33 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,8 @@ +version: '2' + +services: + + python: + build: ./ + volumes: + - ./:/usr/src/app:z diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e69de29 From f004b07f79790fc113f57a59d71652ff604dab27 Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Thu, 15 Nov 2018 09:40:25 -0500 Subject: [PATCH 02/15] Support BOM control characters --- gedcom/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gedcom/__init__.py b/gedcom/__init__.py index f1d78d1..5d8554f 100644 --- a/gedcom/__init__.py +++ b/gedcom/__init__.py @@ -221,7 +221,7 @@ def __parse(self, file_path): line_number = 1 last_element = self.__root_element for line in gedcom_file: - last_element = self.__parse_line(line_number, line.decode('utf-8'), last_element) + last_element = self.__parse_line(line_number, line.decode('utf-8-sig'), last_element) line_number += 1 @staticmethod From 588aa5dfa774c0aa7eb56b71e684405292227a35 Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Thu, 15 Nov 2018 09:46:00 -0500 Subject: [PATCH 03/15] Uprev to 0.2.2 and update README --- README.md | 13 
+++++++++++++ setup.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e715033..c2dbc73 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,13 @@ gedcom = Gedcom(file_path) # Then run methods on `gedcom` ... :) ``` +## Quirk Handling + +Some genealogical websites like MyHeritage and Ancestry produce GEDCOM files that are somewhat out of spec. This module aims to handle these quirks. Specifically that includes: + +- Ignoring BOM control characters +- Multi-line fields that don't use CONC or CONT + ## History This module was originally based on a GEDCOM parser written by @@ -44,8 +51,14 @@ Daniel Zappala at Brigham Young University (Copyright (C) 2005) which was licensed under the GPL v2 and then continued by [Mad Price Ball](https://github.com/madprime) in 2012. +Further updates by [Nicklas Reincke](https://github.com/nickreynke) and [Damon Brodie](https://github.com/nomadyow) in 2018. + ## Changelog +**v0.2.2dev** + +- Support BOM control characters + **v0.2.1dev** - Changed broken links to GEDCOM format specification ([#2](https://github.com/nickreynke/python-gedcom/issues/2)) diff --git a/setup.py b/setup.py index d84dcd9..5266523 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='python-gedcom', - version='0.2.0dev', + version='0.2.2dev', packages=['gedcom', ], license='GPLv2', package_dir={'': '.'}, From cfe9c4d7b51401c3d06cebc8498fab7fee16b4db Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Sat, 17 Nov 2018 12:22:03 -0500 Subject: [PATCH 04/15] Handle lines that don't have the level + tag. 
Insert CONT/CONC tags --- gedcom/__init__.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/gedcom/__init__.py b/gedcom/__init__.py index 5d8554f..91e707a 100644 --- a/gedcom/__init__.py +++ b/gedcom/__init__.py @@ -1,5 +1,6 @@ # Python GEDCOM Parser # +# Copyright (C) 2018 Damon Brodie (damon.brodie at gmail.com) # Copyright (C) 2018 Nicklas Reincke (contact at reynke.com) # Copyright (C) 2016 Andreas Oberritter # Copyright (C) 2012 Madeleine Price Ball @@ -257,17 +258,29 @@ def __parse_line(line_number, line, last_element): regex_match = regex.match(gedcom_line_regex, line) if regex_match is None: - error_message = ("Line `%d` of document violates GEDCOM format 5.5" % line_number + - "\nSee: https://chronoplexsoftware.com/gedcomvalidator/gedcom/gedcom-5.5.pdf") - raise SyntaxError(error_message) - - line_parts = regex_match.groups() - - level = int(line_parts[0]) - pointer = line_parts[1].rstrip(' ') - tag = line_parts[2] - value = line_parts[3][1:] - crlf = line_parts[4] + # Sometimes a gedcom has a text field with a CR. This creates a line without the standard + # level and pointer. If this is detected then turn it into a CONC or CONT + line_regex = '([^\n\r]*|)' + cont_line_regex = line_regex + end_of_line_regex + regex_match = regex.match(cont_line_regex, line) + line_parts = regex_match.groups() + level = last_element.get_level() + tag = last_element.get_tag() + pointer = None + value = line_parts[0][1:] + crlf = line_parts[1] + if tag != GEDCOM_TAG_CONTINUED and tag != GEDCOM_TAG_CONCATENATION: + # Increment level and change this line to a CONC + level += 1 + tag = GEDCOM_TAG_CONCATENATION + else: + line_parts = regex_match.groups() + + level = int(line_parts[0]) + pointer = line_parts[1].rstrip(' ') + tag = line_parts[2] + value = line_parts[3][1:] + crlf = line_parts[4] # Check level: should never be more than one higher than previous line. 
if level > last_element.get_level() + 1: From 0d1f717fe413bc78355e81957cd27ff99b267a36 Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Sat, 17 Nov 2018 13:28:07 -0500 Subject: [PATCH 05/15] Cleanup and document --- gedcom/__init__.py | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/gedcom/__init__.py b/gedcom/__init__.py index 91e707a..d030294 100644 --- a/gedcom/__init__.py +++ b/gedcom/__init__.py @@ -258,21 +258,34 @@ def __parse_line(line_number, line, last_element): regex_match = regex.match(gedcom_line_regex, line) if regex_match is None: - # Sometimes a gedcom has a text field with a CR. This creates a line without the standard - # level and pointer. If this is detected then turn it into a CONC or CONT - line_regex = '([^\n\r]*|)' - cont_line_regex = line_regex + end_of_line_regex - regex_match = regex.match(cont_line_regex, line) - line_parts = regex_match.groups() - level = last_element.get_level() - tag = last_element.get_tag() - pointer = None - value = line_parts[0][1:] - crlf = line_parts[1] - if tag != GEDCOM_TAG_CONTINUED and tag != GEDCOM_TAG_CONCATENATION: - # Increment level and change this line to a CONC - level += 1 - tag = GEDCOM_TAG_CONCATENATION + # Quirk check - see if this is a line without a CRLF (which could be the last line) + last_line_regex = level_regex + pointer_regex + tag_regex + value_regex + regex_match = regex.match(last_line_regex, line) + if regex_match is not None: + line_parts = regex_match.groups() + + level = int(line_parts[0]) + pointer = line_parts[1].rstrip(' ') + tag = line_parts[2] + value = line_parts[3][1:] + crlf = '\n' + else: + # Quirck check - Sometimes a gedcom has a text field with a CR. + # This creates a line without the standard level and pointer. 
If this is detected + # then turn it into a CONC or CONT + line_regex = '([^\n\r]*|)' + cont_line_regex = line_regex + end_of_line_regex + regex_match = regex.match(cont_line_regex, line) + line_parts = regex_match.groups() + level = last_element.get_level() + tag = last_element.get_tag() + pointer = None + value = line_parts[0][1:] + crlf = line_parts[1] + if tag != GEDCOM_TAG_CONTINUED and tag != GEDCOM_TAG_CONCATENATION: + # Increment level and change this line to a CONC + level += 1 + tag = GEDCOM_TAG_CONCATENATION else: line_parts = regex_match.groups() From a404bf85514ccee8ba466fdf0cf34733ffe44edb Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Sat, 17 Nov 2018 13:29:45 -0500 Subject: [PATCH 06/15] Add myself to the copyright in readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c2dbc73..80bfd32 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,7 @@ Further updates by [Nicklas Reincke](https://github.com/nickreynke) and [Damon B Licensed under the [GNU General Public License v2](http://www.gnu.org/licenses/gpl-2.0.html) **Python GEDCOM Parser** +
Copyright (C) 2018 Damon Brodie (damon.brodie at gmail.com)
Copyright (C) 2018 Nicklas Reincke (contact at reynke.com)
Copyright (C) 2016 Andreas Oberritter
Copyright (C) 2012 Madeleine Price Ball From c279ba068145ecc23382c3a82d1eef2b4562fc35 Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Sat, 17 Nov 2018 13:32:17 -0500 Subject: [PATCH 07/15] Update README --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 80bfd32..43fa101 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ Some genealogical websites like MyHeritage and Ancestry produce GEDCOM files tha - Ignoring BOM control characters - Multi-line fields that don't use CONC or CONT +- Handle the last line not ending in a CRLF ## History @@ -58,6 +59,8 @@ Further updates by [Nicklas Reincke](https://github.com/nickreynke) and [Damon B **v0.2.2dev** - Support BOM control characters +- Support the last line not having a CR and/or LF +- Support incorrect line splitting generated by Ancestry. Insert CONT/CONC tag as necessary **v0.2.1dev** From 5173a6205ae7b76e87833b4c967c320df2b5fc54 Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Sat, 17 Nov 2018 22:46:29 -0500 Subject: [PATCH 08/15] Add documentation on methods --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 43fa101..1cd319f 100644 --- a/README.md +++ b/README.md @@ -33,8 +33,15 @@ from gedcom import Gedcom file_path = '' # Path to your `.ged` file gedcom = Gedcom(file_path) +``` + +#Iterate through all records, print the name of individuals -# Then run methods on `gedcom` ... 
:) +```python +all_records = gedcom.get_root_child_elements() +for record in all_records: + if (record.is_individual()): + print(record.get_name()) ``` ## Quirk Handling From 0236cbec4361f69ab0b391b90c83eb33c0379104 Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Sun, 18 Nov 2018 14:31:18 -0500 Subject: [PATCH 09/15] Table check --- README.md | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1cd319f..ab6668d 100644 --- a/README.md +++ b/README.md @@ -35,15 +35,69 @@ file_path = '' # Path to your `.ged` file gedcom = Gedcom(file_path) ``` -#Iterate through all records, print the name of individuals +Iterate through all records, search last names and print matches ```python all_records = gedcom.get_root_child_elements() for record in all_records: - if (record.is_individual()): - print(record.get_name()) + if record.is_individual(): + (first, last) = record.get_name() + if last == 'Brodie': + print(first + " " + last) ``` +## Reference + +# Single Record Methods + +Method | Parameters | Returns | Description +-------------|------------|---------|------------ +is_deceased | none | Boolean | +is_private | none | Boolean | +is_individual| none | Boolean | + + +get_occupation +get_last_changed_date +get_census +get_burial +get_death_year +get_death_data +get_birth_year +get_birth_data +get_gender +get_name +death_range_match +death_year_match +birth_range_match +birth_year_match +given_match +surname_match +criteria_match +is_object +is_file +is_family +set_parent_element +add_child_element +new_child_element +get_parent_element +get_child_elements +get_family_members +find_path_to_ancestors +get_parents +get_ancestors +get_families +marriage_range_match +marriage_year_match +get_marriage_years +get_marriages + +These operate on gedcom +get_root_element +get_root_child_elements +get_element_dictionary +get_element_list + ## Quirk Handling Some genealogical websites like 
MyHeritage and Ancestry produce GEDCOM files that are somewhat out of spec. This module aims to handle these quirks. Specifically that includes: From d3b369adecc8056229f8bc6b9702dfc6e7ad83f6 Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Sun, 18 Nov 2018 14:53:45 -0500 Subject: [PATCH 10/15] Update method table structure --- README.md | 86 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 45 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index ab6668d..ff48d60 100644 --- a/README.md +++ b/README.md @@ -48,55 +48,59 @@ for record in all_records: ## Reference -# Single Record Methods - -Method | Parameters | Returns | Description --------------|------------|---------|------------ -is_deceased | none | Boolean | -is_private | none | Boolean | -is_individual| none | Boolean | - - -get_occupation -get_last_changed_date -get_census -get_burial -get_death_year -get_death_data -get_birth_year -get_birth_data -get_gender -get_name -death_range_match -death_year_match -birth_range_match -birth_year_match -given_match -surname_match -criteria_match -is_object -is_file -is_family -set_parent_element -add_child_element -new_child_element -get_parent_element -get_child_elements -get_family_members +### Single Record Methods + +Method | Parameters | Returns | Description +-----------------------|------------|---------|------------ +get_child_elements ||| +get_parent_element ||| +new_child_element ||| +add_child_element ||| +set_parent_element | | | +is_individual | none | Boolean | +is_family | none | Boolean | +is_file | none | Boolean | +is_object | none | Boolean | +is_private | none | Boolean | +is_deceased | none | Boolean | +criteria_match ||| +surname_match ||| +given_match ||| +death_range_match ||| +death_year_match ||| +birth_range_match ||| +birth_year_match ||| +get_name ||| +get_gender ||| +get_birth_data ||| +get_birth_year ||| +get_death_data ||| +get_death_year ||| +get_burial ||| +get_census ||| +get_last_change_date 
||| +get_occupation ||| +get_individual ||| + +### Multiple Record Methods + +These operate on gedcom +get_root_element +get_root_child_elements +get_element_dictionary +get_element_list +get_marriages find_path_to_ancestors +get_family_members get_parents get_ancestors get_families marriage_range_match marriage_year_match get_marriage_years -get_marriages +print_gedcom +save_gedcom -These operate on gedcom -get_root_element -get_root_child_elements -get_element_dictionary -get_element_list ## Quirk Handling From 616c66237849a9f6794c87b4bd1ee0d7f9ad223b Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Sun, 18 Nov 2018 17:32:28 -0500 Subject: [PATCH 11/15] Add a use_strict parameter (enabled by default) that when set to False enables the quirk handling from previous commits. --- README.md | 18 ++++++++++-- gedcom/__init__.py | 70 +++++++++++++++++++++++++--------------------- 2 files changed, 53 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index ff48d60..6c8ca72 100644 --- a/README.md +++ b/README.md @@ -35,14 +35,26 @@ file_path = '' # Path to your `.ged` file gedcom = Gedcom(file_path) ``` -Iterate through all records, search last names and print matches +### GEDCOM Quirks + +Large sites like Ancesty and MyHeritage (among others) don't always produce perfectly formatted GEDCOM files. 
If you encounter errors in parsing, you might consider disabling strict parsing which will make a best effort to parse file: + + +```python +from gedcom import Gedcom + +file_path = '' # Path to your `.ged` file +gedcom = Gedcom(file_path, False) # Disable strict parsing +``` + +### Iterate through all records, search last names and print matches ```python all_records = gedcom.get_root_child_elements() for record in all_records: if record.is_individual(): - (first, last) = record.get_name() - if last == 'Brodie': + if record.surname_match('Brodie'): + (first, last) = record.get_name() print(first + " " + last) ``` diff --git a/gedcom/__init__.py b/gedcom/__init__.py index d030294..a775f6e 100644 --- a/gedcom/__init__.py +++ b/gedcom/__init__.py @@ -138,7 +138,7 @@ class Gedcom: - a dict (only elements with pointers, which are the keys) """ - def __init__(self, file_path): + def __init__(self, file_path, use_strict = True): """Initialize a GEDCOM data object. You must supply a GEDCOM file :type file_path: str """ @@ -146,7 +146,8 @@ def __init__(self, file_path): self.__element_dictionary = {} self.invalidate_cache() self.__root_element = Element(-1, "", "ROOT", "") - self.__parse(file_path) + self.__parse(file_path, use_strict) + self.__use_strict = use_strict def invalidate_cache(self): """Cause get_element_list() and get_element_dictionary() to return updated data @@ -214,7 +215,7 @@ def get_root_child_elements(self): # Private methods - def __parse(self, file_path): + def __parse(self, file_path, use_strict = True): """Open and parse file path as GEDCOM 5.5 formatted data :type file_path: str """ @@ -222,11 +223,11 @@ def __parse(self, file_path): line_number = 1 last_element = self.__root_element for line in gedcom_file: - last_element = self.__parse_line(line_number, line.decode('utf-8-sig'), last_element) + last_element = self.__parse_line(line_number, line.decode('utf-8-sig'), last_element, use_strict ) line_number += 1 @staticmethod - def 
__parse_line(line_number, line, last_element): + def __parse_line(line_number, line, last_element, use_strict = True): """Parse a line from a GEDCOM 5.5 formatted document Each line should have the following (bracketed items optional): @@ -258,34 +259,39 @@ def __parse_line(line_number, line, last_element): regex_match = regex.match(gedcom_line_regex, line) if regex_match is None: - # Quirk check - see if this is a line without a CRLF (which could be the last line) - last_line_regex = level_regex + pointer_regex + tag_regex + value_regex - regex_match = regex.match(last_line_regex, line) - if regex_match is not None: - line_parts = regex_match.groups() - - level = int(line_parts[0]) - pointer = line_parts[1].rstrip(' ') - tag = line_parts[2] - value = line_parts[3][1:] - crlf = '\n' + if use_strict: + error_message = ("Line %d of document violates GEDCOM format 5.5" % line_number + + "\nSee: https://chronoplexsoftware.com/gedcomvalidator/gedcom/gedcom-5.5.pdf") + raise SyntaxError(error_message) else: - # Quirck check - Sometimes a gedcom has a text field with a CR. - # This creates a line without the standard level and pointer. 
If this is detected - # then turn it into a CONC or CONT - line_regex = '([^\n\r]*|)' - cont_line_regex = line_regex + end_of_line_regex - regex_match = regex.match(cont_line_regex, line) - line_parts = regex_match.groups() - level = last_element.get_level() - tag = last_element.get_tag() - pointer = None - value = line_parts[0][1:] - crlf = line_parts[1] - if tag != GEDCOM_TAG_CONTINUED and tag != GEDCOM_TAG_CONCATENATION: - # Increment level and change this line to a CONC - level += 1 - tag = GEDCOM_TAG_CONCATENATION + # Quirk check - see if this is a line without a CRLF (which could be the last line) + last_line_regex = level_regex + pointer_regex + tag_regex + value_regex + regex_match = regex.match(last_line_regex, line) + if regex_match is not None: + line_parts = regex_match.groups() + + level = int(line_parts[0]) + pointer = line_parts[1].rstrip(' ') + tag = line_parts[2] + value = line_parts[3][1:] + crlf = '\n' + else: + # Quirk check - Sometimes a gedcom has a text field with a CR. + # This creates a line without the standard level and pointer. 
If this is detected + # then turn it into a CONC or CONT + line_regex = '([^\n\r]*|)' + cont_line_regex = line_regex + end_of_line_regex + regex_match = regex.match(cont_line_regex, line) + line_parts = regex_match.groups() + level = last_element.get_level() + tag = last_element.get_tag() + pointer = None + value = line_parts[0][1:] + crlf = line_parts[1] + if tag != GEDCOM_TAG_CONTINUED and tag != GEDCOM_TAG_CONCATENATION: + # Increment level and change this line to a CONC + level += 1 + tag = GEDCOM_TAG_CONCATENATION else: line_parts = regex_match.groups() From a1bd8a78ff982ba759301f3629c488c7bc41f1f6 Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Sun, 18 Nov 2018 17:41:28 -0500 Subject: [PATCH 12/15] Make another table for gedcom methods --- README.md | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 6c8ca72..48b1f6b 100644 --- a/README.md +++ b/README.md @@ -94,24 +94,25 @@ get_last_change_date ||| get_occupation ||| get_individual ||| -### Multiple Record Methods - -These operate on gedcom -get_root_element -get_root_child_elements -get_element_dictionary -get_element_list -get_marriages -find_path_to_ancestors -get_family_members -get_parents -get_ancestors -get_families -marriage_range_match -marriage_year_match -get_marriage_years -print_gedcom -save_gedcom +### Gedcom operations + +Method | Parameters | Returns | Description | +------------------------|------------|---------|-------------| +get_root_element |||| +get_root_child_elements |||| +get_element_dictionary |||| +get_element_list |||| +get_marriages |||| +find_path_to_ancestors |||| +get_family_members |||| +get_parents |||| +get_ancestors |||| +get_families |||| +marriage_range_match |||| +marriage_year_match |||| +get_marriage_years |||| +print_gedcom |||| +save_gedcom |||| ## Quirk Handling From 3bcec6ac27a8aff0f95a7838db5b8cbd677407d7 Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Sun, 18 Nov 2018 23:07:20 
-0500 Subject: [PATCH 13/15] Updates to the Readme documentation --- README.md | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 48b1f6b..bacdedd 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ gedcom = Gedcom(file_path) ### GEDCOM Quirks -Large sites like Ancesty and MyHeritage (among others) don't always produce perfectly formatted GEDCOM files. If you encounter errors in parsing, you might consider disabling strict parsing which will make a best effort to parse file: +Large sites like Ancestry and MyHeritage (among others) don't always produce perfectly formatted GEDCOM files. If you encounter errors in parsing, you might consider disabling strict parsing which will make a best effort to parse the file: ```python @@ -69,30 +69,30 @@ get_parent_element ||| new_child_element ||| add_child_element ||| set_parent_element | | | -is_individual | none | Boolean | -is_family | none | Boolean | +is_individual | none | Boolean | Is this record of a person +is_family | none | Boolean | is_file | none | Boolean | is_object | none | Boolean | -is_private | none | Boolean | -is_deceased | none | Boolean | -criteria_match ||| -surname_match ||| -given_match ||| -death_range_match ||| -death_year_match ||| -birth_range_match ||| -birth_year_match ||| -get_name ||| -get_gender ||| -get_birth_data ||| -get_birth_year ||| -get_death_data ||| -get_death_year ||| -get_burial ||| -get_census ||| -get_last_change_date ||| -get_occupation ||| -get_individual ||| +is_private | none | Boolean | Returns True if the record is marked Private +is_deceased | none | Boolean | Returns True if the individual is marked deceased +criteria_match |colon separated string "surname=[name]:name=[name]:birth][year]:birth_range=[year-to-year]:death=[year]:death_range[year-to-year]"| Boolean | Returns True if the criteria matches +surname_match | String | Boolean | Returns True if substring matches 
+given_match | String | Boolean | Returns True if subscring matches +death_range_match | Int from, Int to | Boolean | Returns True if Death Year is in the supplied range +death_year_match | Int | Boolean | Returns True if Death Year equals parameter +birth_range_match | Int from, Int to | Boolean | Returns True if Birth Year is in the supplied range +birth_year_match | Int | Boolean | Returns True if Birth Year equals parameter +get_name | none | (String given, String surname) | Returns the Given name(s) and Surname in a tuple +get_gender | none | String | Returns individual's gender +get_birth_data | none | (String date, String place, Array sources) | Returns a tuple of the birth data +get_birth_year | none | Int | Returns the Birth Year +get_death_data | none | (String date, String place, Array sources) | Returns a tuple of the death data +get_death_year | none | Int | Returns the Death Year +get_burial | none | (String date, String place, Array sources) | Returns a tuple of the burial data +get_census | none | List [String date, String place, Array sources] | Returns a List of tuple of the census data +get_last_change_date | none | String | Returns the date of the last update to this individual +get_occupation | none | String | Returns the individual's occupation +get_individual | none | Individual | Returns the individual ### Gedcom operations From 6e49b2d91db4df674dc25ee87baed872181b779e Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Mon, 19 Nov 2018 08:57:36 -0500 Subject: [PATCH 14/15] Update the documentation --- README.md | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index bacdedd..441fa43 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,11 @@ file_path = '' # Path to your `.ged` file gedcom = Gedcom(file_path, False) # Disable strict parsing ``` +Disabling strict parsing will allow the parser to gracefully handle the following quirks: + +- Multi-line 
fields that don't use CONC or CONT +- Handle the last line not ending in a CRLF + ### Iterate through all records, search last names and print matches ```python @@ -60,6 +65,8 @@ for record in all_records: ## Reference +The Element class contains all the information for a single record in the GEDCOM file, for example an individual. + ### Single Record Methods Method | Parameters | Returns | Description @@ -96,32 +103,23 @@ get_individual | none | Individual | Returns the individual ### Gedcom operations -Method | Parameters | Returns | Description | -------------------------|------------|---------|-------------| -get_root_element |||| -get_root_child_elements |||| -get_element_dictionary |||| -get_element_list |||| -get_marriages |||| -find_path_to_ancestors |||| -get_family_members |||| -get_parents |||| -get_ancestors |||| -get_families |||| -marriage_range_match |||| -marriage_year_match |||| -get_marriage_years |||| -print_gedcom |||| -save_gedcom |||| - - -## Quirk Handling - -Some genealogical websites like MyHeritage and Ancestry produce GEDCOM files that are somewhat out of spec. This module aims to handle these quirks. 
Specifically that includes: - -- Ignoring BOM control characters -- Multi-line fields that don't use CONC or CONT -- Handle the last line not ending in a CRLF +Method | Parameters | Returns | Description +------------------------|------------|---------|------------ +get_root_element | none | Element root | Returns the virtual "root" individual +get_root_child_elements | none | List of Element | Returns a List of all Elements +get_element_dictionary | none | Dict of Element | Returns a Dict of all Elements +get_element_list | none | List of Element | Returns a List of all Elements +get_marriages | Element individual | List of Marriage ("Date", "Place") | Returns List of Tuples of Marriage data (Date and Place) +find_path_to_ancestors | Element descendant, Element ancestor|| +get_family_members | Element individual, optional String members_type - one of "ALL" (default), "PARENTS", "HUSB", "WIFE", "CHIL" | List of Element individuals|| +get_parents | Element individual, optional String parent_type - one of "ALL" (default) or "NAT" | List of Element individuals| +get_ancestors | Element individual, optional String ancestor_type - one of "All" (default) or "NAT" || +get_families | Element individual optional String family_type - one of "FAMS" (default), "FAMC"|| +marriage_range_match | Element individual, Int from, Int to| Boolean | Check if individual is married within the specified range +marriage_year_match | Element individual, Int year| Boolean | Check if individual is married in the year specified +get_marriage_years | Element individual |List of Int| Returns Marriage event years +print_gedcom | none | none | Prints the gedcom to STDOUT +save_gedcom | String filename | none | Writes gedcom to specified filename ## History From 422273edc6da2c8ffaea40a04ee1503674c6a936 Mon Sep 17 00:00:00 2001 From: Damon Brodie Date: Mon, 19 Nov 2018 11:29:23 -0500 Subject: [PATCH 15/15] Add docs for element operations --- README.md | 24 ++++++++++++------------ 1 file changed, 12 
insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 441fa43..55b3d15 100644 --- a/README.md +++ b/README.md @@ -71,18 +71,18 @@ The Element class contains all the information for a single record in the GEDCOM Method | Parameters | Returns | Description -----------------------|------------|---------|------------ -get_child_elements ||| -get_parent_element ||| -new_child_element ||| -add_child_element ||| -set_parent_element | | | -is_individual | none | Boolean | Is this record of a person -is_family | none | Boolean | -is_file | none | Boolean | -is_object | none | Boolean | -is_private | none | Boolean | Returns True if the record is marked Private -is_deceased | none | Boolean | Returns True if the individual is marked deceased -criteria_match |colon separated string "surname=[name]:name=[name]:birth][year]:birth_range=[year-to-year]:death=[year]:death_range[year-to-year]"| Boolean | Returns True if the criteria matches +get_child_elements | none | List of Element | Returns all the child elements of this record +get_parent_element | none | Element | Returns parent Element +new_child_element | String tag, String pointer, String value | Element | Create a new Element +add_child_element | Element child | Element | Adds the child record +set_parent_element | Element parent| none | Not normally required to be called (add_child_element calls this automatically) +is_individual | none | Boolean | Is this record of a person +is_family | none | Boolean | +is_file | none | Boolean | +is_object | none | Boolean | +is_private | none | Boolean | Returns True if the record is marked Private +is_deceased | none | Boolean | Returns True if the individual is marked deceased +criteria_match | colon separated string "surname=[name]:name=[name]:birth=[year]:birth_range=[year-to-year]:death=[year]:death_range=[year-to-year]"| Boolean | Returns True if the criteria matches surname_match | String | Boolean | Returns True if substring matches given_match | String | 
Boolean | Returns True if subscring matches death_range_match | Int from, Int to | Boolean | Returns True if Death Year is in the supplied range