From 967e0b6d150efc1ebc12a08bec96ac4bcc684d58 Mon Sep 17 00:00:00 2001 From: sb Date: Tue, 18 Oct 2022 10:04:01 -0400 Subject: [PATCH 1/3] Moves config.py and config.yaml into bigbang/ dir. Addresses #569 --- bigbang/analysis/listserv.py | 2 +- bigbang/analysis/repo_loader.py | 2 +- bigbang/analysis/utils.py | 2 +- bigbang/archive.py | 2 +- bigbang/bigbang_io.py | 6 +- bigbang/config.py | 27 ++++++ {config => bigbang}/config.yml | 2 +- {config => bigbang}/icann_certificate.pem | 0 bigbang/ingress/abstract.py | 10 +-- bigbang/ingress/git_repo.py | 2 +- bigbang/ingress/listserv.py | 2 +- bigbang/ingress/mailman.py | 2 +- bigbang/ingress/pipermail.py | 84 ++++++++++--------- bigbang/ingress/utils.py | 2 +- bigbang/ingress/w3c.py | 2 +- bigbang/utils.py | 2 +- bin/collect_draft_metadata.py | 60 ++++++------- bin/collect_listserv_urls.py | 6 +- config/__init__.py | 0 config/config.py | 23 ----- .../Classifying Email Domains.ipynb | 2 +- .../organizations/Full Archive Study.ipynb | 2 +- tests/analysis/test_listserv.py | 2 +- tests/analysis/test_utils.py | 58 +++++++++++++ tests/ingress/test_listserv.py | 2 +- tests/ingress/test_pipermail.py | 15 ++-- tests/ingress/test_w3c.py | 2 +- tests/test_bigbang_io.py | 3 +- tests/unit/test_archive.py | 2 +- tests/unit/test_bigbang.py | 2 +- tests/unit/test_listserv.py | 3 +- tests/unit/test_mailman.py | 2 +- 32 files changed, 203 insertions(+), 130 deletions(-) create mode 100644 bigbang/config.py rename {config => bigbang}/config.yml (97%) rename {config => bigbang}/icann_certificate.pem (100%) delete mode 100644 config/__init__.py delete mode 100644 config/config.py create mode 100644 tests/analysis/test_utils.py diff --git a/bigbang/analysis/listserv.py b/bigbang/analysis/listserv.py index 938cbd62..c55dd397 100644 --- a/bigbang/analysis/listserv.py +++ b/bigbang/analysis/listserv.py @@ -19,7 +19,7 @@ import yaml from bs4 import BeautifulSoup -from config.config import CONFIG +from bigbang.config import CONFIG import bigbang.bigbang_io as bio from bigbang.utils import ( diff --git a/bigbang/analysis/repo_loader.py b/bigbang/analysis/repo_loader.py index 68189ff6..4f6988a7 100644 --- a/bigbang/analysis/repo_loader.py +++ b/bigbang/analysis/repo_loader.py @@ -12,7 +12,7 @@ from nbconvert import PythonExporter import nbformat -from config.config import CONFIG +from bigbang.config import CONFIG from bigbang.ingress.git_repo import GitRepo, MultiGitRepo diff --git a/bigbang/analysis/utils.py b/bigbang/analysis/utils.py index 78cef893..bc06477a 100644 --- a/bigbang/analysis/utils.py +++ b/bigbang/analysis/utils.py @@ -9,7 +9,7 @@ from pathlib import Path import numpy as np import pandas as pd -from config.config import CONFIG +from bigbang.config import CONFIG filepath_auth = CONFIG.config_path + "authentication.yaml" directory_project = str(Path(os.path.abspath(__file__)).parent.parent) diff --git a/bigbang/archive.py b/bigbang/archive.py index 6093db55..63d5c467 100644 --- a/bigbang/archive.py +++ b/bigbang/archive.py @@ -19,7 +19,7 @@ from bigbang.parse import get_date, get_text import bigbang.analysis.process as process from bigbang.analysis.thread import Node, Thread -from config.config import CONFIG +from bigbang.config import CONFIG from . import utils diff --git a/bigbang/bigbang_io.py b/bigbang/bigbang_io.py index bbe58fb3..c0becf15 100644 --- a/bigbang/bigbang_io.py +++ b/bigbang/bigbang_io.py @@ -10,7 +10,7 @@ from pathlib import Path import numpy as np import pandas as pd -from config.config import CONFIG +from bigbang.config import CONFIG from bigbang.analysis import utils from bigbang.data_types import Message, MailList, MailListDomain @@ -196,7 +196,7 @@ def mlist_to_mbox( msgs: MailList, dir_out: str, filename: str, - mode: Optional[str]='w', + mode: Optional[str] = "w", ) -> None: """ Saves a List[mailbox.mboxMessage] as .mbox file. @@ -213,7 +213,7 @@ def mlist_to_mbox( # create filepath filepath = f"{dir_out}/{filename}.mbox" # delete file if there is one at the filepath - if Path(filepath).is_file() and mode == 'w': + if Path(filepath).is_file() and mode == "w": Path(filepath).unlink() mbox = mailbox.mbox(filepath) mbox.lock() diff --git a/bigbang/config.py b/bigbang/config.py new file mode 100644 index 00000000..beaf0ce0 --- /dev/null +++ b/bigbang/config.py @@ -0,0 +1,27 @@ +import yaml +import os + +bigbang_path = os.path.dirname(os.path.realpath(__file__)) +base_loc = os.path.abspath( + os.path.join(bigbang_path, os.pardir) +) # parent directory of config directory +config_filepath = os.path.join(base_loc, "bigbang", "config.yml") +stream = open(config_filepath, "r") +dictionary = yaml.safe_load(stream) + + +class Config(object): + def __init__(self, conf): + self.CONFIG = conf + + def __getattr__(self, query): + if query in self.CONFIG: + ans = self.CONFIG[query] + if "path" in query: + ans = os.path.join(base_loc, ans) + return ans + else: + return None + + +CONFIG = Config(dictionary) diff --git a/config/config.yml b/bigbang/config.yml similarity index 97% rename from config/config.yml rename to bigbang/config.yml index 8fb18df1..593dbb44 100644 --- a/config/config.yml +++ b/bigbang/config.yml @@ -10,7 +10,7 @@ # For Configuration to work properly, paths need to have "path" # in their keyword. # NOTE: BE CAFEFUL NOT TO PUT 'path' IN OTHER ATTRIBUTE NAMES -config_path : "config/" +config_path : "bigbang/" repo_path : "archives/sample-git-repos/" mail_path : "archives/" datatracker_path : "archives/datatracker" diff --git a/config/icann_certificate.pem b/bigbang/icann_certificate.pem similarity index 100% rename from config/icann_certificate.pem rename to bigbang/icann_certificate.pem diff --git a/bigbang/ingress/abstract.py b/bigbang/ingress/abstract.py index de273afc..4934c85c 100644 --- a/bigbang/ingress/abstract.py +++ b/bigbang/ingress/abstract.py @@ -15,7 +15,7 @@ import yaml from bs4 import BeautifulSoup -from config.config import CONFIG +from bigbang.config import CONFIG from bigbang.utils import get_paths_to_files_in_directory import bigbang.bigbang_io as bio from bigbang.data_types import Message, MailList @@ -108,7 +108,7 @@ def create_email_message( for key, value in header.items(): if "from" == key: - value = value.replace(" at ", '@') + value = value.replace(" at ", "@") if "content-type" == key: msg.set_param("Content-Type", value) @@ -157,12 +157,12 @@ def from_url( the Email. The latter is the default. """ soup = get_website_content(url, session=self.session) - + if soup == "RequestException": header = self.empty_header body = "RequestException" attachments = "RequestException" - + else: if fields in ["header", "total"]: header = self._get_header_from_html(soup) @@ -179,7 +179,7 @@ def from_url( else: body = None attachments = None - + return self.create_email_message(url, body, attachments, **header) @staticmethod diff --git a/bigbang/ingress/git_repo.py b/bigbang/ingress/git_repo.py index fe8d8ef9..8e77fcc7 100644 --- a/bigbang/ingress/git_repo.py +++ b/bigbang/ingress/git_repo.py @@ -8,7 +8,7 @@ from git import * from git import Repo -from config.config import CONFIG +from bigbang.config import CONFIG from bigbang import utils from bigbang.analysis.entity_resolution import entity_resolve diff --git a/bigbang/ingress/listserv.py b/bigbang/ingress/listserv.py index 67262f10..b42aa2b8 100644 --- a/bigbang/ingress/listserv.py +++ b/bigbang/ingress/listserv.py @@ -20,7 +20,7 @@ import yaml from bs4 import BeautifulSoup -from config.config import CONFIG +from bigbang.config import CONFIG import bigbang.bigbang_io as bio from bigbang.data_types import MailList diff --git a/bigbang/ingress/mailman.py b/bigbang/ingress/mailman.py index 5389cac1..b29250ca 100644 --- a/bigbang/ingress/mailman.py +++ b/bigbang/ingress/mailman.py @@ -22,7 +22,7 @@ import bigbang.archive as archive -from config.config import CONFIG +from bigbang.config import CONFIG from . import listserv, w3c, pipermail from .. import parse diff --git a/bigbang/ingress/pipermail.py b/bigbang/ingress/pipermail.py index 809d3d72..db941c98 100644 --- a/bigbang/ingress/pipermail.py +++ b/bigbang/ingress/pipermail.py @@ -20,7 +20,7 @@ import yaml from bs4 import BeautifulSoup -from config.config import CONFIG +from bigbang.config import CONFIG import bigbang.bigbang_io as bio from bigbang.data_types import MailList @@ -103,17 +103,23 @@ def from_pipermail_file( fields: str = "total", ) -> mboxMessage: """ """ - header_start_line_nr = self.find_start_of_header(fcontent, header_end_line_nr) + header_start_line_nr = self.find_start_of_header( + fcontent, header_end_line_nr + ) if header_start_line_nr is None: - logger.info("The start of header in {list_name}" +\ - " {header_end_line_nr} couldnt be found.") - print(f"The start of header in {list_name}" +\ - f"{header_end_line_nr} couldnt be found.") + logger.info( + "The start of header in {list_name}" + + " {header_end_line_nr} couldnt be found." + ) + print( + f"The start of header in {list_name}" + + f"{header_end_line_nr} couldnt be found." + ) archived_at = None body = None header = {} - + else: if fields in ["header", "total"]: header = self._get_header_from_pipermail_file( @@ -128,7 +134,7 @@ def from_pipermail_file( else: body = None archived_at = f"{list_name}_line_nr_{header_start_line_nr}" - + return self.create_email_message(archived_at, body, **header) def _get_header_from_pipermail_file( @@ -146,7 +152,7 @@ def _get_header_from_pipermail_file( """ fheader = fcontent[header_start_line_nr:header_end_line_nr] header = {} - + for lnr in range(len(fheader)): line = fheader[lnr] # get header keyword and value @@ -154,9 +160,9 @@ def _get_header_from_pipermail_file( key = line.split(":")[0] value = line.replace(key + ":", "").strip().rstrip("\n") header[key.lower()] = value - + return header - + def _get_body_from_pipermail_file( self, fcontent: List[str], @@ -169,21 +175,21 @@ def _get_body_from_pipermail_file( # remove empty lines and join into one string body = ("\n").join([line for line in body if len(line) > 1]) return body - + def find_start_of_header( self, fcontent: List[str], header_end_line_nr: int, ) -> int: header_start_line_nr = None - + for i in range(200): # 200 lines up just to make sure... - if fcontent[header_end_line_nr - i - 1] == '': + if fcontent[header_end_line_nr - i - 1] == "": header_start_line_nr = header_end_line_nr - i + 1 break - + return header_start_line_nr - + def find_end_of_body( self, fcontent: List[str], @@ -191,15 +197,15 @@ def find_end_of_body( ) -> int: found = False line_nr = body_start_line_nr + 2 - + while found is False: line_nr += 1 if line_nr >= len(fcontent): body_end_line_nr = -1 found = True - elif fcontent[line_nr].startswith('Message-ID:'): + elif fcontent[line_nr].startswith("Message-ID:"): for i in range(200): - if 'From:' in fcontent[line_nr - i]: + if "From:" in fcontent[line_nr - i]: body_end_line_nr = line_nr - i - 2 found = True break @@ -246,7 +252,7 @@ def from_url( name: str, url: str, select: Optional[dict] = {"fields": "total"}, - instant_save: Optional[bool]=True, + instant_save: Optional[bool] = True, ) -> "PipermailMailList": """Docstring in `AbstractMailList`.""" if "fields" not in list(select.keys()): @@ -281,7 +287,7 @@ def from_period_urls( url: str, period_urls: List[str], fields: str = "total", - instant_save: Optional[bool]=True, + instant_save: Optional[bool] = True, ) -> "PipermailMailList": """ Parameters @@ -294,18 +300,18 @@ def from_period_urls( period_url, verify=f"{directory_project}/config/icann_certificate.pem", ) - + try: fcontent = gzip.decompress(file.content).decode("utf-8") except Exception: print(f"File {period_url} in {name} could not be decoded") continue - - fcontent = fcontent.split('\n') + + fcontent = fcontent.split("\n") header_end_line_nrs = [ - idx+1 + idx + 1 for idx, fl in enumerate(fcontent) - if fl.startswith('Message-ID:') + if fl.startswith("Message-ID:") ] for header_end_line_nr in header_end_line_nrs: msgs.append( @@ -315,10 +321,13 @@ def from_period_urls( ) if (len(msgs) > 1e3) and (instant_save): bio.mlist_to_mbox( - msgs, CONFIG.mail_path+"ICANN/", name, 'a', + msgs, + CONFIG.mail_path + "ICANN/", + name, + "a", ) msgs = [] - + return cls(name, url, msgs) @classmethod @@ -388,32 +397,28 @@ def get_all_periods_and_their_urls( ) periods = [] urls_of_periods = [] - + if soup != "RequestException": rows = soup.select(f'a[href*=".txt.gz"]') for row in rows: - filename = row.get("href") + filename = row.get("href") if filename.endswith(".txt.gz") is False: continue year = re.findall(r"\d{4}", filename)[0] - month = filename.split('.')[0].replace(f"{year}-", '') + month = filename.split(".")[0].replace(f"{year}-", "") periods.append(f"{month} {year}") urls_of_periods.append(url + "/" + filename) - + return periods, urls_of_periods @staticmethod def get_name_from_url(url: str) -> str: """Get name of mailing list.""" - return url.split('/')[-1] - + return url.split("/")[-1] -class PipermailMailListDomain(): - - def __init__( - self, name: str, lists: List[Union[AbstractMailList, str]] - ): +class PipermailMailListDomain: + def __init__(self, name: str, lists: List[Union[AbstractMailList, str]]): self.name = name self.lists = lists @@ -461,7 +466,6 @@ def from_mailing_lists( return cls(name, lists) - def text_for_selector(soup: BeautifulSoup, selector: str): """ Filter out header or body field from website and return them as utf-8 string. diff --git a/bigbang/ingress/utils.py b/bigbang/ingress/utils.py index 0ddc1927..771f0b3f 100644 --- a/bigbang/ingress/utils.py +++ b/bigbang/ingress/utils.py @@ -15,7 +15,7 @@ import networkx as nx import pandas as pd -from config.config import CONFIG +from bigbang.config import CONFIG filepath_auth = CONFIG.config_path + "authentication.yaml" directory_project = str(Path(os.path.abspath(__file__)).parent.parent) diff --git a/bigbang/ingress/w3c.py b/bigbang/ingress/w3c.py index adcd726d..73112a9a 100644 --- a/bigbang/ingress/w3c.py +++ b/bigbang/ingress/w3c.py @@ -16,7 +16,7 @@ import yaml from bs4 import BeautifulSoup -from config.config import CONFIG +from bigbang.config import CONFIG import bigbang.bigbang_io as bio from bigbang.data_types import MailList diff --git a/bigbang/utils.py b/bigbang/utils.py index 384560fb..f0efb266 100644 --- a/bigbang/utils.py +++ b/bigbang/utils.py @@ -14,7 +14,7 @@ import networkx as nx import pandas as pd -from config.config import CONFIG +from bigbang.config import CONFIG filepath_auth = CONFIG.config_path + "authentication.yaml" directory_project = str(Path(os.path.abspath(__file__)).parent.parent) diff --git a/bin/collect_draft_metadata.py b/bin/collect_draft_metadata.py index 97af9e6f..20df9a44 100644 --- a/bin/collect_draft_metadata.py +++ b/bin/collect_draft_metadata.py @@ -1,48 +1,45 @@ -from config.config import CONFIG +from bigbang.config import CONFIG from ietfdata.datatracker import * import os import pandas as pd ## set up directory for storing metadata files + def setup_path(wg): if not os.path.exists(CONFIG.datatracker_path): os.makedirs(CONFIG.datatracker_path) - wg_path = os.path.join(CONFIG.datatracker_path, - wg) + wg_path = os.path.join(CONFIG.datatracker_path, wg) if not os.path.exists(wg_path): os.makedirs(wg_path) return wg_path + ## metadata extraction function + def extract_data(doc, dt): data = {} - data['title'] = doc.title + data["title"] = doc.title ## TODO: do this in only one loop over authors - data['person'] = [ - dt.person(doc_author.person) - for doc_author - in dt.document_authors(doc) + data["person"] = [ + dt.person(doc_author.person) for doc_author in dt.document_authors(doc) ] - data['affiliation'] = [ - doc_author.affiliation - for doc_author - in dt.document_authors(doc) + data["affiliation"] = [ + doc_author.affiliation for doc_author in dt.document_authors(doc) ] - data['group-acronym'] = dt.group(doc.group).acronym - data['type'] = doc.type.uri + data["group-acronym"] = dt.group(doc.group).acronym + data["type"] = doc.type.uri # use submissions for dates sub_data = [ - {'date' : dt.submission(sub_url).document_date} - for sub_url - in doc.submissions + {"date": dt.submission(sub_url).document_date} + for sub_url in doc.submissions ] for sd in sub_data: @@ -62,30 +59,34 @@ def collect_drafts(wg): if group is None: raise Exception(f"Group {wg} not found in datatracker") - ## Begin execution print("Collecting drafts from datatracker") # This returns a generator drafts = dt.documents( - group = group, - doctype = dt.document_type( - DocumentTypeURI("/api/v1/name/doctypename/draft/"))) + group=group, + doctype=dt.document_type( + DocumentTypeURI("/api/v1/name/doctypename/draft/") + ), + ) fn = os.path.join(wg_path, "draft_metadata.csv") - collection = [sub_data - for draft in drafts - for sub_data in extract_data(draft, dt)] + collection = [ + sub_data for draft in drafts for sub_data in extract_data(draft, dt) + ] draft_df = pd.DataFrame(collection) draft_df.to_csv(fn) + import argparse from argparse import RawTextHelpFormatter import logging -parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description=r""" +parser = argparse.ArgumentParser( + formatter_class=RawTextHelpFormatter, + description=r""" Collects files from public mailing list archives. Please include an IETF working group acronym @@ -94,20 +95,21 @@ def collect_drafts(wg): python bin/datatracker.py -w httpbis -""") -parser.add_argument('-w', type=str, help='IETF working group acronym') +""", +) +parser.add_argument("-w", type=str, help="IETF working group acronym") args = parser.parse_args() logging.basicConfig(level=logging.INFO) + def main(args): if args.w: collect_drafts(args.w) else: raise Exception("No working group given") + if __name__ == "__main__": main(args) - - diff --git a/bin/collect_listserv_urls.py b/bin/collect_listserv_urls.py index 009804cb..ee4bdc7b 100644 --- a/bin/collect_listserv_urls.py +++ b/bin/collect_listserv_urls.py @@ -4,7 +4,7 @@ import sys from argparse import RawTextHelpFormatter -from config.config import CONFIG +from bigbang.config import CONFIG from bigbang.listserv import ListservArchive march = ListservArchive.from_url( @@ -20,5 +20,5 @@ "w", ) for element in march.lists: - textfile.write(element+"\n") -textfile. close() + textfile.write(element + "\n") +textfile.close() diff --git a/config/__init__.py b/config/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/config/config.py b/config/config.py deleted file mode 100644 index d76a3a29..00000000 --- a/config/config.py +++ /dev/null @@ -1,23 +0,0 @@ -import yaml -import os - -file_path = os.path.dirname(os.path.realpath(__file__)) -base_loc = os.path.abspath(os.path.join(file_path, os.pardir)) # parent directory of config directory -config_filepath = os.path.join(base_loc, "config", "config.yml") -stream = open(config_filepath, "r") -dictionary = yaml.safe_load(stream) - -class Config(object): - def __init__(self, conf): - self.CONFIG = conf; - - def __getattr__(self, query): - if query in self.CONFIG: - ans = self.CONFIG[query]; - if "path" in query: - ans = os.path.join(base_loc, ans) - return ans - else: - return None - -CONFIG = Config(dictionary) diff --git a/examples/organizations/Classifying Email Domains.ipynb b/examples/organizations/Classifying Email Domains.ipynb index 1de73c2c..d4607f8d 100644 --- a/examples/organizations/Classifying Email Domains.ipynb +++ b/examples/organizations/Classifying Email Domains.ipynb @@ -18,7 +18,7 @@ "outputs": [], "source": [ "from bigbang.archive import Archive\n", - "from config.config import CONFIG\n", + "from bigbang.config import CONFIG\n", "import bigbang.datasets.domains as domains\n", "import bigbang.analysis.utils as utils\n", "import numpy as np\n", diff --git a/examples/organizations/Full Archive Study.ipynb b/examples/organizations/Full Archive Study.ipynb index c6e0bf7d..16a86f57 100644 --- a/examples/organizations/Full Archive Study.ipynb +++ b/examples/organizations/Full Archive Study.ipynb @@ -8,7 +8,7 @@ "outputs": [], "source": [ "from bigbang.archive import Archive\n", - "from config.config import CONFIG\n", + "from bigbang.config import CONFIG\n", "import bigbang.analysis.utils as utils\n", "import bigbang.datasets.domains as domains\n", "import os\n", diff --git a/tests/analysis/test_listserv.py b/tests/analysis/test_listserv.py index 130731fb..81617486 100644 --- a/tests/analysis/test_listserv.py +++ b/tests/analysis/test_listserv.py @@ -11,7 +11,7 @@ from bigbang.analysis.listserv import ListservMailListDomain from bigbang.analysis.listserv import ListservMailList -from config.config import CONFIG +from bigbang.config import CONFIG dir_temp = tempfile.gettempdir() file_temp_mbox = dir_temp + "/listserv.mbox" diff --git a/tests/analysis/test_utils.py b/tests/analysis/test_utils.py new file mode 100644 index 00000000..5d3e14a7 --- /dev/null +++ b/tests/analysis/test_utils.py @@ -0,0 +1,58 @@ +import bigbang.analysis.utils as utils + +import unittest +import yaml as yaml + +import os + +TEST_FILENAME = os.path.join( + os.path.dirname(__file__), "../data/address_header_test_file.yaml" +) + + +def extract_email(from_field): + """ + Returns an email address from a string. + """ + match = re.search(email_regex, from_field) + + if match is not None: + return match[0].lower() + + else: + return None + + +def extract_domain(from_field): + """ + Returns the domain of an email address from a string. + """ + match = re.search(email_regex, from_field) + + if match is not None: + return re.search(domain_regex, match[0])[1] + + else: + return None + + +""" +class TestExtractEmail(unittest.TestCase): + def test_extract_email(self): + with open(TEST_FILENAME) as file: + data = file.read() + + dct = yaml.safe_load(data) + + +class TestExtractDomain(unittest.TestCase): + def test_extract_domain(self): + with open(TEST_FILENAME) as file: + data = file.read() + + dct = yaml.safe_load(data) + + for header in dct: + import pdb; pdb.set_trace() + self.assertEqual(utils.extract_domain(header), dct[header]['domain'].lower()) +""" diff --git a/tests/ingress/test_listserv.py b/tests/ingress/test_listserv.py index 5e4845aa..94cc3cd1 100644 --- a/tests/ingress/test_listserv.py +++ b/tests/ingress/test_listserv.py @@ -14,7 +14,7 @@ ListservMailListDomain, ) from bigbang.ingress.utils import get_login_from_terminal -from config.config import CONFIG +from bigbang.config import CONFIG dir_temp = tempfile.gettempdir() url_mlistdom = "https://listserv.ieee.org/cgi-bin/wa?" diff --git a/tests/ingress/test_pipermail.py b/tests/ingress/test_pipermail.py index d81e9bfd..cc6f1731 100644 --- a/tests/ingress/test_pipermail.py +++ b/tests/ingress/test_pipermail.py @@ -14,7 +14,12 @@ PipermailMessageParser, PipermailMailList, ) -from config.config import CONFIG +from bigbang.config import CONFIG + +# import pdb; +# pdb.set_trace() + +# f"{CONFIG.bigbang_path}/icann_certificate.pem" directory_project = str(Path(os.path.abspath(__file__)).parent.parent) url_mlistdom = "https://mm.icann.org/pipermail/" @@ -29,7 +34,7 @@ def get_message(self): verify=f"{CONFIG.config_path}/icann_certificate.pem", ) fcontent = gzip.decompress(file.content).decode("utf-8") - fcontent = fcontent.split('\n') + fcontent = fcontent.split("\n") msg_parser = PipermailMessageParser(website=False) msg = msg_parser.from_pipermail_file( list_name="accred-model", @@ -40,7 +45,7 @@ def get_message(self): return msg def test__message_content(self, msg): - firstline = msg.get_payload().split('=')[0] + firstline = msg.get_payload().split("=")[0] assert "Theo, hope you are well." in firstline assert len(firstline) == 635 assert msg["subject"] == "[Accred-Model] Codes of conduct" @@ -48,7 +53,7 @@ def test__message_content(self, msg): assert msg["to"] is None assert msg["date"] == "Wed, 01 Aug 2018 13:29:58 +0300" assert msg["Content-Type"] == 'text/plain; charset="utf-8"' - assert msg["Archived-At"] == '' + assert msg["Archived-At"] == "" def test__to_dict(self, msg): dic = PipermailMessageParser.to_dict(msg) @@ -74,7 +79,7 @@ def test__mailinglist_content(self, mlist): # On 13/09/22 the mailing list contained 175 Emails. assert len(mlist) >= 175 subjects = [msg["subject"] for msg in mlist.messages] - assert '[Accred-Model] Codes of conduct' in subjects + assert "[Accred-Model] Codes of conduct" in subjects def test__to_dict(self, mlist): dic = mlist.to_dict() diff --git a/tests/ingress/test_w3c.py b/tests/ingress/test_w3c.py index 148ea674..b8e3a9f0 100644 --- a/tests/ingress/test_w3c.py +++ b/tests/ingress/test_w3c.py @@ -13,7 +13,7 @@ W3CMailList, W3CMailListDomain, ) -from config.config import CONFIG +from bigbang.config import CONFIG url_mlistdom = "https://lists.w3.org/Archives/Public/" url_list = url_mlistdom + "public-testtwf/" diff --git a/tests/test_bigbang_io.py b/tests/test_bigbang_io.py index 514f7f78..44d3bcac 100644 --- a/tests/test_bigbang_io.py +++ b/tests/test_bigbang_io.py @@ -12,7 +12,7 @@ from bigbang.analysis.listserv import ListservMailList import bigbang.bigbang_io as bio -from config.config import CONFIG +from bigbang.config import CONFIG dir_temp = tempfile.gettempdir() file_temp_mbox = dir_temp + "/listserv.mbox" @@ -28,6 +28,7 @@ def get_mailinglist(): ) return mlist + def test__mlist_to_list_of_mboxMessage(mlist): mbox = bio.mlist_to_list_of_mboxMessage(mlist.df, include_body=False) msgcount = len(mbox) diff --git a/tests/unit/test_archive.py b/tests/unit/test_archive.py index e030946f..f60bd1dc 100644 --- a/tests/unit/test_archive.py +++ b/tests/unit/test_archive.py @@ -7,7 +7,7 @@ import pandas as pd import bigbang.archive as archive -from config.config import CONFIG +from bigbang.config import CONFIG test_txt = "" TEMP_DIR = os.path.join(CONFIG.test_data_path, "tmp") diff --git a/tests/unit/test_bigbang.py b/tests/unit/test_bigbang.py index 4582fe84..82bebf67 100644 --- a/tests/unit/test_bigbang.py +++ b/tests/unit/test_bigbang.py @@ -17,7 +17,7 @@ import bigbang.analysis.process as process import bigbang.utils as utils from bigbang.analysis import repo_loader -from config.config import CONFIG +from bigbang.config import CONFIG test_txt = "" TEMP_DIR = os.path.join(CONFIG.test_data_path, "tmp") diff --git a/tests/unit/test_listserv.py b/tests/unit/test_listserv.py index 136f5298..0817bfa1 100644 --- a/tests/unit/test_listserv.py +++ b/tests/unit/test_listserv.py @@ -13,7 +13,7 @@ ListservMailList, ListservMailListDomain, ) -from config.config import CONFIG +from bigbang.config import CONFIG dir_temp = tempfile.gettempdir() file_temp_mbox = dir_temp + "/listserv.mbox" @@ -82,7 +82,6 @@ def test__to_mbox(self, msg_parser, msg): class TestListservMailList: - def test__from_mbox(self): mlist_name = "3GPP_TSG_SA_WG4_EVS" mlist = ListservMailList.from_mbox( diff --git a/tests/unit/test_mailman.py b/tests/unit/test_mailman.py index e3ea5d2d..2e2a156b 100644 --- a/tests/unit/test_mailman.py +++ b/tests/unit/test_mailman.py @@ -14,7 +14,7 @@ import bigbang.archive as archive import bigbang.ingress.mailman as mailman -from config.config import CONFIG +from bigbang.config import CONFIG test_txt = "" TEMP_DIR = os.path.join(CONFIG.test_data_path, "tmp") From fa38080784174a660703c9ddd1e1a157d15d64e8 Mon Sep 17 00:00:00 2001 From: sb Date: Fri, 21 Oct 2022 09:42:14 -0400 Subject: [PATCH 2/3] fixing pipermail ingress and tests for new config location --- bigbang/ingress/pipermail.py | 6 ++++-- tests/ingress/test_pipermail.py | 8 +------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/bigbang/ingress/pipermail.py b/bigbang/ingress/pipermail.py index db941c98..0deb9bff 100644 --- a/bigbang/ingress/pipermail.py +++ b/bigbang/ingress/pipermail.py @@ -298,7 +298,9 @@ def from_period_urls( for period_url in tqdm(period_urls, ascii=True, desc=name): file = requests.get( period_url, - verify=f"{directory_project}/config/icann_certificate.pem", + verify=os.path.join( + CONFIG.config_path, "icann_certificate.pem" + ), ) try: @@ -393,7 +395,7 @@ def get_all_periods_and_their_urls( time.sleep(0.5) soup = get_website_content( url, - verify=f"{directory_project}/config/icann_certificate.pem", + verify=os.path.join(CONFIG.config_path, "icann_certificate.pem"), ) periods = [] urls_of_periods = [] diff --git a/tests/ingress/test_pipermail.py b/tests/ingress/test_pipermail.py index cc6f1731..8251dc6b 100644 --- a/tests/ingress/test_pipermail.py +++ b/tests/ingress/test_pipermail.py @@ -9,18 +9,12 @@ import pytest import yaml -import bigbang from bigbang.ingress import ( PipermailMessageParser, PipermailMailList, ) from bigbang.config import CONFIG -# import pdb; -# pdb.set_trace() - -# f"{CONFIG.bigbang_path}/icann_certificate.pem" - directory_project = str(Path(os.path.abspath(__file__)).parent.parent) url_mlistdom = "https://mm.icann.org/pipermail/" url_list = url_mlistdom + "accred-model" @@ -31,7 +25,7 @@ class TestPipermailMessageParser: def get_message(self): file = requests.get( "https://mm.icann.org/pipermail/accred-model/2018-August.txt.gz", - verify=f"{CONFIG.config_path}/icann_certificate.pem", + verify=os.path.join(CONFIG.config_path, "icann_certificate.pem"), ) fcontent = gzip.decompress(file.content).decode("utf-8") fcontent = fcontent.split("\n") From bda97dbed9cc7e51ce5c496b7fd88586337df395 Mon Sep 17 00:00:00 2001 From: sb Date: Thu, 10 Nov 2022 16:08:49 -0500 Subject: [PATCH 3/3] remove test_utils --- tests/analysis/test_utils.py | 58 ------------------------------------ 1 file changed, 58 deletions(-) delete mode 100644 tests/analysis/test_utils.py diff --git a/tests/analysis/test_utils.py b/tests/analysis/test_utils.py deleted file mode 100644 index 5d3e14a7..00000000 --- a/tests/analysis/test_utils.py +++ /dev/null @@ -1,58 +0,0 @@ -import bigbang.analysis.utils as utils - -import unittest -import yaml as yaml - -import os - -TEST_FILENAME = os.path.join( - os.path.dirname(__file__), "../data/address_header_test_file.yaml" -) - - -def extract_email(from_field): - """ - Returns an email address from a string. - """ - match = re.search(email_regex, from_field) - - if match is not None: - return match[0].lower() - - else: - return None - - -def extract_domain(from_field): - """ - Returns the domain of an email address from a string. - """ - match = re.search(email_regex, from_field) - - if match is not None: - return re.search(domain_regex, match[0])[1] - - else: - return None - - -""" -class TestExtractEmail(unittest.TestCase): - def test_extract_email(self): - with open(TEST_FILENAME) as file: - data = file.read() - - dct = yaml.safe_load(data) - - -class TestExtractDomain(unittest.TestCase): - def test_extract_domain(self): - with open(TEST_FILENAME) as file: - data = file.read() - - dct = yaml.safe_load(data) - - for header in dct: - import pdb; pdb.set_trace() - self.assertEqual(utils.extract_domain(header), dct[header]['domain'].lower()) -"""