From 3be2401884ab7add8e8ee8b4b6b5a21e153f4c9d Mon Sep 17 00:00:00 2001 From: rfm-targa Date: Thu, 11 Jul 2024 19:14:32 +0100 Subject: [PATCH] Converting to tabs. --- CHEWBBACA/utils/chewiens_requests.py | 864 ++++----- CHEWBBACA/utils/constants.py | 316 ++-- CHEWBBACA/utils/distance_matrix.py | 738 ++++---- CHEWBBACA/utils/fasttree_wrapper.py | 38 +- CHEWBBACA/utils/file_operations.py | 1050 +++++------ CHEWBBACA/utils/gene_prediction.py | 422 ++--- CHEWBBACA/utils/iterables_manipulation.py | 1646 ++++++++--------- CHEWBBACA/utils/join_profiles.py | 172 +- CHEWBBACA/utils/mafft_wrapper.py | 72 +- CHEWBBACA/utils/multiprocessing_operations.py | 420 ++--- CHEWBBACA/utils/process_datetime.py | 220 +-- CHEWBBACA/utils/profile_hasher.py | 360 ++-- CHEWBBACA/utils/profiles_sqlitedb.py | 1190 ++++++------ CHEWBBACA/utils/sequence_clustering.py | 936 +++++----- CHEWBBACA/utils/sequence_manipulation.py | 1166 ++++++------ CHEWBBACA/utils/uniprot_requests.py | 578 +++--- 16 files changed, 5094 insertions(+), 5094 deletions(-) diff --git a/CHEWBBACA/utils/chewiens_requests.py b/CHEWBBACA/utils/chewiens_requests.py index 6a546217..a798d56a 100644 --- a/CHEWBBACA/utils/chewiens_requests.py +++ b/CHEWBBACA/utils/chewiens_requests.py @@ -19,480 +19,480 @@ from urllib.parse import urlparse, urlencode, urlsplit, parse_qs try: - from utils import constants as ct + from utils import constants as ct except ModuleNotFoundError: - from CHEWBBACA.utils import constants as ct + from CHEWBBACA.utils import constants as ct def simple_get_request(url, headers, endpoint_list=None, - parameters=None, verify=False, timeout=30): - """Request data through a GET method sent to an endpoint. - - Construct an URI for an endpoint and request data through - a GET method sent to that endpoint. - - Parameters - ---------- - url : str - The base URL for a Chewie-NS instance. - headers : dict - HTTP headers for GET requests. - endpoint_list : list - List with elements that will be concatenated to the - base URI to create the URI for the API endpoint. - parameters : dict or list or bytes - Dictionary, list of tuples or bytes to send in the - query string for the Request. - verify : bool - If the SSL certificates should be verified in HTTPS - requests (False for no verification, True otherwise). - timeout : int - Maximum time in seconds to wait for response. - - Returns - ------- - url : str - Complete URI for the API endpoint. - res : requests.models.Response - The response returned through the GET method. - """ - # unpack list of sequential endpoints and pass to create URI - if endpoint_list is not None: - url = make_url(url, *endpoint_list) - - res = requests.get(url, headers=headers, params=parameters, - timeout=timeout, verify=verify) - - # return queried endpoint URI and response - return [url, res] + parameters=None, verify=False, timeout=30): + """Request data through a GET method sent to an endpoint. + + Construct an URI for an endpoint and request data through + a GET method sent to that endpoint. + + Parameters + ---------- + url : str + The base URL for a Chewie-NS instance. + headers : dict + HTTP headers for GET requests. + endpoint_list : list + List with elements that will be concatenated to the + base URI to create the URI for the API endpoint. + parameters : dict or list or bytes + Dictionary, list of tuples or bytes to send in the + query string for the Request. + verify : bool + If the SSL certificates should be verified in HTTPS + requests (False for no verification, True otherwise). 
+ timeout : int + Maximum time in seconds to wait for response. + + Returns + ------- + url : str + Complete URI for the API endpoint. + res : requests.models.Response + The response returned through the GET method. + """ + # unpack list of sequential endpoints and pass to create URI + if endpoint_list is not None: + url = make_url(url, *endpoint_list) + + res = requests.get(url, headers=headers, params=parameters, + timeout=timeout, verify=verify) + + # return queried endpoint URI and response + return [url, res] def simple_post_request(url, headers, endpoint_list=None, - data=None, files=None, verify=False): - """Send data through a POST method. - - Construct an URI for an endpoint and send data through - a POST method. - - Parameters - ---------- - url : str - The base URL for a Chewie-NS instance. - headers : dict - HTTP headers for POST requests. - endpoint_list : list - List with elements that will be concatenated to the - base URI to create the URI for the API endpoint. - data - Data to send. Must be an object that can be JSON - serialized. - files : dict - Dictionary with file data to send (see: - https://requests.readthedocs.io/en/master/user - /quickstart/#post-a-multipart-encoded-file). - verify : bool - If the SSL certificates should be verified in HTTPS - requests (False for no verification, True otherwise). - - Returns - ------- - url : str - Complete URI for the API endpoint. - res : requests.models.Response - The response returned through the POST method. - """ - # unpack list of sequential endpoints and pass to create URI - if endpoint_list is not None: - url = make_url(url, *endpoint_list) - - # if the content is defined as 'application/json' - # the data has to be passed after being processed with - # json.dumps() - res = requests.post(url, headers=headers, data=data, - files=files, verify=verify) - - return [url, res] + data=None, files=None, verify=False): + """Send data through a POST method. + + Construct an URI for an endpoint and send data through + a POST method. + + Parameters + ---------- + url : str + The base URL for a Chewie-NS instance. + headers : dict + HTTP headers for POST requests. + endpoint_list : list + List with elements that will be concatenated to the + base URI to create the URI for the API endpoint. + data + Data to send. Must be an object that can be JSON + serialized. + files : dict + Dictionary with file data to send (see: + https://requests.readthedocs.io/en/master/user + /quickstart/#post-a-multipart-encoded-file). + verify : bool + If the SSL certificates should be verified in HTTPS + requests (False for no verification, True otherwise). + + Returns + ------- + url : str + Complete URI for the API endpoint. + res : requests.models.Response + The response returned through the POST method. + """ + # unpack list of sequential endpoints and pass to create URI + if endpoint_list is not None: + url = make_url(url, *endpoint_list) + + # if the content is defined as 'application/json' + # the data has to be passed after being processed with + # json.dumps() + res = requests.post(url, headers=headers, data=data, + files=files, verify=verify) + + return [url, res] def check_connection(url, headers=ct.HEADERS_GET_JSON): - """Verify connection to a Chewie-NS instance. - - Perform a GET request to the stats/summary endpoint of a - Chewie-NS instance to determine if it is possible to - establish a connection. - - Parameters - ---------- - url : str - The base URL for a Chewie-NS instance. - headers : dict - HTTP headers for GET requests. 
- - Returns - ------- - connection : bool - True if the GET request to the stats/summary - endpoint was successful (HTTP response status - code 200 or 201), False otherwise. - """ - try: - url, res = simple_get_request(url, headers, ['stats', 'summary']) - server_status = res.status_code - if server_status in [200, 201]: - connection = True - else: - connection = False - except Exception: - connection = False - - return connection + """Verify connection to a Chewie-NS instance. + + Perform a GET request to the stats/summary endpoint of a + Chewie-NS instance to determine if it is possible to + establish a connection. + + Parameters + ---------- + url : str + The base URL for a Chewie-NS instance. + headers : dict + HTTP headers for GET requests. + + Returns + ------- + connection : bool + True if the GET request to the stats/summary + endpoint was successful (HTTP response status + code 200 or 201), False otherwise. + """ + try: + url, res = simple_get_request(url, headers, ['stats', 'summary']) + server_status = res.status_code + if server_status in [200, 201]: + connection = True + else: + connection = False + except Exception: + connection = False + + return connection def user_info(base_url, headers_get): - """Get the information about a registered user in Chewie-NS. - - Get the user identifier, role and permission level for a - user registered in a Chewie-NS instance. The permission - level is used to determine if the user can upload schemas - to Chewie-NS. - - Parameters - ---------- - base_url : str - The base URL for a Chewie-NS instance. - headers_get : dict - HTTP headers for GET requests. - - Returns - ------- - user_id : str - User identifier in the Chewie-NS instance. - user_role : str - User role in the Chewie-NS instance. - permission : bool - True if the user has Admin or Contributor privileges, - False otherwise. - """ - # verify user role to check permission - user_url, user_info = simple_get_request(base_url, headers_get, - ['user', 'current_user']) - user_info = user_info.json() - - user_id = str(user_info['id']) - user_role = user_info['roles'].split(' ')[-1][:-1] - permission = any(role in user_role for role in ['Admin', 'Contributor']) - - return [user_id, user_role, permission] + """Get the information about a registered user in Chewie-NS. + + Get the user identifier, role and permission level for a + user registered in a Chewie-NS instance. The permission + level is used to determine if the user can upload schemas + to Chewie-NS. + + Parameters + ---------- + base_url : str + The base URL for a Chewie-NS instance. + headers_get : dict + HTTP headers for GET requests. + + Returns + ------- + user_id : str + User identifier in the Chewie-NS instance. + user_role : str + User role in the Chewie-NS instance. + permission : bool + True if the user has Admin or Contributor privileges, + False otherwise. + """ + # verify user role to check permission + user_url, user_info = simple_get_request(base_url, headers_get, + ['user', 'current_user']) + user_info = user_info.json() + + user_id = str(user_info['id']) + user_role = user_info['roles'].split(' ')[-1][:-1] + permission = any(role in user_role for role in ['Admin', 'Contributor']) + + return [user_id, user_role, permission] def species_ids(species_id, base_url, headers_get): - """Get a species identifier or name in Chewie-NS. - - Parameters - ---------- - species_id : str - The identifier of the species in Chewie-NS or the - name of the species. - base_url : str - The base URL for a Chewie-NS instance. 
- headers_get : dict - HTTP headers for GET requests. - - Returns - ------- - A list with the species identifier and name if the species - exists in Chewie-NS or a 404 HTTP status code if the species - was not found. - """ - try: - # check if species unique integer identifier was provided - int(species_id) - species_url, species_info = simple_get_request(base_url, headers_get, - ['species', species_id]) - if species_info.status_code in [200, 201]: - species_name = species_info.json()[0]['name']['value'] - return [species_id, species_name] - else: - return 404 - except ValueError: - # check if the scientific name for species was provided - species_name = species_id - # get the list of species added to Chewie-NS - ns_species = species_list(base_url, headers_get, ['species', 'list']) - # try to get the species unique integer identifier - species_id = ns_species.get(species_name, 'not_found') - if species_id != 'not_found': - return [species_id, species_name] - else: - return 404 + """Get a species identifier or name in Chewie-NS. + + Parameters + ---------- + species_id : str + The identifier of the species in Chewie-NS or the + name of the species. + base_url : str + The base URL for a Chewie-NS instance. + headers_get : dict + HTTP headers for GET requests. + + Returns + ------- + A list with the species identifier and name if the species + exists in Chewie-NS or a 404 HTTP status code if the species + was not found. + """ + try: + # check if species unique integer identifier was provided + int(species_id) + species_url, species_info = simple_get_request(base_url, headers_get, + ['species', species_id]) + if species_info.status_code in [200, 201]: + species_name = species_info.json()[0]['name']['value'] + return [species_id, species_name] + else: + return 404 + except ValueError: + # check if the scientific name for species was provided + species_name = species_id + # get the list of species added to Chewie-NS + ns_species = species_list(base_url, headers_get, ['species', 'list']) + # try to get the species unique integer identifier + species_id = ns_species.get(species_name, 'not_found') + if species_id != 'not_found': + return [species_id, species_name] + else: + return 404 def species_list(base_url, headers_get, endpoint_list): - """Get the list of species in a Chewie-NS instance. - - Parameters - ---------- - base_url : str - The base URL for a Chewie Nomenclature Server - instance. - headers_get : dict - HTTP headers for GET requests. - endpoint_list : list - List with elements that will be concatenated - to the base URI to create the URI for the API endpoint. - - Returns - ------- - species_lst : dict - Dictionary with species names as keys and - species identifiers as values. - """ - species_url, res = simple_get_request(base_url, headers_get, endpoint_list) - res = res.json() - species_lst = {} - for sp in res: - # get species scientific name - species = sp['name']['value'] - # get species unique integer identifier from URl - species_url = sp['species']['value'] - species_id = species_url.split('/')[-1] - species_lst[species] = species_id - - return species_lst + """Get the list of species in a Chewie-NS instance. + + Parameters + ---------- + base_url : str + The base URL for a Chewie Nomenclature Server + instance. + headers_get : dict + HTTP headers for GET requests. + endpoint_list : list + List with elements that will be concatenated + to the base URI to create the URI for the API endpoint. 
+ + Returns + ------- + species_lst : dict + Dictionary with species names as keys and + species identifiers as values. + """ + species_url, res = simple_get_request(base_url, headers_get, endpoint_list) + res = res.json() + species_lst = {} + for sp in res: + # get species scientific name + species = sp['name']['value'] + # get species unique integer identifier from URl + species_url = sp['species']['value'] + species_id = species_url.split('/')[-1] + species_lst[species] = species_id + + return species_lst def is_url(url): - """Check if a URL is valid. + """Check if a URL is valid. - Parameters - ---------- - url : str - The URL to be validated. + Parameters + ---------- + url : str + The URL to be validated. - Returns - ------- - True if the URL is valid, False otherwise. - """ - try: - result = urlparse(url) - return all([result.scheme, result.netloc, result.path]) - except Exception: - return False + Returns + ------- + True if the URL is valid, False otherwise. + """ + try: + result = urlparse(url) + return all([result.scheme, result.netloc, result.path]) + except Exception: + return False def make_url(base_url, *res, **params): - """Create a URL. - - Parameters - ---------- - base_url : str - The base URL. - res : list - List with elements that will be concatenated to the - base URI to create the URI for the API endpoint. - params : str - Addtional parameters (WIP). - - Returns - ------- - url : str - URL constructed by adding endpoints and additional - parameters. If the provided base URL is invalid - returns 'An invalid URL was provided.' - """ - url = base_url - # Check if the url is valid - if is_url(url): - - if url[-1] == '/': - url = url[:-1] - - # Add the endpoints - for r in res: - url = '{0}/{1}'.format(url, r) - - # Add params if they are provided - if params: - url = '{0}?{1}'.format(url, urlencode(params)) - - return url - else: - return 'An invalid URL was provided.' + """Create a URL. + + Parameters + ---------- + base_url : str + The base URL. + res : list + List with elements that will be concatenated to the + base URI to create the URI for the API endpoint. + params : str + Addtional parameters (WIP). + + Returns + ------- + url : str + URL constructed by adding endpoints and additional + parameters. If the provided base URL is invalid + returns 'An invalid URL was provided.' + """ + url = base_url + # Check if the url is valid + if is_url(url): + + if url[-1] == '/': + url = url[:-1] + + # Add the endpoints + for r in res: + url = '{0}/{1}'.format(url, r) + + # Add params if they are provided + if params: + url = '{0}?{1}'.format(url, urlencode(params)) + + return url + else: + return 'An invalid URL was provided.' def get_sequence_from_url(url): - """Extract sequence from URL query component. + """Extract sequence from URL query component. - Parameters - ---------- - url : str - URL that contains the sequence. + Parameters + ---------- + url : str + URL that contains the sequence. - Returns - ------- - seq : str - Sequence extracted from the URL. - """ - seq = parse_qs(urlsplit(url).query)['sequence'][0] + Returns + ------- + seq : str + Sequence extracted from the URL. + """ + seq = parse_qs(urlsplit(url).query)['sequence'][0] - return seq + return seq def login_user_to_NS(server_url, email, password): - """Login user to a Chewie-NS instance. - - Parameters - ---------- - server_url : str - The base URL for a Chewie-NS instance. - email : str - Email linked to user's account in Chewie-NS. - password : str - Password for the user's account in Chewie-NS. 
- - Returns - ------- - token : str - Authorization token to perform requests to Chewie-NS - instance if login is successful. Otherwise, returns - False. - """ - auth_params = {} - auth_params['email'] = email - auth_params['password'] = password - - auth_url, auth_r = simple_post_request(server_url, - ct.HEADERS_POST_JSON, - ['auth', 'login'], - data=json.dumps(auth_params)) - - auth_result = auth_r.json() - if auth_result['status'] == 'success': - token = auth_result['access_token'] - else: - token = False - - return token + """Login user to a Chewie-NS instance. + + Parameters + ---------- + server_url : str + The base URL for a Chewie-NS instance. + email : str + Email linked to user's account in Chewie-NS. + password : str + Password for the user's account in Chewie-NS. + + Returns + ------- + token : str + Authorization token to perform requests to Chewie-NS + instance if login is successful. Otherwise, returns + False. + """ + auth_params = {} + auth_params['email'] = email + auth_params['password'] = password + + auth_url, auth_r = simple_post_request(server_url, + ct.HEADERS_POST_JSON, + ['auth', 'login'], + data=json.dumps(auth_params)) + + auth_result = auth_r.json() + if auth_result['status'] == 'success': + token = auth_result['access_token'] + else: + token = False + + return token def capture_login_credentials(base_url): - """Capture login credentials to login user into Chewie-NS. - - Parameters - ---------- - base_url : str - The base URL for a Chewie-NS instance. - - Returns - ------- - token : str - Authorization token to perform requests to Chewie-NS. - """ - print('\nPlease provide login credentials:') - user = input('USERNAME: ') - password = getpass('PASSWORD: ') - print() - # get token - token = login_user_to_NS(base_url, user, password) - # if login was not successful, stop the program - if token is False: - sys.exit('Invalid credentials.') - - return token + """Capture login credentials to login user into Chewie-NS. + + Parameters + ---------- + base_url : str + The base URL for a Chewie-NS instance. + + Returns + ------- + token : str + Authorization token to perform requests to Chewie-NS. + """ + print('\nPlease provide login credentials:') + user = input('USERNAME: ') + password = getpass('PASSWORD: ') + print() + # get token + token = login_user_to_NS(base_url, user, password) + # if login was not successful, stop the program + if token is False: + sys.exit('Invalid credentials.') + + return token def get_species_schemas(schema_id, species_id, base_url, headers_get): - """Determine if there is a schema with specified identifier. - - Retrieves the list of schemas for a species in a Chewie-NS - instance and determines if it has a schema with the - specified identifier. - - Parameters - ---------- - schema_id : str - The identifier of the schema in Chewie-NS. - species_id : str - The identifier of the schema's species in Chewie-NS. - base_url : str - The base URL for the Chewie-NS instance. - headers_get : dict - HTTP headers for GET requests. - - Returns - ------- - A list with the schema identifier, the schema URI, the - schema name and a dictionary with all properties for - the schema in Chewie-NS. - - Raises - ------ - SystemExit - - If the schema with the specified identifier does - not exist. - - If the process cannot retrieve the list of schemas - for the species. 
- """ - # get the list of schemas for the species - schemas_url, schema_get = simple_get_request(base_url, headers_get, - ['species', species_id, - 'schemas']) - schema_get_status = schema_get.status_code - if schema_get_status in [200, 201]: - species_schemas = schema_get.json() - - # extract schemas identifiers, URIs and names from response - schemas_info = [] - for s in species_schemas: - schema_uri = s['schemas']['value'] - scid = schema_uri.split('/')[-1] - schema_name = s['name']['value'] - schemas_info.append([scid, schema_uri, schema_name]) - - # select schema with specified identifier - schema = [s for s in schemas_info if schema_id in s] - if len(schema) > 0: - # get schema parameters - schema = schema[0] - url, schema_params = simple_get_request(schema[1], headers_get) - schema_params = schema_params.json()[0] - schema.append(schema_params) - return schema - else: - sys.exit('Could not find a schema with provided identifier.') - else: - sys.exit('Could not retrieve schemas for current species.') + """Determine if there is a schema with specified identifier. + + Retrieves the list of schemas for a species in a Chewie-NS + instance and determines if it has a schema with the + specified identifier. + + Parameters + ---------- + schema_id : str + The identifier of the schema in Chewie-NS. + species_id : str + The identifier of the schema's species in Chewie-NS. + base_url : str + The base URL for the Chewie-NS instance. + headers_get : dict + HTTP headers for GET requests. + + Returns + ------- + A list with the schema identifier, the schema URI, the + schema name and a dictionary with all properties for + the schema in Chewie-NS. + + Raises + ------ + SystemExit + - If the schema with the specified identifier does + not exist. + - If the process cannot retrieve the list of schemas + for the species. + """ + # get the list of schemas for the species + schemas_url, schema_get = simple_get_request(base_url, headers_get, + ['species', species_id, + 'schemas']) + schema_get_status = schema_get.status_code + if schema_get_status in [200, 201]: + species_schemas = schema_get.json() + + # extract schemas identifiers, URIs and names from response + schemas_info = [] + for s in species_schemas: + schema_uri = s['schemas']['value'] + scid = schema_uri.split('/')[-1] + schema_name = s['name']['value'] + schemas_info.append([scid, schema_uri, schema_name]) + + # select schema with specified identifier + schema = [s for s in schemas_info if schema_id in s] + if len(schema) > 0: + # get schema parameters + schema = schema[0] + url, schema_params = simple_get_request(schema[1], headers_get) + schema_params = schema_params.json()[0] + schema.append(schema_params) + return schema + else: + sys.exit('Could not find a schema with provided identifier.') + else: + sys.exit('Could not retrieve schemas for current species.') def upload_file(file, filename, url, headers, verify_ssl): - """Upload a file to a Chewie-NS instance. - - Parameters - ---------- - file : str - Path to the file to upload. - filename : str - Name used to save the file in Chewie-NS. - url : str - Endpoint URL that receives the POST request. - headers : dict - HTTP POST request headers. - verify_sll : bool - If the SSL certificates should be verified in - HTTPS requests (False for no verification, True - otherwise). - - Returns - ------- - response : requests.models.Response - Response object from the 'requests' module. 
- """ - file_handle = open(file, 'rb') - files = {'file': (filename, file_handle)} - up_url, response = simple_post_request(url, headers, - files=files, - verify=verify_ssl) - - file_handle.close() - - return response + """Upload a file to a Chewie-NS instance. + + Parameters + ---------- + file : str + Path to the file to upload. + filename : str + Name used to save the file in Chewie-NS. + url : str + Endpoint URL that receives the POST request. + headers : dict + HTTP POST request headers. + verify_sll : bool + If the SSL certificates should be verified in + HTTPS requests (False for no verification, True + otherwise). + + Returns + ------- + response : requests.models.Response + Response object from the 'requests' module. + """ + file_handle = open(file, 'rb') + files = {'file': (filename, file_handle)} + up_url, response = simple_post_request(url, headers, + files=files, + verify=verify_ssl) + + file_handle.close() + + return response diff --git a/CHEWBBACA/utils/constants.py b/CHEWBBACA/utils/constants.py index 08648b00..1ba6bf84 100755 --- a/CHEWBBACA/utils/constants.py +++ b/CHEWBBACA/utils/constants.py @@ -80,12 +80,12 @@ GENETIC_CODES = {1: 'The Standard Code', 2: 'The Vertebrate Mitochondrial Code', 3: 'The Yeast Mitochondrial Code', - 4: 'The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code', + 4: 'The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code', 5: 'The Invertebrate Mitochondrial Code', 6: 'The Ciliate, Dasycladacean and Hexamita Nuclear Code', 9: 'The Echinoderm and Flatworm Mitochondrial Code', 10: 'The Euplotid Nuclear Code', - 11: 'The Bacterial, Archaeal and Plant Plastid Code', + 11: 'The Bacterial, Archaeal and Plant Plastid Code', 12: 'The Alternative Yeast Nuclear Code', 13: 'The Ascidian Mitochondrial Code', 14: 'The Alternative Flatworm Mitochondrial Code', @@ -95,7 +95,7 @@ 22: 'Scenedesmus obliquus Mitochondrial Code', 23: 'Thraustochytrium Mitochondrial Code', 24: 'Rhabdopleuridae Mitochondrial Code', - 25: 'Candidate Division SR1 and Gracilibacteria Code'} + 25: 'Candidate Division SR1 and Gracilibacteria Code'} GENETIC_CODES_DEFAULT = 11 @@ -111,23 +111,23 @@ # Chewie-NS related constants HEADERS_GET_ = {'Authorization': None, - 'accept': 'application/octet-stream'} + 'accept': 'application/octet-stream'} HEADERS_GET_JSON = {'Authorization': None, - 'accept': 'application/json'} + 'accept': 'application/json'} HEADERS_POST = {'Authorization': None, - 'user_id': None} + 'user_id': None} HEADERS_POST_JSON = {'Authorization': None, - 'Content-type': 'application/json', - 'accept': 'application/json', - 'user_id': None} + 'Content-type': 'application/json', + 'accept': 'application/json', + 'user_id': None} # List of Chewie-NS instance identifiers and URLs HOST_NS = {'main': 'https://chewbbaca.online/api/NS/api/', - 'tutorial': 'https://tutorial.chewbbaca.online/api/NS/api/', - 'local': 'http://127.0.0.1:5000/NS/api/'} + 'tutorial': 'https://tutorial.chewbbaca.online/api/NS/api/', + 'local': 'http://127.0.0.1:5000/NS/api/'} # Authors, GitHub repository, documentation, tutorial and contacts AUTHORS = 'Rafael Mamede, Pedro Cerqueira, Mickael Silva, João Carriço, Mário Ramirez' @@ -152,18 +152,18 @@ # To avoid this warning the TXT file must be converted to binary with the blastdb_aliastool # Performance can be severely affected if the TXT is not converted to binary IGNORE_RAISED = ['Warning: [blastp] To obtain better run time ' - 'performance, please run blastdb_aliastool ' - 
'-seqid_file_in -seqid_file_out ' - ' and use as the ' - 'argument to -seqidlist'] + 'performance, please run blastdb_aliastool ' + '-seqid_file_in -seqid_file_out ' + ' and use as the ' + 'argument to -seqidlist'] # Path to MAFFT executable MAFFT_ALIAS = shutil.which('mafft') # Replacements for genome and loci identifiers CHAR_REPLACEMENTS = [("|", "_"), ("_", "-"), ("(", ""), - (")", ""), ("'", ""), ("\"", ""), - (":", "")] + (")", ""), ("'", ""), ("\"", ""), + (":", "")] # Minimum Python version MIN_PYTHON = [(3, 6, 0), '3.6.0'] @@ -185,11 +185,11 @@ # AlleleCall logfile basename and content template LOGFILE_BASENAME = 'logging_info.txt' LOGFILE_TEMPLATE = ('Started script at: {0}\n' - 'Finished script at: {1}\n' - 'Number of inputs: {2}\n' - 'Number of loci: {3}\n' - 'Used this number of CPU cores: {4}\n' - 'Used a BSR of: {5}\n') + 'Finished script at: {1}\n' + 'Number of inputs: {2}\n' + 'Number of loci: {3}\n' + 'Used this number of CPU cores: {4}\n' + 'Used a BSR of: {5}\n') # Basename for files created by the AlleleCall module RESULTS_COORDINATES_BASENAME = 'results_contigsInfo.tsv' @@ -208,7 +208,7 @@ GENE_LIST_BASENAME = '.genes_list' # Header for TSV file with loci stats LOCI_STATS_HEADER = ('Locus\tEXC\tINF\tPLOT3\tPLOT5\tLOTSC\tNIPH\t' - 'NIPHEM\tALM\tASM\tPAMA\tLNF\tTotal_CDS') + 'NIPHEM\tALM\tASM\tPAMA\tLNF\tTotal_CDS') # Header for TSV file with information about extracted CDSs CDS_TABLE_HEADER = 'Genome\tContig\tStart\tStop\tProtein_ID\tCoding_Strand\n' # Headers for TSV files with paralogous loci count and per genome @@ -219,8 +219,8 @@ # Allele calling classifications ALLELECALL_CLASSIFICATIONS = ['EXC', 'INF', 'PLOT3', 'PLOT5', - 'LOTSC', 'NIPH', 'NIPHEM', 'ALM', - 'ASM', 'PAMA', 'LNF'] + 'LOTSC', 'NIPH', 'NIPHEM', 'ALM', + 'ASM', 'PAMA', 'LNF'] # PLNF classificaton for modes {1,2,3} PROBABLE_LNF = 'PLNF' @@ -245,17 +245,17 @@ # Dictionary template to map variables returned by AlleleCall ALLELECALL_DICT = {'classification_files': None, - 'basename_map': None, - 'cds_coordinates': None, - 'cds_counts': None, - 'dna_fasta': None, - 'protein_fasta': None, - 'dna_hashtable': None, - 'protein_hashtable': None, - 'invalid_alleles': None, - 'unclassified_ids': None, - 'self_scores': None, - 'representatives': None} + 'basename_map': None, + 'cds_coordinates': None, + 'cds_counts': None, + 'dna_fasta': None, + 'protein_fasta': None, + 'dna_hashtable': None, + 'protein_hashtable': None, + 'invalid_alleles': None, + 'unclassified_ids': None, + 'self_scores': None, + 'representatives': None} GENOME_LIST = 'listGenomes2Call.txt' LOCI_LIST = 'listGenes2Call.txt' @@ -282,7 +282,7 @@ # File header for file with summary statistics created by PrepExternalSchema PREPEXTERNAL_SUMMARY_STATS_HEADER = ('Gene\tTotal_alleles\tValid_alleles\t' - 'Number_representatives') + 'Number_representatives') # Default loci presence thresholds used to compute the cgMLST CGMLST_THRESHOLDS = [0.95, 0.99, 1] @@ -293,15 +293,15 @@ - - - Schema Evaluator - React Edition + + + Schema Evaluator - React Edition - -
[HTML report-template hunks: the embedded markup was stripped during text extraction. Recoverable details: templates titled "Schema Evaluator - React Edition", "Schema Evaluator - Individual Analysis" (hunk @@ -311,19 +311,19 @@, LOCUS_REPORT_HTML) and "AlleleCall Report - React Edition" (hunk @@ -332,15 +332,15 @@); as in the rest of the patch, these hunks only convert the indentation of the template lines from spaces to tabs.]
+ + """) @@ -377,107 +377,107 @@ # Table header for the Schema report summary data SCHEMA_SUMMARY_TABLE_HEADERS = ('Loci\tAlleles\tValid Alleles\tInvalid ' - 'alleles\tIncomplete ORF\tAmbiguous Bases\t' - 'Missing Start/Stop Codon\tIn-frame Stop ' - 'Codon\tAlleles < {0}bp\tAlleles below ' - 'threshold\tAlleles above threshold') + 'alleles\tIncomplete ORF\tAmbiguous Bases\t' + 'Missing Start/Stop Codon\tIn-frame Stop ' + 'Codon\tAlleles < {0}bp\tAlleles below ' + 'threshold\tAlleles above threshold') # Column headers for the Loci Analysis Table in the Schema report LOCI_ANALYSIS_COLUMNS = ('Locus\tTotal Alleles\tValid Alleles\tInvalid ' - 'Alleles\tProportion of Validated Alleles\tDistinct ' - 'Protein Alleles\tIncomplete ' - 'ORF\tAmbiguous Bases\tMissing ' - 'Start/Stop Codon\tIn-frame Stop Codon\tAlleles ' - '< {0}bp\tAlleles below threshold\tAlleles above ' - 'threshold\tMissing Allele IDs') + 'Alleles\tProportion of Validated Alleles\tDistinct ' + 'Protein Alleles\tIncomplete ' + 'ORF\tAmbiguous Bases\tMissing ' + 'Start/Stop Codon\tIn-frame Stop Codon\tAlleles ' + '< {0}bp\tAlleles below threshold\tAlleles above ' + 'threshold\tMissing Allele IDs') # Column headers for the Summary Table in the Loci reports LOCUS_COLUMNS = ('Locus\tTotal Alleles\tValid Alleles\tInvalid ' - 'Alleles\tProportion of Validated Alleles\tDistinct ' - 'Protein Alleles\t' - 'Incomplete ORF\tAmbiguous Bases\tMissing Start/Stop ' - 'Codon\tIn-frame Stop Codon\tAlleles < {0}bp\tSize Range ' - '(bp)\tLength Median (bp)\tLength Mode (bp)\tAlleles below ' - 'threshold ({1}bp)\tAlleles above threshold ({2}bp)\t' - 'Missing Allele IDs') + 'Alleles\tProportion of Validated Alleles\tDistinct ' + 'Protein Alleles\t' + 'Incomplete ORF\tAmbiguous Bases\tMissing Start/Stop ' + 'Codon\tIn-frame Stop Codon\tAlleles < {0}bp\tSize Range ' + '(bp)\tLength Median (bp)\tLength Mode (bp)\tAlleles below ' + 'threshold ({1}bp)\tAlleles above threshold ({2}bp)\t' + 'Missing Allele IDs') # Column headers for the Invalid Alleles table in the loci reports INVALID_ALLELES_COLUMNS = ['Allele ID', 'Exception Category', - 'Exception Description'] + 'Exception Description'] TRANSLATION_EXCEPTIONS = ['Extra in frame stop codon', - 'is not a start codon', - 'is not a stop codon', - 'sequence length is not a multiple of 3', - 'ambiguous or invalid characters'] + 'is not a start codon', + 'is not a stop codon', + 'sequence length is not a multiple of 3', + 'ambiguous or invalid characters'] DISTINCT_ALLELES_COLUMNS = ['Protein Allele ID', - 'Count', - 'List of Distinct Alleles'] + 'Count', + 'List of Distinct Alleles'] # Column headers for the Sample Stats table in the allele calling report SAMPLE_STATS_COLUMNS = ['Sample', 'Total Contigs', 'Total CDSs', - 'Proportion of Classified CDSs', 'Identified Loci', - 'Proportion of Identified Loci', - 'Valid Classifications', 'Invalid Classifications'] + 'Proportion of Classified CDSs', 'Identified Loci', + 'Proportion of Identified Loci', + 'Valid Classifications', 'Invalid Classifications'] # Column headers for the Loci Stats table in the allele calling report LOCI_STATS_COLUMNS = ['Locus', 'Total CDSs', 'Valid Classes', - 'Invalid Classes', 'Proportion Samples'] + 'Invalid Classes', 'Proportion Samples'] # Column headers for the Summary Stats table in the allele calling report SUMMARY_STATS_COLUMNS = ['Total Samples', 'Total Loci', 'Total CDSs', - 'Total CDSs Classified', 'EXC', 'INF', - 'PLOT3', 'PLOT5', 'LOTSC', 'NIPH', - 'NIPHEM', 'ALM', 'ASM', 'PAMA', 'LNF'] + 'Total CDSs Classified', 'EXC', 
'INF', + 'PLOT3', 'PLOT5', 'LOTSC', 'NIPH', + 'NIPHEM', 'ALM', 'ASM', 'PAMA', 'LNF'] # Exception messages # Input file is a FASTA file but chewBBACA expects a file with a list of # file paths FASTA_INPUT_EXCEPTION = ('Input file is a FASTA file. Please provide ' - 'the path to the parent directory that contains ' - 'the FASTA files or a file with the list of full ' - 'paths to the FASTA files (one per line).') + 'the path to the parent directory that contains ' + 'the FASTA files or a file with the list of full ' + 'paths to the FASTA files (one per line).') # Some input files have an invalid file extension INVALID_EXTENSION_EXCEPTION = ('The following input files do not have a ' - 'valid file extension:\n{0}\nPlease ensure ' - 'that the filenames end with one of the ' - f'following extensions: {FASTA_EXTENSIONS}.') + 'valid file extension:\n{0}\nPlease ensure ' + 'that the filenames end with one of the ' + f'following extensions: {FASTA_EXTENSIONS}.') # Some of the file paths provided do not exist MISSING_INPUTS_EXCEPTION = ('Could not find some of the files provided in ' - 'the input list. Please verify that you\'ve ' - 'provided valid paths to the following input ' - 'files.\n{0}') + 'the input list. Please verify that you\'ve ' + 'provided valid paths to the following input ' + 'files.\n{0}') # Files that do not have the expected format of a FASTA file NON_FASTA_EXCEPTION = ('The following input files are not in FASTA format:\n{0}') # Input directory does not contain FASTA files MISSING_FASTAS_EXCEPTION = ('Could not get input files. Please provide ' - 'a directory with FASTA files or a file with ' - 'the list of full paths to the FASTA files ' - 'and ensure that filenames end with one of ' - f'the following extensions: {FASTA_EXTENSIONS}.') + 'a directory with FASTA files or a file with ' + 'the list of full paths to the FASTA files ' + 'and ensure that filenames end with one of ' + f'the following extensions: {FASTA_EXTENSIONS}.') # Input path is neither a file nor a directory INVALID_INPUT_PATH = ('Input argument is not a valid directory or ' - 'file with a list of paths to FASTA files. Please ' - 'provide a valid input, either a folder with FASTA ' - 'files or a file with the list of full paths to FASTA ' - 'files (one per line and ending with one of the ' - f'following file extensions: {FASTA_EXTENSIONS}).') + 'file with a list of paths to FASTA files. Please ' + 'provide a valid input, either a folder with FASTA ' + 'files or a file with the list of full paths to FASTA ' + 'files (one per line and ending with one of the ' + f'following file extensions: {FASTA_EXTENSIONS}).') # Path to schema does not exist SCHEMA_PATH_MISSING = ('Path to input schema does not exist. Please provide ' - 'a valid path.') + 'a valid path.') # Path to schema does not include expected files SCHEMA_INVALID_PATH = ('Provided path does not include all the necessary ' - 'schema files. Please verify that you have passed ' - 'the correct path to the schema.') + 'schema files. Please verify that you have passed ' + 'the correct path to the schema.') # User provided legacy schema. Tell user to adapt with the PrepExternalSchema module ADAPT_LEGACY_SCHEMA = ('Schema does not include a config file or includes files that' @@ -487,31 +487,31 @@ # Output directory exists OUTPUT_DIRECTORY_EXISTS = ('Output directory already exists. 
Please ' - 'provide a path to a directory that will be ' - 'created to store the results.') + 'provide a path to a directory that will be ' + 'created to store the results.') # Input file is a FASTA file but chewBBACA expects a file with a list of # loci file paths FASTA_LOCI_LIST_EXCEPTION = ('Path provided to --gl is for a FASTA file. Please provide ' - 'a file with the list of locus identifiers or full ' - 'paths to the loci FASTA files (one per line).') + 'a file with the list of locus identifiers or full ' + 'paths to the loci FASTA files (one per line).') # Invalid paths to loci FASTA files MISSING_LOCI_EXCEPTION = ('Could not find some of the loci FASTA files provided in ' - 'the input list. Please verify that you\'ve ' - 'provided valid paths to the following input ' - 'files.\n{0}') + 'the input list. Please verify that you\'ve ' + 'provided valid paths to the following input ' + 'files.\n{0}') # Invalid format for loci files NON_FASTA_LOCI_EXCEPTION = ('The following loci files are not in FASTA format:\n{0}') # User does not have permissions to upload schemas to Chewie-NS LOADSCHEMA_NO_PERMISSIONS = ('Current user has no Administrator or Contributor ' - 'permissions.\nNot allowed to upload schemas.') + 'permissions.\nNot allowed to upload schemas.') # PTF is missing from schema's directory LOADSCHEMA_MISSING_PTF = ('Please ensure that the schema\'s directory includes the ' - 'Prodigal training file used to create the schema.') + 'Prodigal training file used to create the schema.') # Path for PTF does not exist INVALID_PTF_PATH = 'Invalid path for Prodigal training file.' @@ -532,27 +532,27 @@ INVALID_ST_TYPE = ('\nInvalid size threshold value used to create schema. Value must be None or a positive float in the [0.0, 1.0] interval.') INVALID_GENETIC_CODE = ('\nInvalid genetic code value.\nValue must correspond to ' - 'one of the accepted genetic codes\n\nAccepted genetic ' - 'codes:\n{0}') + 'one of the accepted genetic codes\n\nAccepted genetic ' + 'codes:\n{0}') INVALID_WS = ('\nWord size for the clustering step ' - 'must be equal or greater than {0} and ' - 'equal or smaller than {1}.') + 'must be equal or greater than {0} and ' + 'equal or smaller than {1}.') INVALID_WS_TYPE = ('\nSchema created with invalid clustering word size value.') INVALID_CS = ('\nClustering similarity threshold value ' - 'must be contained in the [0.0, 1.0] ' - 'interval.') + 'must be contained in the [0.0, 1.0] ' + 'interval.') INVALID_CS_TYPE = ('\nSchema created with invalid clustering threshold value.') INVALID_RF = ('\nRepresentative filter threshold value ' - 'must be contained in the [0.0, 1.0] ' - 'interval.') + 'must be contained in the [0.0, 1.0] ' + 'interval.') INVALID_RF_TYPE = ('\nSchema created with invalid representative filter value.') INVALID_ICF = ('\nIntra-cluster filter value ' - 'must be contained in the [0.0, 1.0] ' - 'interval.') + 'must be contained in the [0.0, 1.0] ' + 'interval.') INVALID_ICF_TYPE = ('\nSchema created with invalid intra-cluster filter value.') NS_CANNOT_CONNECT = ('Failed to establish a connection to the Chewie-NS instance at {0}.') @@ -560,50 +560,50 @@ PYTHON_VERSION = ('Python version found: {0}\nPlease use Python >= {1}') CPU_RESET_WARNING = ('Warning! You have provided a CPU core count value ' - 'that is equal to or exceeds the number of CPU ' - 'cores in your system! Resetting to: {0}') + 'that is equal to or exceeds the number of CPU ' + 'cores in your system! Resetting to: {0}') CPU_VALUE_WARNING = ('Warning! 
You have provided a CPU core count value ' - 'that is close to the maximum core count of your ' - 'machine ({0}/{1}). This may affect your system ' - 'responsiveness.') + 'that is close to the maximum core count of your ' + 'machine ({0}/{1}). This may affect your system ' + 'responsiveness.') BLAST_NO_PATH = ('Could not find BLAST executables.') BLAST_NO_VERSION = ('Could not determine BLAST version. Please make ' - 'sure that BLAST>={0}.{1} is installed.') + 'sure that BLAST>={0}.{1} is installed.') BLAST_UPDATE = ('Found BLAST {0}.{1}. Please update BLAST to version >={2}.{3}') MULTIPLE_PTFS = ('Found more than one Prodigal training ' - 'file in the schema directory.\nPlease maintain ' - 'only the training file used in the schema ' - 'creation process.') + 'file in the schema directory.\nPlease maintain ' + 'only the training file used in the schema ' + 'creation process.') MISSING_PTF = ('Could not find a Prodigal training file in the schema directory.') INVALID_PTF_PATH = ('Cannot find specified Prodigal training file.' - '\nPlease provide a valid training file.\nYou ' - 'can create a training file for a species of ' - 'interest with the following command:\n\n prodigal ' - '-i -t -p ' - 'single\n\nIt is strongly advised to provide a ' - 'high-quality and closed genome for the training ' - 'process.') + '\nPlease provide a valid training file.\nYou ' + 'can create a training file for a species of ' + 'interest with the following command:\n\n prodigal ' + '-i -t -p ' + 'single\n\nIt is strongly advised to provide a ' + 'high-quality and closed genome for the training ' + 'process.') DIFFERENT_PTF_PROMPT = ('Prodigal training file is not the one ' - 'used to create the schema. Using this training ' + 'used to create the schema. Using this training ' 'file might lead to results not consistent with ' 'previous runs and invalidate the schema for ' 'usage with Chewie-NS.\nContinue process?\n') MULTIPLE_PTF_PROMPT = ('Prodigal training file is not any of the {0} ' - 'used in previous runs.\nContinue?\n') + 'used in previous runs.\nContinue?\n') ARGS_DIFFER = ('Provided argument values differ from the values ' 'used for schema creation:\n') ARGS_DIFFER_PROMPT = ('\nContinuing might lead to results not ' - 'consistent with previous runs.\nProviding ' - 'parameter values that differ from the values ' - 'used for schema creation will also invalidate ' - 'the schema for uploading and synchronization ' - 'with Chewie-NS.\nContinue? (yes/no)\n') + 'consistent with previous runs.\nProviding ' + 'parameter values that differ from the values ' + 'used for schema creation will also invalidate ' + 'the schema for uploading and synchronization ' + 'with Chewie-NS.\nContinue? 
(yes/no)\n') MISSING_CONFIG = ('Could not find a valid config file.') diff --git a/CHEWBBACA/utils/distance_matrix.py b/CHEWBBACA/utils/distance_matrix.py index e32c6d25..478c7b43 100644 --- a/CHEWBBACA/utils/distance_matrix.py +++ b/CHEWBBACA/utils/distance_matrix.py @@ -25,408 +25,408 @@ import pandas as pd try: - from utils import ( - constants as ct, - file_operations as fo, - iterables_manipulation as im, - multiprocessing_operations as mo) + from utils import ( + constants as ct, + file_operations as fo, + iterables_manipulation as im, + multiprocessing_operations as mo) except ModuleNotFoundError: - from CHEWBBACA.utils import ( - constants as ct, - file_operations as fo, - iterables_manipulation as im, - multiprocessing_operations as mo) + from CHEWBBACA.utils import ( + constants as ct, + file_operations as fo, + iterables_manipulation as im, + multiprocessing_operations as mo) def tsv_to_nparray(input_file, array_dtype='int32'): - """Read matrix of allelic profiles and convert to Numpy array. - - File lines are read as a generator to exclude the sample identifier - and avoid loading huge files into memory. - - Parameters - ---------- - input_file : str - Path to the TSV file with the matrix with - allelic profiles. - array_dtype : str - Array data type. - - Returns - ------- - np_array : ndarray - Numpy array with the numeric values for - all allelic profiles. - """ - # import matrix without column and row identifiers - with open(input_file, 'r') as infile: - lines = ('\t'.join(line.split('\t')[1:]) - for line in infile) - # dtype=float32 should be faster than integer dtypes - # but runs faster with dtype=int32 in test setup - # dtype=int32 supports max integer of 2,147,483,647 - # should be safe even when arrays are multiplied - np_array = np.genfromtxt(fname=lines, delimiter='\t', - dtype=array_dtype, skip_header=1) - - # need to reshape array to 2D if it only has info for one sample - try: - # if it does not have columns it needs to be reshaped - np_array.shape[1] - except Exception: - np_array = np.array([np_array]) - - return np_array + """Read matrix of allelic profiles and convert to Numpy array. + + File lines are read as a generator to exclude the sample identifier + and avoid loading huge files into memory. + + Parameters + ---------- + input_file : str + Path to the TSV file with the matrix with + allelic profiles. + array_dtype : str + Array data type. + + Returns + ------- + np_array : ndarray + Numpy array with the numeric values for + all allelic profiles. + """ + # import matrix without column and row identifiers + with open(input_file, 'r') as infile: + lines = ('\t'.join(line.split('\t')[1:]) + for line in infile) + # dtype=float32 should be faster than integer dtypes + # but runs faster with dtype=int32 in test setup + # dtype=int32 supports max integer of 2,147,483,647 + # should be safe even when arrays are multiplied + np_array = np.genfromtxt(fname=lines, delimiter='\t', + dtype=array_dtype, skip_header=1) + + # need to reshape array to 2D if it only has info for one sample + try: + # if it does not have columns it needs to be reshaped + np_array.shape[1] + except Exception: + np_array = np.array([np_array]) + + return np_array def compute_distances(indexes, np_matrix, genome_ids, tmp_directory): - """Compute pairwise allelic differences and number of shared loci. - - Parameters - ---------- - indexes : list - List with the line index of the allelic profiles - that will be processed. 
- np_matrix : ndarray - Numpy array with dtype=int32 values for allelic profiles. - genome_ids : list - List with sample identifiers. - tmp_directory : str - Path to temporary directory where pickle files with - results will be stored. - - Returns - ------- - output_files : list - List with the paths to all pickle files that were created - to store results. - """ - # multiply one row per cycle to avoid memory overflow - # read only part of the matrix for huge files and process in chunks? - output_files = {} - for i in indexes: - current_genome = genome_ids[i] - # get one row to perform pairwise comparisons against whole matrix - current_row = np_matrix[i:i+1, :] - # do not multiply against rows that were multiplied against - # matrix's rows in previous iterations - # combinations instead of permutations - permutation_rows = np_matrix[i:, :] - - # multiply 1D-array per whole matrix - # all non-shared loci will be converted to 0 - # values different than 0 correspond to shared loci - multiplied = current_row * permutation_rows - # count number of shared loci, non-zero values - # pairwise_shared_loci = np.count_nonzero(multiplied, axis=-1) - - # subtraction will lead to values different than 0 for loci that have different alleles - # multiplying ensures that we only keep results for shared loci and not for - # loci that are not shared and that had value different than 0 from subtraction - pairwise_allelic_differences = np.count_nonzero(multiplied * (current_row - permutation_rows), axis=-1) - - output_file = os.path.join(tmp_directory, current_genome) - # fo.pickle_dumper([pairwise_shared_loci, pairwise_allelic_differences], output_file) - fo.pickle_dumper([pairwise_allelic_differences], output_file) - output_files[current_genome] = output_file - - return output_files + """Compute pairwise allelic differences and number of shared loci. + + Parameters + ---------- + indexes : list + List with the line index of the allelic profiles + that will be processed. + np_matrix : ndarray + Numpy array with dtype=int32 values for allelic profiles. + genome_ids : list + List with sample identifiers. + tmp_directory : str + Path to temporary directory where pickle files with + results will be stored. + + Returns + ------- + output_files : list + List with the paths to all pickle files that were created + to store results. + """ + # multiply one row per cycle to avoid memory overflow + # read only part of the matrix for huge files and process in chunks? 
+ output_files = {} + for i in indexes: + current_genome = genome_ids[i] + # get one row to perform pairwise comparisons against whole matrix + current_row = np_matrix[i:i+1, :] + # do not multiply against rows that were multiplied against + # matrix's rows in previous iterations + # combinations instead of permutations + permutation_rows = np_matrix[i:, :] + + # multiply 1D-array per whole matrix + # all non-shared loci will be converted to 0 + # values different than 0 correspond to shared loci + multiplied = current_row * permutation_rows + # count number of shared loci, non-zero values + # pairwise_shared_loci = np.count_nonzero(multiplied, axis=-1) + + # subtraction will lead to values different than 0 for loci that have different alleles + # multiplying ensures that we only keep results for shared loci and not for + # loci that are not shared and that had value different than 0 from subtraction + pairwise_allelic_differences = np.count_nonzero(multiplied * (current_row - permutation_rows), axis=-1) + + output_file = os.path.join(tmp_directory, current_genome) + # fo.pickle_dumper([pairwise_shared_loci, pairwise_allelic_differences], output_file) + fo.pickle_dumper([pairwise_allelic_differences], output_file) + output_files[current_genome] = output_file + + return output_files def get_sample_ids(input_file, delimiter='\t'): - r"""Extract the sample identifiers from a matrix with allelic profiles. + r"""Extract the sample identifiers from a matrix with allelic profiles. - Parameters - ---------- - input_file : str - Path to the input file that contains a matrix - with allelic profiles. - delimiter : str, optional - Field delimiter. The default is '\t'. + Parameters + ---------- + input_file : str + Path to the input file that contains a matrix + with allelic profiles. + delimiter : str, optional + Field delimiter. The default is '\t'. - Returns - ------- - sample_ids : list - List with the sample identifiers. - """ - with open(input_file, 'r') as infile: - reader = csv.reader(infile, delimiter=delimiter) - sample_ids = [line[0] for line in reader][1:] + Returns + ------- + sample_ids : list + List with the sample identifiers. + """ + with open(input_file, 'r') as infile: + reader = csv.reader(infile, delimiter=delimiter) + sample_ids = [line[0] for line in reader][1:] - return sample_ids + return sample_ids # def write_matrices(pickled_results, genome_ids, output_pairwise, # output_p, col_ids): def write_matrices(pickled_results, genome_ids, output_pairwise, col_ids): - """Write above diagonal matrices with allelic differences and shared loci. - - Parameters - ---------- - pickled_results : dict - Dictionary with sample identifiers as keys - and paths to binary files with pickled results - as values. - genome_ids : list - List with sample identifiers. - output_pairwise : str - Path to the output file to which the matrix - with pairwise allelic differences will be saved. - output_p : str - Path to the output file to which the matrix - with pairwise shared loci will be saved. - col_ids: list - List with sample identifiers to add as headers. - - Returns - ------- - None. 
- """ - # sl_lines = [col_ids] - ad_lines = [col_ids] - limit = 300 - for g in genome_ids: - current_file = pickled_results[g] - # load data - data = fo.pickle_loader(current_file) - - # shared_loci = list(data[0]) - # shared_loci = list(map(str, shared_loci)) - allele_diffs = list(data[0]) - allele_diffs = list(map(str, allele_diffs)) - - padding = [''] * (len(genome_ids)-len(allele_diffs)) - - # sl_line = [g] + padding + shared_loci - # sl_lines.append(sl_line) - ad_line = [g] + padding + allele_diffs - ad_lines.append(ad_line) - - # if len(sl_lines) >= limit or g == genome_ids[-1]: - if len(ad_lines) >= limit or g == genome_ids[-1]: - ad_lines = [im.join_list(line, '\t') for line in ad_lines] - fo.write_lines(ad_lines, output_pairwise, joiner='\n', write_mode='a') - ad_lines = [] - # write_lines(sl_lines, output_p, mode='a') - # sl_lines = [] - - return True + """Write above diagonal matrices with allelic differences and shared loci. + + Parameters + ---------- + pickled_results : dict + Dictionary with sample identifiers as keys + and paths to binary files with pickled results + as values. + genome_ids : list + List with sample identifiers. + output_pairwise : str + Path to the output file to which the matrix + with pairwise allelic differences will be saved. + output_p : str + Path to the output file to which the matrix + with pairwise shared loci will be saved. + col_ids: list + List with sample identifiers to add as headers. + + Returns + ------- + None. + """ + # sl_lines = [col_ids] + ad_lines = [col_ids] + limit = 300 + for g in genome_ids: + current_file = pickled_results[g] + # load data + data = fo.pickle_loader(current_file) + + # shared_loci = list(data[0]) + # shared_loci = list(map(str, shared_loci)) + allele_diffs = list(data[0]) + allele_diffs = list(map(str, allele_diffs)) + + padding = [''] * (len(genome_ids)-len(allele_diffs)) + + # sl_line = [g] + padding + shared_loci + # sl_lines.append(sl_line) + ad_line = [g] + padding + allele_diffs + ad_lines.append(ad_line) + + # if len(sl_lines) >= limit or g == genome_ids[-1]: + if len(ad_lines) >= limit or g == genome_ids[-1]: + ad_lines = [im.join_list(line, '\t') for line in ad_lines] + fo.write_lines(ad_lines, output_pairwise, joiner='\n', write_mode='a') + ad_lines = [] + # write_lines(sl_lines, output_p, mode='a') + # sl_lines = [] + + return True def transpose_matrix(input_file, output_directory): - """Transpose lines in a TSV file. - - Parameters - ---------- - input_file : str - Path to the input TSV file. - output_directory : str - Path to the directory to which intermediate files - with transposed lines will be written. - - Returns - ------- - output_transpose : str - Path to the file with the transposed matrix. - This file is created by concatenating all - files saved into `output_directory`. 
- """ - file_id = 1 - transpose_files = [] - input_basename = os.path.basename(input_file) - with open(input_file, 'r') as infile: - # get columns names - columns = [e.strip() for e in (infile.__next__()).split('\t')] - # divide into smaller sets to avoid loading huge files - num_col_sets = math.ceil(len(columns)/500) - col_sets = im.divide_list_into_n_chunks(columns, num_col_sets) - # use Pandas to read columns sets and save transpose - for c in col_sets: - # dtype=str or Pandas converts values into floats - df = pd.read_csv(input_file, usecols=c, delimiter='\t', dtype=str) - output_basename = input_basename.replace('.tsv', '_{0}.tsv'.format(file_id)) - output_file = os.path.join(output_directory, output_basename) - # transpose columns - df = df.T - # do not save header that contains row indexes - df.to_csv(output_file, sep='\t', header=False) - transpose_files.append(output_file) - file_id += 1 - - # concatenate all files with transposed lines - output_transpose = input_file.replace('.tsv', '_transpose.tsv') - fo.concatenate_files(transpose_files, output_transpose) - - return output_transpose + """Transpose lines in a TSV file. + + Parameters + ---------- + input_file : str + Path to the input TSV file. + output_directory : str + Path to the directory to which intermediate files + with transposed lines will be written. + + Returns + ------- + output_transpose : str + Path to the file with the transposed matrix. + This file is created by concatenating all + files saved into `output_directory`. + """ + file_id = 1 + transpose_files = [] + input_basename = os.path.basename(input_file) + with open(input_file, 'r') as infile: + # get columns names + columns = [e.strip() for e in (infile.__next__()).split('\t')] + # divide into smaller sets to avoid loading huge files + num_col_sets = math.ceil(len(columns)/500) + col_sets = im.divide_list_into_n_chunks(columns, num_col_sets) + # use Pandas to read columns sets and save transpose + for c in col_sets: + # dtype=str or Pandas converts values into floats + df = pd.read_csv(input_file, usecols=c, delimiter='\t', dtype=str) + output_basename = input_basename.replace('.tsv', '_{0}.tsv'.format(file_id)) + output_file = os.path.join(output_directory, output_basename) + # transpose columns + df = df.T + # do not save header that contains row indexes + df.to_csv(output_file, sep='\t', header=False) + transpose_files.append(output_file) + file_id += 1 + + # concatenate all files with transposed lines + output_transpose = input_file.replace('.tsv', '_transpose.tsv') + fo.concatenate_files(transpose_files, output_transpose) + + return output_transpose def merge_triangular_matrices(upper_matrix, lower_matrix, output_file, matrix_size): - """Merge two triangular matrices to create a symmetric matrix. - - Parameters - ---------- - upper_matrix : str - Path to the TSV file that contains the upper - triangular matrix. - lower_matrix : str - Path to the TSV file that contains the lower - triangular matrix. - output_file : str - Path to the output file to which the symmetric - matrix will be saved. - matrix_size : int - Total number of lines in the triangular matrix. - - Returns - ------- - None. 
- """ - with open(upper_matrix, 'r') as upper_handle, open(lower_matrix, 'r') as lower_handle: - upper_reader = csv.reader(upper_handle, delimiter='\t') - lower_reader = csv.reader(lower_handle, delimiter='\t') - - merged_lines = [] - for i in range(matrix_size): - upper_line = upper_reader.__next__() - lower_line = lower_reader.__next__() - merged_line = [e - if e != '' - else lower_line[i] - for i, e in enumerate(upper_line)] - merged_lines.append(merged_line) - - if len(merged_lines) >= 200 or i == (matrix_size-1): - merged_lines = [im.join_list(line, '\t') for line in merged_lines] - fo.write_lines(merged_lines, output_file, joiner='\n', write_mode='a') - merged_lines = [] + """Merge two triangular matrices to create a symmetric matrix. + + Parameters + ---------- + upper_matrix : str + Path to the TSV file that contains the upper + triangular matrix. + lower_matrix : str + Path to the TSV file that contains the lower + triangular matrix. + output_file : str + Path to the output file to which the symmetric + matrix will be saved. + matrix_size : int + Total number of lines in the triangular matrix. + + Returns + ------- + None. + """ + with open(upper_matrix, 'r') as upper_handle, open(lower_matrix, 'r') as lower_handle: + upper_reader = csv.reader(upper_handle, delimiter='\t') + lower_reader = csv.reader(lower_handle, delimiter='\t') + + merged_lines = [] + for i in range(matrix_size): + upper_line = upper_reader.__next__() + lower_line = lower_reader.__next__() + merged_line = [e + if e != '' + else lower_line[i] + for i, e in enumerate(upper_line)] + merged_lines.append(merged_line) + + if len(merged_lines) >= 200 or i == (matrix_size-1): + merged_lines = [im.join_list(line, '\t') for line in merged_lines] + fo.write_lines(merged_lines, output_file, joiner='\n', write_mode='a') + merged_lines = [] def symmetrify_matrix(input_matrix, matrix_size, tmp_directory): - """Symmetrify a triangular matrix. + """Symmetrify a triangular matrix. - Parameters - ---------- - input_matrix : str - Path to TSV file that contains the triangular matrix. - matrix_size : int - Total number of lines in input file. - tmp_directory : str - Path to the output temporary directory. + Parameters + ---------- + input_matrix : str + Path to TSV file that contains the triangular matrix. + matrix_size : int + Total number of lines in input file. + tmp_directory : str + Path to the output temporary directory. - Returns - ------- - symmetric_output : str - Path to the output file that contains the symmetric - matrix. - """ - output_transpose = transpose_matrix(input_matrix, tmp_directory) + Returns + ------- + symmetric_output : str + Path to the output file that contains the symmetric + matrix. 
+ """ + output_transpose = transpose_matrix(input_matrix, tmp_directory) - # merge upper and lower diagonal matrices into symmetric matrix - symmetric_output = input_matrix.replace('.tsv', '_symmetric.tsv') + # merge upper and lower diagonal matrices into symmetric matrix + symmetric_output = input_matrix.replace('.tsv', '_symmetric.tsv') - merge_triangular_matrices(input_matrix, output_transpose, - symmetric_output, matrix_size) + merge_triangular_matrices(input_matrix, output_transpose, + symmetric_output, matrix_size) - # delete files with triangular matrices - os.remove(input_matrix) - os.remove(output_transpose) + # delete files with triangular matrices + os.remove(input_matrix) + os.remove(output_transpose) - return symmetric_output + return symmetric_output def main(input_matrix, output_directory, cpu_cores, symmetric, masked): - """Compute a distance matrix based on allelic profiles. - - Parameters - ---------- - input_matrix : str - Path to a TSV file with allelic profiles determined by - the AlleleCall module. - output_directory : str - Path to the output directory. - cpu_cores : int - Number of CPU cores used to compute distances. - symmetric : bool - Determine a symmetric pairwise distance matrix, instead - of a triangular matrix. - masked : bool - False if the input matrix values are masked, True otherwise. - The process will mask the matrix values it this value is False. - - Returns - ------- - output_pairwise : str - Path to the TSV file that contains the distance matrix. - """ - # Create output directory if it does not exist - if os.path.isdir(output_directory) is False: - os.mkdir(output_directory) - - # determine input basename - input_basename = os.path.basename(input_matrix) - # remove extension that is after last '.' - input_basename = '.'.join(input_basename.split('.')[0:-1]) - - output_masked = os.path.join(output_directory, ct.MASKED_PROFILES_BASENAME) - if masked is False: - print('Masking profile matrix...', end='') - profiles_matrix = pd.read_csv(input_matrix, - header=0, index_col=0, - sep='\t', low_memory=False) - masked_profiles = profiles_matrix.apply(im.replace_chars) - masked_profiles.to_csv(output_masked, sep='\t') - print('masked matrix available at {0}'.format(output_masked)) - else: - output_masked = input_matrix - - # create temp directory to store pairwise distances per genome - tmp_directory = os.path.join(output_directory, 'temp', 'pairwise_distances') - if os.path.isdir(tmp_directory) is False: - os.mkdir(tmp_directory) - - # get sample identifiers - genome_ids = get_sample_ids(input_matrix, delimiter='\t') - total_genomes = len(genome_ids) - - np_matrix = tsv_to_nparray(output_masked) - - rows_indexes = [i for i in range(len(np_matrix))] - random.shuffle(rows_indexes) - # divide inputs into 20 lists for 5% progress resolution - parallel_inputs = im.divide_list_into_n_chunks(rows_indexes, 20) - - common_args = [[l, np_matrix, genome_ids, tmp_directory, compute_distances] - for l in parallel_inputs] - - # increasing cpu cores can greatly increase memory usage - print('Computing pairwise distances...') - results = mo.map_async_parallelizer(common_args, - mo.function_helper, - cpu_cores, - show_progress=True) - - merged = im.merge_dictionaries(results) - - print('\nCreating distance matrix...', end='') - # create files with headers - col_ids = ['FILE'] + genome_ids - output_pairwise = os.path.join(output_directory, ct.DISTANCE_MATRIX_BASENAME) - # output_p = os.path.join(output_directory, - # '{0}_shared_loci.tsv'.format(input_basename)) - - # Import 
arrays per genome and save to matrix file - # results = write_matrices(merged, genome_ids, output_pairwise, output_p, col_ids) - results = write_matrices(merged, genome_ids, output_pairwise, col_ids) - - if symmetric is True: - # add 1 to include header - output_pairwise = symmetrify_matrix(output_pairwise, - len(genome_ids)+1, - tmp_directory) - # output_p = symmetrify_matrix(output_p, - # len(genome_ids)+1, - # tmp_directory) - - print('done.') - - return [output_pairwise] + """Compute a distance matrix based on allelic profiles. + + Parameters + ---------- + input_matrix : str + Path to a TSV file with allelic profiles determined by + the AlleleCall module. + output_directory : str + Path to the output directory. + cpu_cores : int + Number of CPU cores used to compute distances. + symmetric : bool + Determine a symmetric pairwise distance matrix, instead + of a triangular matrix. + masked : bool + False if the input matrix values are masked, True otherwise. + The process will mask the matrix values it this value is False. + + Returns + ------- + output_pairwise : str + Path to the TSV file that contains the distance matrix. + """ + # Create output directory if it does not exist + if os.path.isdir(output_directory) is False: + os.mkdir(output_directory) + + # determine input basename + input_basename = os.path.basename(input_matrix) + # remove extension that is after last '.' + input_basename = '.'.join(input_basename.split('.')[0:-1]) + + output_masked = os.path.join(output_directory, ct.MASKED_PROFILES_BASENAME) + if masked is False: + print('Masking profile matrix...', end='') + profiles_matrix = pd.read_csv(input_matrix, + header=0, index_col=0, + sep='\t', low_memory=False) + masked_profiles = profiles_matrix.apply(im.replace_chars) + masked_profiles.to_csv(output_masked, sep='\t') + print('masked matrix available at {0}'.format(output_masked)) + else: + output_masked = input_matrix + + # create temp directory to store pairwise distances per genome + tmp_directory = os.path.join(output_directory, 'temp', 'pairwise_distances') + if os.path.isdir(tmp_directory) is False: + os.mkdir(tmp_directory) + + # get sample identifiers + genome_ids = get_sample_ids(input_matrix, delimiter='\t') + total_genomes = len(genome_ids) + + np_matrix = tsv_to_nparray(output_masked) + + rows_indexes = [i for i in range(len(np_matrix))] + random.shuffle(rows_indexes) + # divide inputs into 20 lists for 5% progress resolution + parallel_inputs = im.divide_list_into_n_chunks(rows_indexes, 20) + + common_args = [[l, np_matrix, genome_ids, tmp_directory, compute_distances] + for l in parallel_inputs] + + # increasing cpu cores can greatly increase memory usage + print('Computing pairwise distances...') + results = mo.map_async_parallelizer(common_args, + mo.function_helper, + cpu_cores, + show_progress=True) + + merged = im.merge_dictionaries(results) + + print('\nCreating distance matrix...', end='') + # create files with headers + col_ids = ['FILE'] + genome_ids + output_pairwise = os.path.join(output_directory, ct.DISTANCE_MATRIX_BASENAME) + # output_p = os.path.join(output_directory, + # '{0}_shared_loci.tsv'.format(input_basename)) + + # Import arrays per genome and save to matrix file + # results = write_matrices(merged, genome_ids, output_pairwise, output_p, col_ids) + results = write_matrices(merged, genome_ids, output_pairwise, col_ids) + + if symmetric is True: + # add 1 to include header + output_pairwise = symmetrify_matrix(output_pairwise, + len(genome_ids)+1, + tmp_directory) + # output_p = 
symmetrify_matrix(output_p, + # len(genome_ids)+1, + # tmp_directory) + + print('done.') + + return [output_pairwise] diff --git a/CHEWBBACA/utils/fasttree_wrapper.py b/CHEWBBACA/utils/fasttree_wrapper.py index 53fc12dc..ba30c50f 100644 --- a/CHEWBBACA/utils/fasttree_wrapper.py +++ b/CHEWBBACA/utils/fasttree_wrapper.py @@ -16,22 +16,22 @@ def call_fasttree(alignment_file, tree_file): - """Compute a phylogenetic tree based on MSA data. - - Parameters - ---------- - alignment_file : str - Path to a file with a MSA. - tree_file : str - Path to the output tree file in Newick format. - """ - proc = subprocess.Popen(['FastTree', '-fastest', '-nosupport', - '-noml', '-out', tree_file, alignment_file], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - - # Read the stdout from FastTree - stdout = proc.stdout.readlines() - stderr = proc.stderr.readlines() - - return [stdout, stderr] + """Compute a phylogenetic tree based on MSA data. + + Parameters + ---------- + alignment_file : str + Path to a file with a MSA. + tree_file : str + Path to the output tree file in Newick format. + """ + proc = subprocess.Popen(['FastTree', '-fastest', '-nosupport', + '-noml', '-out', tree_file, alignment_file], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + # Read the stdout from FastTree + stdout = proc.stdout.readlines() + stderr = proc.stderr.readlines() + + return [stdout, stderr] diff --git a/CHEWBBACA/utils/file_operations.py b/CHEWBBACA/utils/file_operations.py index 1cc418b2..f327f466 100644 --- a/CHEWBBACA/utils/file_operations.py +++ b/CHEWBBACA/utils/file_operations.py @@ -33,665 +33,665 @@ import pandas as pd try: - from utils import iterables_manipulation as im + from utils import iterables_manipulation as im except ModuleNotFoundError: - from CHEWBBACA.utils import iterables_manipulation as im + from CHEWBBACA.utils import iterables_manipulation as im def file_basename(file_path, file_extension=True): - """Extract file basename from a path. + """Extract file basename from a path. - Parameters - ---------- - file_path : str - Path to the file. - file_extension : bool - Specify if the basename should include the file - extension. + Parameters + ---------- + file_path : str + Path to the file. + file_extension : bool + Specify if the basename should include the file + extension. - Returns - ------- - basename : str - File basename extracted from input path. - """ - basename = os.path.basename(file_path) + Returns + ------- + basename : str + File basename extracted from input path. + """ + basename = os.path.basename(file_path) - if file_extension is False: - # Remove suffix after last '.' - basename = str((pathlib.Path(basename)).with_suffix('')) + if file_extension is False: + # Remove suffix after last '.' + basename = str((pathlib.Path(basename)).with_suffix('')) - return basename + return basename def split_joiner(string, indices, delimiter): - """ - """ - split_string = string.split(delimiter) - joined_string = delimiter.join([split_string[i] for i in indices]) + """ + """ + split_string = string.split(delimiter) + joined_string = delimiter.join([split_string[i] for i in indices]) - return joined_string + return joined_string def remove_files(files): - """Delete a list of files. + """Delete a list of files. - Parameters - ---------- - files : list - List with paths to the files to be deleted. - """ - for f in files: - if os.path.isfile(f) is True: - os.remove(f) + Parameters + ---------- + files : list + List with paths to the files to be deleted. 
+ """ + for f in files: + if os.path.isfile(f) is True: + os.remove(f) def hash_file(file, hash_object, buffer_size=65536): - """Compute hash based on the contents of a file. - - Parameters - ---------- - file : str - Path to a file. - hash_object : _hashlib.HASH - Hashlib object to update based on file - contents. - buffer_size : int - Buffer size (amount of data, in KB, to - read from file). - - Returns - ------- - hash_str : str - Hash computed from file contents. - """ - updated_hash = hash_object - - with open(file, 'rb') as f: - # read file in chunks - while True: - data = f.read(buffer_size) - if not data: - break - updated_hash.update(data) - - hash_str = updated_hash.hexdigest() - - return hash_str + """Compute hash based on the contents of a file. + + Parameters + ---------- + file : str + Path to a file. + hash_object : _hashlib.HASH + Hashlib object to update based on file + contents. + buffer_size : int + Buffer size (amount of data, in KB, to + read from file). + + Returns + ------- + hash_str : str + Hash computed from file contents. + """ + updated_hash = hash_object + + with open(file, 'rb') as f: + # read file in chunks + while True: + data = f.read(buffer_size) + if not data: + break + updated_hash.update(data) + + hash_str = updated_hash.hexdigest() + + return hash_str def filter_by_substring(files, suffixes, reverse=False): - """Filter files names based on a list of suffixes. + """Filter files names based on a list of suffixes. - Parameters - ---------- - files : list - A list with filenames or file paths. - suffixes : list - List with suffixes. - reverse : bool - False to select files that contain any of the suffixes. - True to select files that do not contain any of the - suffixes. + Parameters + ---------- + files : list + A list with filenames or file paths. + suffixes : list + List with suffixes. + reverse : bool + False to select files that contain any of the suffixes. + True to select files that do not contain any of the + suffixes. - Returns - ------- - filtered : list - List with files that passed filtering. - """ - filtered = [file for file in files - if any([suffix in file for suffix in suffixes])] + Returns + ------- + filtered : list + List with files that passed filtering. + """ + filtered = [file for file in files + if any([suffix in file for suffix in suffixes])] - if reverse is True: - filtered = list(set(files)-set(filtered)) + if reverse is True: + filtered = list(set(files)-set(filtered)) - return filtered + return filtered def filter_by_extension(files, extensions): - """Filter files based on file extension. - - Parameters - ---------- - files : list - A list with filenames or file paths. - extensions : list - List with file extensions. - - Returns - ------- - has_extension : list - List with files that end with one of the provided - file extensions. - no_extension : list - List with files that do not end with one of the provided - file extensions. - """ - has_extension = [file for file in files - if any([file.endswith(ext) for ext in extensions])] - no_extension = list(set(files)-set(has_extension)) - - return has_extension, no_extension + """Filter files based on file extension. + + Parameters + ---------- + files : list + A list with filenames or file paths. + extensions : list + List with file extensions. + + Returns + ------- + has_extension : list + List with files that end with one of the provided + file extensions. + no_extension : list + List with files that do not end with one of the provided + file extensions. 
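# Standalone illustration of the buffered hashing pattern used by hash_file
# above: the file is read in fixed-size chunks (in bytes) so large files do
# not have to be loaded into memory at once. The file path is a placeholder.
import hashlib


def sha256_of_file(path, buffer_size=65536):
	hasher = hashlib.sha256()
	with open(path, 'rb') as handle:
		while True:
			chunk = handle.read(buffer_size)
			if not chunk:
				break
			hasher.update(chunk)
	return hasher.hexdigest()


# print(sha256_of_file('schema_seed/locus1.fasta'))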
+ """ + has_extension = [file for file in files + if any([file.endswith(ext) for ext in extensions])] + no_extension = list(set(files)-set(has_extension)) + + return has_extension, no_extension def create_directory(directory_path): - """Create a diretory if it does not exist.""" - if not os.path.exists(directory_path): - os.makedirs(directory_path) - return True - else: - return False + """Create a diretory if it does not exist.""" + if not os.path.exists(directory_path): + os.makedirs(directory_path) + return True + else: + return False def join_paths(parent_path, child_paths): - """Create path by joining a parent directory and a list of child paths.""" - joined_paths = os.path.join(parent_path, *child_paths) + """Create path by joining a parent directory and a list of child paths.""" + joined_paths = os.path.join(parent_path, *child_paths) - return joined_paths + return joined_paths def listdir_fullpath(directory_path, substring_filter=False): - """Get the full path for all files in a directory. + """Get the full path for all files in a directory. - Parameters - ---------- - directory_path : str - Path to a directory. - substring_filter : str - Only list files that contain this substring. + Parameters + ---------- + directory_path : str + Path to a directory. + substring_filter : str + Only list files that contain this substring. - Returns - ------- - file_list : list - List containing the full path for every selected - file in the input directory. - """ - file_list = [os.path.join(directory_path, file) - for file in os.listdir(directory_path)] + Returns + ------- + file_list : list + List containing the full path for every selected + file in the input directory. + """ + file_list = [os.path.join(directory_path, file) + for file in os.listdir(directory_path)] - if substring_filter is not False: - file_list = filter_by_substring(file_list, [substring_filter]) + if substring_filter is not False: + file_list = filter_by_substring(file_list, [substring_filter]) - return file_list + return file_list def delete_directory(directory_path, max_retries=5): - """Delete a directory.""" - for i in range(max_retries): - # might fail to delete files due to permission issues - try: - shutil.rmtree(directory_path) - # path does not exist - # FileNotFoundError is a subclass of OSError - # the latter will catch the former if checked first - except FileNotFoundError: - break - # sleep before retry - except OSError: - time.sleep(i) - - # check if directory still exists - exists = os.path.isdir(directory_path) - if exists is True: - print('Could not remove {0}'.format(directory_path)) - - return exists + """Delete a directory.""" + for i in range(max_retries): + # might fail to delete files due to permission issues + try: + shutil.rmtree(directory_path) + # path does not exist + # FileNotFoundError is a subclass of OSError + # the latter will catch the former if checked first + except FileNotFoundError: + break + # sleep before retry + except OSError: + time.sleep(i) + + # check if directory still exists + exists = os.path.isdir(directory_path) + if exists is True: + print('Could not remove {0}'.format(directory_path)) + + return exists def read_file(input_file): - """Read a text file. + """Read a text file. - Parameters - ---------- - input_file : str - Path to the file to read. + Parameters + ---------- + input_file : str + Path to the file to read. - Returns - ------- - file_content : str - Single string with the file content. 
- """ - with open(input_file, 'r') as infile: - file_content = infile.read() + Returns + ------- + file_content : str + Single string with the file content. + """ + with open(input_file, 'r') as infile: + file_content = infile.read() - return file_content + return file_content def read_lines(input_file, strip=True, num_lines=None): - """Read lines in a file. + """Read lines in a file. - Parameters - ---------- - input_file : str - Path to the input file. - strip : bool - Specify if lines should be stripped of leading - and trailing white spaces and new line characters. + Parameters + ---------- + input_file : str + Path to the input file. + strip : bool + Specify if lines should be stripped of leading + and trailing white spaces and new line characters. - Returns - ------- - lines : list - List with the lines read from the input file. - """ - with open(input_file, 'r') as infile: - if num_lines is None: - lines = [line for line in infile.readlines()] - else: - lines = list(islice(infile, num_lines)) + Returns + ------- + lines : list + List with the lines read from the input file. + """ + with open(input_file, 'r') as infile: + if num_lines is None: + lines = [line for line in infile.readlines()] + else: + lines = list(islice(infile, num_lines)) - if strip is True: - lines = [line.strip() for line in lines] + if strip is True: + lines = [line.strip() for line in lines] - return lines + return lines def pickle_dumper(content, output_file): - """Use the Pickle module to serialize an object. + """Use the Pickle module to serialize an object. - Parameters - ---------- - content : type - Variable that refers to the object that will - be serialized and written to the output file. - output_file : str - Path to the output file. - """ - with open(output_file, 'wb') as poutfile: - pickle.dump(content, poutfile) + Parameters + ---------- + content : type + Variable that refers to the object that will + be serialized and written to the output file. + output_file : str + Path to the output file. + """ + with open(output_file, 'wb') as poutfile: + pickle.dump(content, poutfile) def pickle_loader(input_file): - """Use the Pickle module to de-serialize an object. + """Use the Pickle module to de-serialize an object. - Parameters - ---------- - input_file : str - Path to file with byte stream to be de-serialized. + Parameters + ---------- + input_file : str + Path to file with byte stream to be de-serialized. - Returns - ------- - content : type - Variable that refers to the de-serialized - object. - """ - with open(input_file, 'rb') as pinfile: - content = pickle.load(pinfile) + Returns + ------- + content : type + Variable that refers to the de-serialized + object. + """ + with open(input_file, 'rb') as pinfile: + content = pickle.load(pinfile) - return content + return content def file_zipper(input_file, zip_file): - """Zip (compresses) a file. + """Zip (compresses) a file. - Parameters - ---------- - input_file : str - Path to the file that will be compressed. - zip_file : str - Path to the ZIP file that will be created. + Parameters + ---------- + input_file : str + Path to the file that will be compressed. + zip_file : str + Path to the ZIP file that will be created. - Returns - ------- - zip_file : str - Path to the ZIP file that was created by - compressing the input file. 
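# Quick round-trip sketch for the pickle helpers above (pickle_dumper and
# pickle_loader): serialize an object to a binary file and read it back.
# The output path is a temporary file created only for this example.
import pickle
import tempfile

with tempfile.NamedTemporaryFile(delete=False) as tmp:
	pickle_path = tmp.name

coordinates = {'contig1': [(10, 250, 1)], 'contig2': [(5, 95, -1)]}
with open(pickle_path, 'wb') as poutfile:
	pickle.dump(coordinates, poutfile)

with open(pickle_path, 'rb') as pinfile:
	restored = pickle.load(pinfile)

assert restored == coordinates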
- """ - with zipfile.ZipFile(zip_file, 'w', compression=zipfile.ZIP_DEFLATED) as zf: - zf.write(input_file, os.path.basename(input_file)) + Returns + ------- + zip_file : str + Path to the ZIP file that was created by + compressing the input file. + """ + with zipfile.ZipFile(zip_file, 'w', compression=zipfile.ZIP_DEFLATED) as zf: + zf.write(input_file, os.path.basename(input_file)) - return zip_file + return zip_file def unzip_file(compressed_file, archive_type='.gz'): - """Uncompresse a file. + """Uncompresse a file. - Parameters - ---------- - compressed_file : str - Path to the compressed file. - archive_type : str - Archive format. + Parameters + ---------- + compressed_file : str + Path to the compressed file. + archive_type : str + Archive format. - Returns - ------- - uncompressed_file : str - Path to the uncompressed file. - """ - lines = [] - with gzip.open(compressed_file, 'rb') as f: - for line in f: - lines.append(line.decode()) + Returns + ------- + uncompressed_file : str + Path to the uncompressed file. + """ + lines = [] + with gzip.open(compressed_file, 'rb') as f: + for line in f: + lines.append(line.decode()) - # save uncompressed contents - uncompressed_file = compressed_file.rstrip('.gz') - write_lines(lines, uncompressed_file, joiner='') + # save uncompressed contents + uncompressed_file = compressed_file.rstrip('.gz') + write_lines(lines, uncompressed_file, joiner='') - return uncompressed_file + return uncompressed_file def download_file(file_url, outfile, max_tries=3): - """Download a file. - - Parameters - ---------- - file_url : str - URL to download file. - outfile : str - Path to the downloaded file. - max_tries : int - Maximum number of retries if the download - fails. - """ - tries = 0 - downloaded = False - while downloaded is False and tries <= max_tries: - try: - res = urllib.request.urlretrieve(file_url, outfile) - if os.path.isfile(outfile) is True: - downloaded = True - except Exception as e: - print(e) - time.sleep(1) - tries += 1 - - return downloaded + """Download a file. + + Parameters + ---------- + file_url : str + URL to download file. + outfile : str + Path to the downloaded file. + max_tries : int + Maximum number of retries if the download + fails. + """ + tries = 0 + downloaded = False + while downloaded is False and tries <= max_tries: + try: + res = urllib.request.urlretrieve(file_url, outfile) + if os.path.isfile(outfile) is True: + downloaded = True + except Exception as e: + print(e) + time.sleep(1) + tries += 1 + + return downloaded def concatenate_files(files, output_file, header=None): - """Concatenate files. - - Parameters - ---------- - files : list - List with the paths to the files to concatenate. - output_file : str - Path to the output file that will store the concatenation - of input files. - header : str or NoneType - Specify a header that should be written as the first line - in the output file. - - Returns - ------- - output_file : str - Path to the output file that was created with - the concatenation of input files. - """ - with open(output_file, 'w') as outfile: - if header is not None: - outfile.write(header) - for file in files: - with open(file, 'r') as infile: - shutil.copyfileobj(infile, outfile) - - return output_file + """Concatenate files. + + Parameters + ---------- + files : list + List with the paths to the files to concatenate. + output_file : str + Path to the output file that will store the concatenation + of input files. 
+ header : str or NoneType + Specify a header that should be written as the first line + in the output file. + + Returns + ------- + output_file : str + Path to the output file that was created with + the concatenation of input files. + """ + with open(output_file, 'w') as outfile: + if header is not None: + outfile.write(header) + for file in files: + with open(file, 'r') as infile: + shutil.copyfileobj(infile, outfile) + + return output_file def write_to_file(text, output_file, write_mode, end_char): - """Write a single string to a file. - - Parameters - ---------- - text : str - A single string to write to the output file. - output_file : str - Path to the output file. - write_mode : str - Specify write mode ('w' creates file if it does not - exist and truncates and over-writes existing file, - 'a' creates file if it does not exist and appends to - the end of file if it exists.). - end_char : str - Character added to the end of the file. - """ - with open(output_file, write_mode) as out: - out.write(text+end_char) + """Write a single string to a file. + + Parameters + ---------- + text : str + A single string to write to the output file. + output_file : str + Path to the output file. + write_mode : str + Specify write mode ('w' creates file if it does not + exist and truncates and over-writes existing file, + 'a' creates file if it does not exist and appends to + the end of file if it exists.). + end_char : str + Character added to the end of the file. + """ + with open(output_file, write_mode) as out: + out.write(text+end_char) def write_lines(lines, output_file, joiner='\n', write_mode='w'): - """Write a list of strings to a file. - - Parameters - ---------- - lines : list - List with the lines/strings to write to the output - file. - output_file : str - Path to the output file. - joiner : str - Character used to join lines. - write_mode : str - Specify write mode ('w' creates file if it does not - exist and truncates and over-writes existing file, - 'a' creates file if it does not exist and appends to - the end of file if it exists). - """ - joined_lines = im.join_list(lines, joiner) - - write_to_file(joined_lines, output_file, write_mode, '\n') + """Write a list of strings to a file. + + Parameters + ---------- + lines : list + List with the lines/strings to write to the output + file. + output_file : str + Path to the output file. + joiner : str + Character used to join lines. + write_mode : str + Specify write mode ('w' creates file if it does not + exist and truncates and over-writes existing file, + 'a' creates file if it does not exist and appends to + the end of file if it exists). + """ + joined_lines = im.join_list(lines, joiner) + + write_to_file(joined_lines, output_file, write_mode, '\n') def read_tabular(input_file, delimiter='\t'): - """Read a tabular (TSV) file. + """Read a tabular (TSV) file. - Parameters - ---------- - input_file : str - Path to a tabular file. - delimiter : str - Delimiter used to separate file fields. + Parameters + ---------- + input_file : str + Path to a tabular file. + delimiter : str + Delimiter used to separate file fields. - Returns - ------- - lines : list - A list with a sublist per line in the input file. - Each sublist has the fields that were separated by - the defined delimiter. - """ - with open(input_file, 'r') as infile: - reader = csv.reader(infile, delimiter=delimiter) - lines = [line for line in reader] + Returns + ------- + lines : list + A list with a sublist per line in the input file. 
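# Tiny end-to-end illustration of the writing/reading helpers above: join a
# list of rows with tabs, write them with a trailing newline (as write_lines
# and write_to_file do) and parse them back with csv.reader (as read_tabular
# does). The file name is a throwaway placeholder.
import csv

rows = [['FILE', 'locus1', 'locus2'],
	['genomeA', '1', '3'],
	['genomeB', '2', 'LNF']]
lines = ['\t'.join(row) for row in rows]

with open('toy_profiles.tsv', 'w') as out:
	out.write('\n'.join(lines) + '\n')

with open('toy_profiles.tsv', 'r') as infile:
	parsed = [line for line in csv.reader(infile, delimiter='\t')]

assert parsed == rows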
+ Each sublist has the fields that were separated by + the defined delimiter. + """ + with open(input_file, 'r') as infile: + reader = csv.reader(infile, delimiter=delimiter) + lines = [line for line in reader] - return lines + return lines def input_timeout(prompt, timeout=30): - """Add timeout feature when requesting user input. - - Parameters - ---------- - prompt : str - Message to print to stdout to request for user - input. - timeout : int - Maximum number of seconds that the process will - wait for input. - - Returns - ------- - String with user input. - - Raises - ------ - SystemExit - - If there is no user input before timeout. - """ - pool = ThreadPool(processes=1) - answer = pool.apply_async(input, args=[prompt]) - - try: - return answer.get(timeout=timeout) - except TimeoutError: - sys.exit('Timed out.') + """Add timeout feature when requesting user input. + + Parameters + ---------- + prompt : str + Message to print to stdout to request for user + input. + timeout : int + Maximum number of seconds that the process will + wait for input. + + Returns + ------- + String with user input. + + Raises + ------ + SystemExit + - If there is no user input before timeout. + """ + pool = ThreadPool(processes=1) + answer = pool.apply_async(input, args=[prompt]) + + try: + return answer.get(timeout=timeout) + except TimeoutError: + sys.exit('Timed out.') def is_file_empty(file_path): - """Check if file is empty by confirming if its size is 0 bytes.""" - # Check if file exist and it is empty - return os.path.exists(file_path) and os.stat(file_path).st_size == 0 + """Check if file is empty by confirming if its size is 0 bytes.""" + # Check if file exist and it is empty + return os.path.exists(file_path) and os.stat(file_path).st_size == 0 def is_file_empty_2(file_name): - """Check if file is empty by confirming if its size is 0 bytes.""" - # Check if file exist and it is empty - return os.path.isfile(file_name) and os.path.getsize(file_name) == 0 + """Check if file is empty by confirming if its size is 0 bytes.""" + # Check if file exist and it is empty + return os.path.isfile(file_name) and os.path.getsize(file_name) == 0 def is_file_empty_3(file_name): - """Check if file is empty by reading first character in it.""" - # open ile in read mode - with open(file_name, 'r') as read_obj: - # read first character - one_char = read_obj.read(1) - # if not fetched then file is empty - if not one_char: - return True - elif one_char == " ": - return True - return False + """Check if file is empty by reading first character in it.""" + # open ile in read mode + with open(file_name, 'r') as read_obj: + # read first character + one_char = read_obj.read(1) + # if not fetched then file is empty + if not one_char: + return True + elif one_char == " ": + return True + return False def create_short(schema_files, schema_dir): - """Create the 'short' directory for a chewBBACA schema. + """Create the 'short' directory for a chewBBACA schema. - Creates the 'short' directory inside a schema's main - directory and copies the FASTA files with the representative - alleles into the 'short' folder. + Creates the 'short' directory inside a schema's main + directory and copies the FASTA files with the representative + alleles into the 'short' folder. - Parameters - ---------- - schema_files : list - List with paths to FASTA files that contain the loci - representative alleles. - schema_dir : str - Path to the schema's directory. 
+ Parameters + ---------- + schema_files : list + List with paths to FASTA files that contain the loci + representative alleles. + schema_dir : str + Path to the schema's directory. - Returns - ------- - True on completion. - """ - short_path = join_paths(schema_dir, ['short']) - create_directory(short_path) + Returns + ------- + True on completion. + """ + short_path = join_paths(schema_dir, ['short']) + create_directory(short_path) - for file in schema_files: - locus_id = file_basename(file, False) - short_file = join_paths(short_path, [f'{locus_id}_short.fasta']) - shutil.copy(file, short_file) + for file in schema_files: + locus_id = file_basename(file, False) + short_file = join_paths(short_path, [f'{locus_id}_short.fasta']) + shutil.copy(file, short_file) - return True + return True def copy_file(source, destination): - """Copy a file to specified destination.""" - shutil.copy(source, destination) + """Copy a file to specified destination.""" + shutil.copy(source, destination) def move_file(source, destination): - """Move a file to specified destination.""" - shutil.move(source, destination) + """Move a file to specified destination.""" + shutil.move(source, destination) def matching_lines(input_file, pattern): - """Retrieve lines from a file that contain a pattern. + """Retrieve lines from a file that contain a pattern. - Parameters - ---------- - input_file : str - Path to input file. - pattern : str - Pattern to match. + Parameters + ---------- + input_file : str + Path to input file. + pattern : str + Pattern to match. - Returns - ------- - matched_lines : list - List with lines that contain the pattern. - """ - with open(input_file, 'r') as infile: - matched_lines = [line for line in infile if pattern in line] + Returns + ------- + matched_lines : list + List with lines that contain the pattern. + """ + with open(input_file, 'r') as infile: + matched_lines = [line for line in infile if pattern in line] - return matched_lines + return matched_lines def transpose_matrix(input_file, output_directory): - """Transpose a TSV file. - - Parameters - ---------- - input_file : str - Path to the input TSV file. - output_directory : str - Path to the directory to which intermediate files - and the complete transposed file will be written. - - Returns - ------- - transposed_file : str - Path to the file with the transposed matrix. - This file is created by concatenating all - intermediate files. - """ - intermediate_files = [] - with open(input_file, 'r') as infile: - # Get column identifiers - columns = [c.strip() for c in (infile.__next__()).split('\t')] - # Divide into smaller sets to avoid loading complete file - total_column_sets = math.ceil(len(columns)/100) - column_sets = im.divide_list_into_n_chunks(columns, total_column_sets) - # Use Pandas to read columns sets and save transpose - for i, c in enumerate(column_sets): - # dtype=str or Pandas converts values into floats - df = pd.read_csv(input_file, usecols=c, delimiter='\t', dtype=str) - output_file = join_paths(output_directory, ['chunk{0}.tsv'.format(i)]) - # Transpose columns - df = df.T - # Do not save header that contains row indexes - df.to_csv(output_file, sep='\t', header=False) - intermediate_files.append(output_file) - - # Concatenate all files with transposed lines - transposed_file = input_file.replace('.tsv', '_transpose.tsv') - concatenate_files(intermediate_files, transposed_file) - - # Delete intermediate files - remove_files(intermediate_files) - - return transposed_file + """Transpose a TSV file. 
+ + Parameters + ---------- + input_file : str + Path to the input TSV file. + output_directory : str + Path to the directory to which intermediate files + and the complete transposed file will be written. + + Returns + ------- + transposed_file : str + Path to the file with the transposed matrix. + This file is created by concatenating all + intermediate files. + """ + intermediate_files = [] + with open(input_file, 'r') as infile: + # Get column identifiers + columns = [c.strip() for c in (infile.__next__()).split('\t')] + # Divide into smaller sets to avoid loading complete file + total_column_sets = math.ceil(len(columns)/100) + column_sets = im.divide_list_into_n_chunks(columns, total_column_sets) + # Use Pandas to read columns sets and save transpose + for i, c in enumerate(column_sets): + # dtype=str or Pandas converts values into floats + df = pd.read_csv(input_file, usecols=c, delimiter='\t', dtype=str) + output_file = join_paths(output_directory, ['chunk{0}.tsv'.format(i)]) + # Transpose columns + df = df.T + # Do not save header that contains row indexes + df.to_csv(output_file, sep='\t', header=False) + intermediate_files.append(output_file) + + # Concatenate all files with transposed lines + transposed_file = input_file.replace('.tsv', '_transpose.tsv') + concatenate_files(intermediate_files, transposed_file) + + # Delete intermediate files + remove_files(intermediate_files) + + return transposed_file def count_repeated_matrix(matrix_file, ignore_values): - """Count number of repeated elements per line in a matrix. - - Parameters - ---------- - matrix_file : str - Path to the file that contains the matrix. - ignore_values : list - List of values to ignore when counting repeated elements. - - Returns - ------- - repeated_values : set - Set of values that appear more than once in at least a row - of the matrix. - repeated_counts : dict - Dictionary with row identifiers as keys and the total number - of times a value is repeated in that row as values. - """ - repeated_counts = {} - repeated_values = set() - with open(matrix_file, 'r') as infile: - csv_reader = csv.reader(infile, delimiter='\t') - header = csv_reader.__next__() - for i, l in enumerate(csv_reader): - genome_id = l[0] - # Count number of ocurrences for each hash/seqid - hashes = [h for h in l[1:] if h not in ignore_values] - hash_counts = Counter(hashes) - repeated_hashes = [count - for count in hash_counts.most_common() - if count[1] > 1] - for h in repeated_hashes: - repeated_values.add(h[0]) - # Store number of times a CDS is repeated -1 to adjust - # CDS counts per input - repeated_counts[genome_id] = sum((h[1]-1) for h in repeated_hashes) - - return [repeated_values, repeated_counts] + """Count number of repeated elements per line in a matrix. + + Parameters + ---------- + matrix_file : str + Path to the file that contains the matrix. + ignore_values : list + List of values to ignore when counting repeated elements. + + Returns + ------- + repeated_values : set + Set of values that appear more than once in at least a row + of the matrix. + repeated_counts : dict + Dictionary with row identifiers as keys and the total number + of times a value is repeated in that row as values. 
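# Standalone sketch of the repeat counting described in the docstring of
# count_repeated_matrix above: for one row of hashes, count which values occur
# more than once and how many surplus occurrences there are. Toy data only.
from collections import Counter

row_hashes = ['h1', 'h2', 'h1', 'h3', 'h1', 'LNF']
ignore = {'LNF'}

counts = Counter(h for h in row_hashes if h not in ignore)
repeated = [(value, total) for value, total in counts.most_common() if total > 1]
# subtract 1 per value so only the extra occurrences are counted
extra_occurrences = sum(total - 1 for value, total in repeated)

print(repeated)            # [('h1', 3)]
print(extra_occurrences)   # 2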
+ """ + repeated_counts = {} + repeated_values = set() + with open(matrix_file, 'r') as infile: + csv_reader = csv.reader(infile, delimiter='\t') + header = csv_reader.__next__() + for i, l in enumerate(csv_reader): + genome_id = l[0] + # Count number of ocurrences for each hash/seqid + hashes = [h for h in l[1:] if h not in ignore_values] + hash_counts = Counter(hashes) + repeated_hashes = [count + for count in hash_counts.most_common() + if count[1] > 1] + for h in repeated_hashes: + repeated_values.add(h[0]) + # Store number of times a CDS is repeated -1 to adjust + # CDS counts per input + repeated_counts[genome_id] = sum((h[1]-1) for h in repeated_hashes) + + return [repeated_values, repeated_counts] diff --git a/CHEWBBACA/utils/gene_prediction.py b/CHEWBBACA/utils/gene_prediction.py index f2a5eb95..aa352214 100644 --- a/CHEWBBACA/utils/gene_prediction.py +++ b/CHEWBBACA/utils/gene_prediction.py @@ -15,244 +15,244 @@ import pyrodigal try: - from utils import (constants as ct, - file_operations as fo, - fasta_operations as fao, - iterables_manipulation as im) + from utils import (constants as ct, + file_operations as fo, + fasta_operations as fao, + iterables_manipulation as im) except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - file_operations as fo, - fasta_operations as fao, - iterables_manipulation as im) + from CHEWBBACA.utils import (constants as ct, + file_operations as fo, + fasta_operations as fao, + iterables_manipulation as im) def create_gene_finder(training_data, closed, mask, meta): - """Create a Pyrodigal GeneFinder object. - - Parameters - ---------- - training_data : pyrodigal.TrainingInfo - A training info instance used to predict genes in single - mode. - closed : bool - True to prevent prediction of partial genes at edges of - sequences, False otherwise. - meta: bool - True to run Prodigal in `meta` mode (uses pre-trained - profiles). - - Returns - ------- - gene_finder : pyrodigal.GeneFinder - A GeneFinder object configured based on provided arguments. - """ - gene_finder = pyrodigal.GeneFinder(training_info=training_data, - closed=closed, - mask=mask, - meta=meta) - - return gene_finder + """Create a Pyrodigal GeneFinder object. + + Parameters + ---------- + training_data : pyrodigal.TrainingInfo + A training info instance used to predict genes in single + mode. + closed : bool + True to prevent prediction of partial genes at edges of + sequences, False otherwise. + meta: bool + True to run Prodigal in `meta` mode (uses pre-trained + profiles). + + Returns + ------- + gene_finder : pyrodigal.GeneFinder + A GeneFinder object configured based on provided arguments. + """ + gene_finder = pyrodigal.GeneFinder(training_info=training_data, + closed=closed, + mask=mask, + meta=meta) + + return gene_finder def train_gene_finder(gene_finder, sequences, translation_table): - """Train a Pyrodigal GeneFinder object based on a set of sequences. + """Train a Pyrodigal GeneFinder object based on a set of sequences. - Parameters - ---------- - gene_finder : pyrodigal.GeneFinder - A GeneFinder object. - sequences : list - Sequences used to train the GeneFinder (list - of bytes objects). - translation_table : int - Translation table to use. + Parameters + ---------- + gene_finder : pyrodigal.GeneFinder + A GeneFinder object. + sequences : list + Sequences used to train the GeneFinder (list + of bytes objects). + translation_table : int + Translation table to use. 
- Return - ------ - gene_finder : pyrodigal.GeneFinder - A GeneFinder object configured based on provided arguments. - """ - gene_finder.train(*sequences, translation_table=translation_table) + Return + ------ + gene_finder : pyrodigal.GeneFinder + A GeneFinder object configured based on provided arguments. + """ + gene_finder.train(*sequences, translation_table=translation_table) - return gene_finder + return gene_finder def read_training_file(training_file): - """Load training info for Pyrodigal from Prodigal training file. + """Load training info for Pyrodigal from Prodigal training file. - Parameters - ---------- - training_file : str - Path to Prodigal training file. + Parameters + ---------- + training_file : str + Path to Prodigal training file. - Returns - ------- - training_data : pyrodigal.TrainingInfo - The deserialized training info. - """ - with open(training_file, 'rb') as infile: - training_data = pyrodigal.TrainingInfo.load(infile) + Returns + ------- + training_data : pyrodigal.TrainingInfo + The deserialized training info. + """ + with open(training_file, 'rb') as infile: + training_data = pyrodigal.TrainingInfo.load(infile) - return training_data + return training_data def get_gene_info(contig_id, genome_id, protid, genes): - """Get genes information from a pyrodigal.Genes object. - - Parameters - ---------- - contig_id : str - The unique identifier of the sequence/contig. - genome_id : str - The unique identifier of the genome/file. - protid : int - The integer identifier to attriute to the first gene. - genes : pyrodigal.Genes - The list of genes predicted by Prodigal. - - Returns - ------- - gene_info : list - List with one sublist per gene predicted. Each sublist - includes the sequence SHA256 hash, the DNA sequence, the - genome identifier, the contig identifier, the start position - in the sequence, the end position, the integer identifier and - the strand the gene was identified in. - protid : int - The integer identifier to attribute to the first gene - in the next sequence/contig. - """ - gene_info = [] - for gene in genes: - sequence = gene.sequence() - sequence_hash = im.hash_sequence(sequence) - gene_info.append([sequence_hash, sequence, genome_id, contig_id, - str(gene.begin), str(gene.end), str(protid), - str(gene.strand)]) - protid += 1 - - return gene_info, protid + """Get genes information from a pyrodigal.Genes object. + + Parameters + ---------- + contig_id : str + The unique identifier of the sequence/contig. + genome_id : str + The unique identifier of the genome/file. + protid : int + The integer identifier to attriute to the first gene. + genes : pyrodigal.Genes + The list of genes predicted by Prodigal. + + Returns + ------- + gene_info : list + List with one sublist per gene predicted. Each sublist + includes the sequence SHA256 hash, the DNA sequence, the + genome identifier, the contig identifier, the start position + in the sequence, the end position, the integer identifier and + the strand the gene was identified in. + protid : int + The integer identifier to attribute to the first gene + in the next sequence/contig. + """ + gene_info = [] + for gene in genes: + sequence = gene.sequence() + sequence_hash = im.hash_sequence(sequence) + gene_info.append([sequence_hash, sequence, genome_id, contig_id, + str(gene.begin), str(gene.end), str(protid), + str(gene.strand)]) + protid += 1 + + return gene_info, protid def write_gene_fasta(gene_info, output_file): - """Write a FASTA file based on the results returned by `get_gene_info`. 
- - Parameters - ---------- - gene_info : list - List with the data for the genes returned by `get_gene_info`. - output_file : str - Path to the output FASTA file. - """ - fasta_sequences = [] - for gene in gene_info: - fasta_str = ct.FASTA_CDS_TEMPLATE.format(gene[2], gene[6], gene[1]) - fasta_sequences.append(fasta_str) - fo.write_lines(fasta_sequences, output_file) + """Write a FASTA file based on the results returned by `get_gene_info`. + + Parameters + ---------- + gene_info : list + List with the data for the genes returned by `get_gene_info`. + output_file : str + Path to the output FASTA file. + """ + fasta_sequences = [] + for gene in gene_info: + fasta_str = ct.FASTA_CDS_TEMPLATE.format(gene[2], gene[6], gene[1]) + fasta_sequences.append(fasta_str) + fo.write_lines(fasta_sequences, output_file) def write_coordinates_pickle(gene_info, contig_sizes, output_file): - """Write gene coordinates to a pickle file. - - Parameters - ---------- - gene_info : list - List with the data for the genes returned by `get_gene_info`. - contig_sizes : dict - Dictionary with contig/sequence identifiers as keys and - contig/sequence size as values. - output_file : str - Path to the output file. - """ - gene_coordinates = {} - for gene in gene_info: - gene_coordinates.setdefault(gene[0], []).append(gene[2:]) - fo.pickle_dumper([gene_coordinates, contig_sizes], output_file) + """Write gene coordinates to a pickle file. + + Parameters + ---------- + gene_info : list + List with the data for the genes returned by `get_gene_info`. + contig_sizes : dict + Dictionary with contig/sequence identifiers as keys and + contig/sequence size as values. + output_file : str + Path to the output file. + """ + gene_coordinates = {} + for gene in gene_info: + gene_coordinates.setdefault(gene[0], []).append(gene[2:]) + fo.pickle_dumper([gene_coordinates, contig_sizes], output_file) def predict_genome_genes(input_file, output_directory, gene_finder, - translation_table): - """Predict genes for sequences in a FASTA file. - - Parameters - ---------- - input_file : str - Path to the FASTA file. - output_directory : str - Path to the output_directory to store files with - the results. - gene_finder : pyrodigal.GeneFinder - A GeneFinder object. - translation_table : int - Translation table used to configure the GeneFinder - (None type if the GeneFinder does not need to be - configured). - - Returns - ------- - input_file : str - Path to the input FASTA file. - total_genome : int - Total number of genes predicted. - fasta_outfile : str - Path to the output FASTA file that contains the - predited gene sequences. - coordinates_outfile : str - Path to the output pickle file that contains the gene - coordinates and contig size data. 
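# Hedged sketch of the pyrodigal calls wrapped by the functions above. Meta
# mode is used here because it needs no training data, so the toy sequence
# below is enough to run the example; the wrappers instead train a GeneFinder
# on the input genome (single mode). The random sequence may yield no genes.
import pyrodigal

toy_contig = b"ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGA"

finder = pyrodigal.GeneFinder(meta=True)
genes = finder.find_genes(toy_contig)
for protid, gene in enumerate(genes, start=1):
	print(protid, gene.begin, gene.end, gene.strand)
	print(gene.sequence())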
- """ - # Get genome unique identifier - genome_basename = input_file[1] - records = fao.sequence_generator(input_file[0]) - records = {rec.id: bytes(rec.seq) for rec in records} - contig_sizes = {recid: len(sequence) - for recid, sequence in records.items()} - - # Train based on input sequences - # Only train if there is no GeneFinder object - if gene_finder is not None: - current_gene_finder = gene_finder - else: - current_gene_finder = create_gene_finder(None, True, True, False) - current_gene_finder = train_gene_finder(current_gene_finder, - records.values(), - translation_table) - - # Predict genes for all input contigs - contig_genes = {} - for recid, sequence in records.items(): - genes = current_gene_finder.find_genes(sequence) - contig_genes[recid] = genes - - # Extract data from Gene objects - protid = 1 - gene_info = [] + translation_table): + """Predict genes for sequences in a FASTA file. + + Parameters + ---------- + input_file : str + Path to the FASTA file. + output_directory : str + Path to the output_directory to store files with + the results. + gene_finder : pyrodigal.GeneFinder + A GeneFinder object. + translation_table : int + Translation table used to configure the GeneFinder + (None type if the GeneFinder does not need to be + configured). + + Returns + ------- + input_file : str + Path to the input FASTA file. + total_genome : int + Total number of genes predicted. + fasta_outfile : str + Path to the output FASTA file that contains the + predited gene sequences. + coordinates_outfile : str + Path to the output pickle file that contains the gene + coordinates and contig size data. + """ + # Get genome unique identifier + genome_basename = input_file[1] + records = fao.sequence_generator(input_file[0]) + records = {rec.id: bytes(rec.seq) for rec in records} + contig_sizes = {recid: len(sequence) + for recid, sequence in records.items()} + + # Train based on input sequences + # Only train if there is no GeneFinder object + if gene_finder is not None: + current_gene_finder = gene_finder + else: + current_gene_finder = create_gene_finder(None, True, True, False) + current_gene_finder = train_gene_finder(current_gene_finder, + records.values(), + translation_table) + + # Predict genes for all input contigs + contig_genes = {} + for recid, sequence in records.items(): + genes = current_gene_finder.find_genes(sequence) + contig_genes[recid] = genes + + # Extract data from Gene objects + protid = 1 + gene_info = [] # Store data about first and last CDS in each sequence to speedup PLOT classification - close_to_tip = {genome_basename: {}} - for recid, genes in contig_genes.items(): - data = get_gene_info(recid, genome_basename, protid, genes) - gene_info.extend(data[0]) - if len(data[0]) > 0: - first_cds = data[0][0] - close_to_tip[genome_basename].setdefault(first_cds[0], []).append((contig_sizes[first_cds[3]], int(first_cds[4]), int(first_cds[5]), first_cds[-1])) - if first_cds != data[0][-1]: - last_cds = data[0][-1] - close_to_tip[genome_basename].setdefault(last_cds[0], []).append((contig_sizes[last_cds[3]], int(last_cds[4]), int(last_cds[5]), last_cds[-1])) + close_to_tip = {genome_basename: {}} + for recid, genes in contig_genes.items(): + data = get_gene_info(recid, genome_basename, protid, genes) + gene_info.extend(data[0]) + if len(data[0]) > 0: + first_cds = data[0][0] + close_to_tip[genome_basename].setdefault(first_cds[0], []).append((contig_sizes[first_cds[3]], int(first_cds[4]), int(first_cds[5]), first_cds[-1])) + if first_cds != data[0][-1]: + last_cds = 
data[0][-1] + close_to_tip[genome_basename].setdefault(last_cds[0], []).append((contig_sizes[last_cds[3]], int(last_cds[4]), int(last_cds[5]), last_cds[-1])) # Reset protid based on the number of CDSs predicted for the sequence - protid = data[1] - - total_genome = len(gene_info) - fasta_outfile = None - coordinates_outfile = None - if total_genome > 0: - # Create FASTA file with DNA sequences - fasta_outfile = fo.join_paths(output_directory, - [f'{genome_basename}.fasta']) - write_gene_fasta(gene_info, fasta_outfile) - - # Save gene coordinates and contig sizes to pickle - coordinates_outfile = fo.join_paths(output_directory, - [f'{genome_basename}_coordinates']) - write_coordinates_pickle(gene_info, contig_sizes, coordinates_outfile) - - return [input_file, total_genome, fasta_outfile, coordinates_outfile, close_to_tip] + protid = data[1] + + total_genome = len(gene_info) + fasta_outfile = None + coordinates_outfile = None + if total_genome > 0: + # Create FASTA file with DNA sequences + fasta_outfile = fo.join_paths(output_directory, + [f'{genome_basename}.fasta']) + write_gene_fasta(gene_info, fasta_outfile) + + # Save gene coordinates and contig sizes to pickle + coordinates_outfile = fo.join_paths(output_directory, + [f'{genome_basename}_coordinates']) + write_coordinates_pickle(gene_info, contig_sizes, coordinates_outfile) + + return [input_file, total_genome, fasta_outfile, coordinates_outfile, close_to_tip] diff --git a/CHEWBBACA/utils/iterables_manipulation.py b/CHEWBBACA/utils/iterables_manipulation.py index 7cfc94d6..0b45a21b 100644 --- a/CHEWBBACA/utils/iterables_manipulation.py +++ b/CHEWBBACA/utils/iterables_manipulation.py @@ -16,1013 +16,1013 @@ import itertools try: - from utils import (constants as ct, - fasta_operations as fao) + from utils import (constants as ct, + fasta_operations as fao) except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - fasta_operations as fao) + from CHEWBBACA.utils import (constants as ct, + fasta_operations as fao) def match_regex(text, pattern): - """Extract substrings that match a regex pattern. + """Extract substrings that match a regex pattern. - Parameters - ---------- - text : str - Input text to search the pattern in. - pattern : str - Regular expression pattern. + Parameters + ---------- + text : str + Input text to search the pattern in. + pattern : str + Regular expression pattern. - Returns - ------- - match : str - Matched substring or NoneType if no matches - were found. - """ - match = re.search(pattern, text) - if match is not None: - match_span = match.span() - match = text[match_span[0]:match_span[1]] + Returns + ------- + match : str + Matched substring or NoneType if no matches + were found. + """ + match = re.search(pattern, text) + if match is not None: + match_span = match.span() + match = text[match_span[0]:match_span[1]] - return match + return match def join_list(lst, delimiter): - """Join all elements in a list into a single string. + """Join all elements in a list into a single string. - Parameters - ---------- - lst : list - List with elements to be joined. - delimiter : str - Character used to join list elements. + Parameters + ---------- + lst : list + List with elements to be joined. + delimiter : str + Character used to join list elements. - Returns - ------- - joined_list : str - A single string with all elements in the input - list joined by the character chosen as link. 
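# Toy run of the pattern used by match_regex above: re.search returns a match
# object whose span is used to slice the matched substring out of the text.
# The accession-style pattern and text are made up for this example.
import re

text = 'GCA_000007265.1_ASM726v1_genomic.fna'
pattern = r'GCA_\d+\.\d+'
match = re.search(pattern, text)
if match is not None:
	start, end = match.span()
	print(text[start:end])  # GCA_000007265.1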
- """ - joined_list = delimiter.join(lst) + Returns + ------- + joined_list : str + A single string with all elements in the input + list joined by the character chosen as link. + """ + joined_list = delimiter.join(lst) - return joined_list + return joined_list def flatten_list(list_to_flatten): - """Flatten one level of a nested list. + """Flatten one level of a nested list. - Parameters - ---------- - list_to_flatten : list - Nested list to flatten. + Parameters + ---------- + list_to_flatten : list + Nested list to flatten. - Returns - ------- - flattened_list : str - Input list flattened by one level. - """ - flattened_list = list(itertools.chain(*list_to_flatten)) + Returns + ------- + flattened_list : str + Input list flattened by one level. + """ + flattened_list = list(itertools.chain(*list_to_flatten)) - return flattened_list + return flattened_list def isListEmpty(input_list): - """Check if a nested list is empty.""" - if isinstance(input_list, list): - return all(map(isListEmpty, input_list)) if isinstance(input_list, list) else False + """Check if a nested list is empty.""" + if isinstance(input_list, list): + return all(map(isListEmpty, input_list)) if isinstance(input_list, list) else False def divide_list_into_n_chunks(list_to_divide, n): - """Divides a list into a defined number of sublists. + """Divides a list into a defined number of sublists. - Parameters - ---------- - list_to_divide : list - List to divide into sublists. - n : int - Number of sublists to create. + Parameters + ---------- + list_to_divide : list + List to divide into sublists. + n : int + Number of sublists to create. - Returns - ------- - sublists : list - List with the sublists created by dividing - the input list. - """ - sublists = [] - d, r = divmod(len(list_to_divide), n) - for i in range(n): - si = (d+1)*(i if i < r else r) + d*(0 if i < r else i - r) - sublists.append(list_to_divide[si:si+(d+1 if i < r else d)]) + Returns + ------- + sublists : list + List with the sublists created by dividing + the input list. + """ + sublists = [] + d, r = divmod(len(list_to_divide), n) + for i in range(n): + si = (d+1)*(i if i < r else r) + d*(0 if i < r else i - r) + sublists.append(list_to_divide[si:si+(d+1 if i < r else d)]) - # exclude lists that are empty due to small number of elements - sublists = [line for line in sublists if len(line) > 0] + # exclude lists that are empty due to small number of elements + sublists = [line for line in sublists if len(line) > 0] - return sublists + return sublists def merge_dictionaries(dictionaries_list, overwrite=False): - """Merge several dictionaries. - - Parameters - ---------- - dictionaries_list : list - A list with the dictionaries to merge. - overwrite : bool - True to overwrite values when there is a key - collision, False otherwise. - - Returns - ------- - merged_dicts : dict - A dictionary resulting from merging - all input dictionaries. - """ - merged_dicts = dictionaries_list[0] - if overwrite is True: - for d in dictionaries_list[1:]: - merged_dicts = {**merged_dicts, **d} - elif overwrite is False: - for d in dictionaries_list[1:]: - for k, v in d.items(): - if k in merged_dicts: - merged_dicts[k] = list(set.union(set(v), set(merged_dicts[k]))) - else: - merged_dicts[k] = v - - return merged_dicts + """Merge several dictionaries. + + Parameters + ---------- + dictionaries_list : list + A list with the dictionaries to merge. + overwrite : bool + True to overwrite values when there is a key + collision, False otherwise. 
+ + Returns + ------- + merged_dicts : dict + A dictionary resulting from merging + all input dictionaries. + """ + merged_dicts = dictionaries_list[0] + if overwrite is True: + for d in dictionaries_list[1:]: + merged_dicts = {**merged_dicts, **d} + elif overwrite is False: + for d in dictionaries_list[1:]: + for k, v in d.items(): + if k in merged_dicts: + merged_dicts[k] = list(set.union(set(v), set(merged_dicts[k]))) + else: + merged_dicts[k] = v + + return merged_dicts def invert_dictionary(dictionary): - """Invert a dictionary (key:value to value:key). + """Invert a dictionary (key:value to value:key). - Parameters - ---------- - dictionary : dict - Dictionary to be inverted. + Parameters + ---------- + dictionary : dict + Dictionary to be inverted. - Returns - ------- - inverted_dict : dict - Inverted dictionary. - """ - inverted_dict = {value: key for key, value in dictionary.items()} + Returns + ------- + inverted_dict : dict + Inverted dictionary. + """ + inverted_dict = {value: key for key, value in dictionary.items()} - return inverted_dict + return inverted_dict def split_iterable(iterable, size): - """Split a dictionary. - - Parameters - ---------- - iterable : dict - Dictionary to split. - size : int - Size of dictionaries created from the input - dictionary. - - Returns - ------- - chunks : list - List with dictionaries of defined size - resulting from splitting the input dictionary. - """ - chunks = [] - # create iterable from input object - # need to convert or slicing will always return the first elements - it = iter(iterable) - for i in range(0, len(iterable), size): - chunks.append({k: iterable[k] for k in itertools.islice(it, size)}) - - return chunks + """Split a dictionary. + + Parameters + ---------- + iterable : dict + Dictionary to split. + size : int + Size of dictionaries created from the input + dictionary. + + Returns + ------- + chunks : list + List with dictionaries of defined size + resulting from splitting the input dictionary. + """ + chunks = [] + # create iterable from input object + # need to convert or slicing will always return the first elements + it = iter(iterable) + for i in range(0, len(iterable), size): + chunks.append({k: iterable[k] for k in itertools.islice(it, size)}) + + return chunks def select_keys(dictionary, size): - """Select dictionary keys based on the number of values. + """Select dictionary keys based on the number of values. - Parameters - ---------- - dictionary : dict - Dictionary with key:value pairs to select. - size : int - Number of values used to select dictionary entries. + Parameters + ---------- + dictionary : dict + Dictionary with key:value pairs to select. + size : int + Number of values used to select dictionary entries. - Returns - ------- - selected : list - List with the dictionary keys that have a number - of matching values equal or greater/lesser than - `size`. - """ - selected = [k for k, v in dictionary.items() if len(v) == size] + Returns + ------- + selected : list + List with the dictionary keys that have a number + of matching values equal or greater/lesser than + `size`. + """ + selected = [k for k, v in dictionary.items() if len(v) == size] - return selected + return selected def prune_dictionary(dictionary, keys): - """Remove a set of keys from a dictionary. + """Remove a set of keys from a dictionary. - Parameters - ---------- - dictionary : dict - Input dictionary. - keys : list - List of keys to remove. + Parameters + ---------- + dictionary : dict + Input dictionary. 
+ keys : list + List of keys to remove. - Returns - ------- - pruned_dict : dict - Dictionary without the keys in the list - of keys to remove. - """ - pruned_dict = {k: v for k, v in dictionary.items() if k not in keys} + Returns + ------- + pruned_dict : dict + Dictionary without the keys in the list + of keys to remove. + """ + pruned_dict = {k: v for k, v in dictionary.items() if k not in keys} - return pruned_dict + return pruned_dict def extract_subsequence(sequence, start, stop): - """Extract a substring from a string. + """Extract a substring from a string. - Parameters - ---------- - sequence : str - Input string. - start : int - Substring start position in input string. - stop : int - Substring stop position in input string. + Parameters + ---------- + sequence : str + Input string. + start : int + Substring start position in input string. + stop : int + Substring stop position in input string. - Returns - ------- - subsequence : str - Substring extracted from input string. - """ - subsequence = sequence[start:stop] + Returns + ------- + subsequence : str + Substring extracted from input string. + """ + subsequence = sequence[start:stop] - return subsequence + return subsequence def extract_single_cds(sequence, start, stop, strand): - """Extract a coding sequence from a larger sequence. + """Extract a coding sequence from a larger sequence. - Parameters - ---------- - sequence : str - Contig that contains the coding sequence. - start : int - Coding sequence start position in contig. - stop : int - Coding sequence stop position in contig. - strand : int - Coding sequence orientation. + Parameters + ---------- + sequence : str + Contig that contains the coding sequence. + start : int + Coding sequence start position in contig. + stop : int + Coding sequence stop position in contig. + strand : int + Coding sequence orientation. - Returns - ------- - coding_sequence : str - Coding sequence extracted from input contig. - """ - coding_sequence = extract_subsequence(sequence, start, stop) + Returns + ------- + coding_sequence : str + Coding sequence extracted from input contig. + """ + coding_sequence = extract_subsequence(sequence, start, stop) - if strand == 0: - coding_sequence = reverse_complement(coding_sequence, ct.DNA_BASES) + if strand == 0: + coding_sequence = reverse_complement(coding_sequence, ct.DNA_BASES) - return coding_sequence + return coding_sequence def escape_special_characters(input_string): - """Escape strings to use in regex. + """Escape strings to use in regex. - Parameters - ---------- - input_string : str - String containing characters to escape. + Parameters + ---------- + input_string : str + String containing characters to escape. - Returns - ------- - escaped_string : str - Escaped string. - """ - escaped_string = re.escape(input_string) + Returns + ------- + escaped_string : str + Escaped string. + """ + escaped_string = re.escape(input_string) - return escaped_string + return escaped_string def replace_multiple_characters(input_string, replacements): - """Replace multiple characters in a string. + """Replace multiple characters in a string. - Parameters - ---------- - input_string : str - String with characters to be replaced. - replacements : list - List that contains sublists with two elements, - the element to be replaced and the element it should - be replaced by. + Parameters + ---------- + input_string : str + String with characters to be replaced. 
+ replacements : list + List that contains sublists with two elements, + the element to be replaced and the element it should + be replaced by. - Returns - ------- - replaced : str - Input string with replaced characters. - """ - for r in replacements: - if r[0] in input_string: - input_string = input_string.replace(*r) + Returns + ------- + replaced : str + Input string with replaced characters. + """ + for r in replacements: + if r[0] in input_string: + input_string = input_string.replace(*r) - return input_string + return input_string def reverse_complement(input_string, alphabet): - """Determine the reverse complement of a string. + """Determine the reverse complement of a string. - Parameters - ---------- - input_string : str - Input string. - alphabet : str - String that contains the alphabet used in - the input string (the reverse must be equal - to the desired complement). + Parameters + ---------- + input_string : str + Input string. + alphabet : str + String that contains the alphabet used in + the input string (the reverse must be equal + to the desired complement). - Returns - ------- - reverse_complement_string : str - The reverse complement of the input string. - """ - translation_table = str.maketrans(alphabet, alphabet[::-1]) + Returns + ------- + reverse_complement_string : str + The reverse complement of the input string. + """ + translation_table = str.maketrans(alphabet, alphabet[::-1]) - upper_string = input_string.upper() - complement_string = upper_string.translate(translation_table) - reverse_complement_string = reverse_str(complement_string) + upper_string = input_string.upper() + complement_string = upper_string.translate(translation_table) + reverse_complement_string = reverse_str(complement_string) - return reverse_complement_string + return reverse_complement_string def reverse_str(input_string): - """Reverse character order in a string. + """Reverse character order in a string. - Parameters - ---------- - input_string : str - String to be reversed. + Parameters + ---------- + input_string : str + String to be reversed. - Returns - ------- - revstr : str - Reverse of input string. - """ - revstr = input_string[::-1] + Returns + ------- + revstr : str + Reverse of input string. + """ + revstr = input_string[::-1] - return revstr + return revstr def check_str_alphabet(input_string, alphabet): - """Determine if a string only contains characters from specified alphabet. + """Determine if a string only contains characters from specified alphabet. - Parameters - ---------- - input_string : str - Input string. - alphabet : str - String that includes all characters in the - alphabet. + Parameters + ---------- + input_string : str + Input string. + alphabet : str + String that includes all characters in the + alphabet. - Returns - ------- - True if sequence only has characters from specified - alphabet, False otherwise. - """ - alphabet_chars = set(alphabet) - string_chars = set(input_string) + Returns + ------- + True if sequence only has characters from specified + alphabet, False otherwise. + """ + alphabet_chars = set(alphabet) + string_chars = set(input_string) - diff = string_chars - alphabet_chars + diff = string_chars - alphabet_chars - return len(diff) == 0 + return len(diff) == 0 def check_str_multiple(input_string, number): - """Determine if length of input string is a multiple of a specified number. + """Determine if length of input string is a multiple of a specified number. - Parameters - ---------- - input_string : str - Input string. 
- number : int - Length value should be a multiple of this number. + Parameters + ---------- + input_string : str + Input string. + number : int + Length value should be a multiple of this number. - Returns - ------- - True if the length of the sequence is a multiple of the - specified number, False otherwise. - """ - return (len(input_string) % number) == 0 + Returns + ------- + True if the length of the sequence is a multiple of the + specified number, False otherwise. + """ + return (len(input_string) % number) == 0 def hash_sequence(input_string, hash_type='sha256'): - """Compute hash of an input string. + """Compute hash of an input string. - Parameters - ---------- - input_string : str - Input string to hash. - hash_type : str - Hash type/function that will be used to compute the - hash (any of the hash functions available in the - hashlib module). + Parameters + ---------- + input_string : str + Input string to hash. + hash_type : str + Hash type/function that will be used to compute the + hash (any of the hash functions available in the + hashlib module). - Returns - ------- - hashed_string : str - String representation of the HASH object - in hexadecimal digits. - """ - # get hash function object from hashlib - hashing_function = getattr(hashlib, hash_type) + Returns + ------- + hashed_string : str + String representation of the HASH object + in hexadecimal digits. + """ + # get hash function object from hashlib + hashing_function = getattr(hashlib, hash_type) - # default encoding is UTF-8 - hashed_string = hashing_function(input_string.encode()).hexdigest() + # default encoding is UTF-8 + hashed_string = hashing_function(input_string.encode()).hexdigest() - return hashed_string + return hashed_string def string_kmerizer(input_string, k_value, offset=1, position=False): - """Decompose a string into k-mers. - - Parameters - ---------- - input_string : str - String to divide into k-mers. - k_value : int - Value for the size of k-mers. - offset : int - Value to indicate offset of consecutive k-mers. - position : bool - If the start position of the k-mers in the string - should be stored. - - Returns - ------- - kmers : list - List that contains the k-mers determined for the - input string. The list will contain strings if - it is not specified that positions should be - stored and tuples of k-mer and start position - if the position is stored. - """ - if position is False: - kmers = [input_string[i:i+k_value] - for i in range(0, len(input_string)-k_value+1, offset)] - elif position is True: - kmers = [(input_string[i:i+k_value], i) - for i in range(0, len(input_string)-k_value+1, offset)] - - return kmers + """Decompose a string into k-mers. + + Parameters + ---------- + input_string : str + String to divide into k-mers. + k_value : int + Value for the size of k-mers. + offset : int + Value to indicate offset of consecutive k-mers. + position : bool + If the start position of the k-mers in the string + should be stored. + + Returns + ------- + kmers : list + List that contains the k-mers determined for the + input string. The list will contain strings if + it is not specified that positions should be + stored and tuples of k-mer and start position + if the position is stored. 
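# Illustrative usage sketch (editor's note, not part of the whitespace patch):
# how string_kmerizer is expected to decompose a short sequence with the
# default offset of 1.
example_kmers = string_kmerizer('ACGTACGT', 3)
# ['ACG', 'CGT', 'GTA', 'TAC', 'ACG', 'CGT']
example_positional = string_kmerizer('ACGTACGT', 3, position=True)
# [('ACG', 0), ('CGT', 1), ('GTA', 2), ('TAC', 3), ('ACG', 4), ('CGT', 5)]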
+ """ + if position is False: + kmers = [input_string[i:i+k_value] + for i in range(0, len(input_string)-k_value+1, offset)] + elif position is True: + kmers = [(input_string[i:i+k_value], i) + for i in range(0, len(input_string)-k_value+1, offset)] + + return kmers def determine_minimizers(input_string, adjacent_kmers, k_value, offset=1, - position=False): - """Determine minimizers for a input string. - - Determines minimizers for a string based on lexicographical - order. Skips windows that cannot have a minimizer based on - the minimizer computed in the previous iteration. - - Parameters - ---------- - input_string : str - String representing the sequence. - adjacent_kmers : int - Window size value. Number of adjacent k-mers per group. - k_value : int - Value of k for the k-mer size. - offset : int - Value to indicate offset of consecutive k-mers. - position : bool - If the start position of the k-mers in the sequence - should be stored. - - Returns - ------- - minimizers : list - A list with the set of minimizers determined - for the input string. - """ - # break string into k-mers - kmers = string_kmerizer(input_string, k_value, offset, position) - - i = 0 - previous = None - sell = False - minimizers = [] - # determine total number of windows - last_window = (len(kmers)-adjacent_kmers) - while i <= last_window: - # get kmers in current window - window = kmers[i:i+adjacent_kmers] - # pick smallest kmer as minimizer - minimizer = [min(window)] - # get position in window of smallest minimizer - minimizer_idx = window.index(minimizer[0]) - # sliding window that does not included last minimizer - if previous is None: - # simply store smallest minimizer - minimizers.extend(minimizer) - # sliding window includes last minimizer because we - # skipped some sliding windows - else: - # check if minimizer is the same as the one picked - # in the last window - # Do not store minimizer if it is the same - if minimizer[0] != previous: - # get kmers smaller than last minimizer - skipped = window[1:minimizer_idx] - # determine if any of the smaller kmers is - # the minimizer of a skipped window - minimal = previous - for m in skipped: - if m < minimal: - minimizer.append(m) - minimal = m - minimizers.extend(minimizer) - - # slide by 1 if minimizer has index 0 in window - if minimizer_idx == 0: - i += 1 - previous = None - # skip sliding windows based on minimizer position - else: - i += minimizer_idx - # if adding minimizer index surpasses last window value we - # might miss one last minimizer because it will fail the condition - # find a better way to control this condition! - if i > last_window and sell is False: - i = last_window - sell = True - previous = minimizer[0] - - return minimizers + position=False): + """Determine minimizers for a input string. + + Determines minimizers for a string based on lexicographical + order. Skips windows that cannot have a minimizer based on + the minimizer computed in the previous iteration. + + Parameters + ---------- + input_string : str + String representing the sequence. + adjacent_kmers : int + Window size value. Number of adjacent k-mers per group. + k_value : int + Value of k for the k-mer size. + offset : int + Value to indicate offset of consecutive k-mers. + position : bool + If the start position of the k-mers in the sequence + should be stored. + + Returns + ------- + minimizers : list + A list with the set of minimizers determined + for the input string. 
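# Worked sketch (editor's note, not part of the whitespace patch): with k=3
# and windows of 2 adjacent k-mers, the lexicographically smallest k-mer of
# each window is kept, skipping windows already covered by the previous minimizer.
example_minimizers = determine_minimizers('GGCATT', 2, 3)
# ['GCA', 'CAT', 'ATT']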
+ """ + # break string into k-mers + kmers = string_kmerizer(input_string, k_value, offset, position) + + i = 0 + previous = None + sell = False + minimizers = [] + # determine total number of windows + last_window = (len(kmers)-adjacent_kmers) + while i <= last_window: + # get kmers in current window + window = kmers[i:i+adjacent_kmers] + # pick smallest kmer as minimizer + minimizer = [min(window)] + # get position in window of smallest minimizer + minimizer_idx = window.index(minimizer[0]) + # sliding window that does not included last minimizer + if previous is None: + # simply store smallest minimizer + minimizers.extend(minimizer) + # sliding window includes last minimizer because we + # skipped some sliding windows + else: + # check if minimizer is the same as the one picked + # in the last window + # Do not store minimizer if it is the same + if minimizer[0] != previous: + # get kmers smaller than last minimizer + skipped = window[1:minimizer_idx] + # determine if any of the smaller kmers is + # the minimizer of a skipped window + minimal = previous + for m in skipped: + if m < minimal: + minimizer.append(m) + minimal = m + minimizers.extend(minimizer) + + # slide by 1 if minimizer has index 0 in window + if minimizer_idx == 0: + i += 1 + previous = None + # skip sliding windows based on minimizer position + else: + i += minimizer_idx + # if adding minimizer index surpasses last window value we + # might miss one last minimizer because it will fail the condition + # find a better way to control this condition! + if i > last_window and sell is False: + i = last_window + sell = True + previous = minimizer[0] + + return minimizers def decode_str(str_list, encoding): - """Decode bytes objects. + """Decode bytes objects. - Decodes bytes objects in the input list and strips decoded - strings from whitespaces and newlines. + Decodes bytes objects in the input list and strips decoded + strings from whitespaces and newlines. - Parameters - ---------- - str_list - List with string or bytes objects to decode - and strip of whitespaces and newlines. - encoding : str - Encoding codec to use. + Parameters + ---------- + str_list + List with string or bytes objects to decode + and strip of whitespaces and newlines. + encoding : str + Encoding codec to use. - Returns - ------- - decoded : list - List with strings without whitespaces or - newlines. - """ - decoded = [m.decode(encoding).strip() - if type(m) == bytes - else m.strip() - for m in str_list] + Returns + ------- + decoded : list + List with strings without whitespaces or + newlines. + """ + decoded = [m.decode(encoding).strip() + if type(m) == bytes + else m.strip() + for m in str_list] - return decoded + return decoded # sorted key parameter is None by default def sort_iterable(data, sort_key=None, reverse=False): - """Sort an iterable. + """Sort an iterable. - Parameters - ---------- - data : iter - Iterable to sort. - sort_key - If provided, data will be sorted based - on this function. - reverse : bool - If sorting order should be inverted. + Parameters + ---------- + data : iter + Iterable to sort. + sort_key + If provided, data will be sorted based + on this function. + reverse : bool + If sorting order should be inverted. - Returns - ------- - sorted_data - Iterable with sorted elements. - """ - # sorted key parameter is None by default - sorted_data = sorted(data, key=sort_key, reverse=reverse) + Returns + ------- + sorted_data + Iterable with sorted elements. 
+ """ + # sorted key parameter is None by default + sorted_data = sorted(data, key=sort_key, reverse=reverse) - return sorted_data + return sorted_data def filter_list(lst, remove): - """Remove elements from a list. + """Remove elements from a list. - Parameters - ---------- - lst : list - Input list. - remove : list - List of elements to remove from input list. + Parameters + ---------- + lst : list + Input list. + remove : list + List of elements to remove from input list. - Returns - ------- - filtered_list : list - List without the removed elements. - """ - filtered_list = list(set(lst) - set(remove)) + Returns + ------- + filtered_list : list + List without the removed elements. + """ + filtered_list = list(set(lst) - set(remove)) - return filtered_list + return filtered_list def find_missing(integer_list): - """Find the set of integers missing from a list to make it consecutive. + """Find the set of integers missing from a list to make it consecutive. - Parameters - ---------- - integer_list : list - List containing integers. + Parameters + ---------- + integer_list : list + List containing integers. - Returns - ------- - missing_integers : list - Sorted list of missing integers. - """ - first = min(integer_list) - last = max(integer_list) - missing_integers = sorted(set(range(first, last+1)).difference(integer_list)) + Returns + ------- + missing_integers : list + Sorted list of missing integers. + """ + first = min(integer_list) + last = max(integer_list) + missing_integers = sorted(set(range(first, last+1)).difference(integer_list)) - return missing_integers + return missing_integers # Add new parameter that accepts method to sample kmers def kmer_index(fasta_file, word_size, fasta=True): - """Create a k-mer index from a set of sequences in a FASTA file. - - Parameters - ---------- - fasta_file : str - Path to a Fasta file. - word_size : int - Value k for the k-mer size. - fasta : bool - True if input is a FASTA file, False if it is a dictionary - with sequence identifiers as keys and sequences as values. - - Returns - ------- - kmers_mapping : dict - Dictionary with k-mers as keys and the - list of sequence identifiers of the - sequences that contain the kmers as - values. - seqs_kmers : dict - Dictionary with sequence identifiers - as keys and the set of distinct k-mers - for each sequence as values. - """ - if fasta is True: - sequences = fao.sequence_generator(fasta_file) - else: - sequences = fasta_file - - kmers_mapping = {} - for record in sequences: - try: - seqid = record - sequence = sequences[record] - except Exception as e: - seqid = record.id - sequence = str(record.seq) - - minimizers = determine_minimizers(sequence, word_size, - word_size, position=False) - kmers = set(minimizers) - - # Create dictionary with kmers as keys and list - # of sequences with given kmers as values - for kmer in kmers: - kmers_mapping.setdefault(kmer, []).append(seqid) - - return kmers_mapping + """Create a k-mer index from a set of sequences in a FASTA file. + + Parameters + ---------- + fasta_file : str + Path to a Fasta file. + word_size : int + Value k for the k-mer size. + fasta : bool + True if input is a FASTA file, False if it is a dictionary + with sequence identifiers as keys and sequences as values. + + Returns + ------- + kmers_mapping : dict + Dictionary with k-mers as keys and the + list of sequence identifiers of the + sequences that contain the kmers as + values. 
+ seqs_kmers : dict + Dictionary with sequence identifiers + as keys and the set of distinct k-mers + for each sequence as values. + """ + if fasta is True: + sequences = fao.sequence_generator(fasta_file) + else: + sequences = fasta_file + + kmers_mapping = {} + for record in sequences: + try: + seqid = record + sequence = sequences[record] + except Exception as e: + seqid = record.id + sequence = str(record.seq) + + minimizers = determine_minimizers(sequence, word_size, + word_size, position=False) + kmers = set(minimizers) + + # Create dictionary with kmers as keys and list + # of sequences with given kmers as values + for kmer in kmers: + kmers_mapping.setdefault(kmer, []).append(seqid) + + return kmers_mapping def contained_terms(iterable, terms): - """Find elements in an iterable that contain any term from a set of terms. - - Parameters - ---------- - iterable - An iterable such as a list with strings - or a dictionary. - terms : list - Terms to search for. - - Returns - ------- - matches : list - List with the elements of the iterable - that contain any of the searched terms. - """ - matches = [] - for e in iterable: - if any([term in e for term in terms]) is True: - matches.append(e) - - return matches + """Find elements in an iterable that contain any term from a set of terms. + + Parameters + ---------- + iterable + An iterable such as a list with strings + or a dictionary. + terms : list + Terms to search for. + + Returns + ------- + matches : list + List with the elements of the iterable + that contain any of the searched terms. + """ + matches = [] + for e in iterable: + if any([term in e for term in terms]) is True: + matches.append(e) + + return matches def integer_mapping(values, inverse=False): - """Create mapping between list elements and consecutive integers. - - Parameters - ---------- - values : iter - Input iterable. - inverse : bool - Invert mapping order, integers:values instead - of values:integers. - - Returns - ------- - mapping : dict - Dictionary with the mapping between the - elements in the input iterable and the - sequential integers. - """ - mapping = {v: i+1 for i, v in enumerate(values)} - if inverse is True: - mapping = invert_dictionary(mapping) - - return mapping + """Create mapping between list elements and consecutive integers. + + Parameters + ---------- + values : iter + Input iterable. + inverse : bool + Invert mapping order, integers:values instead + of values:integers. + + Returns + ------- + mapping : dict + Dictionary with the mapping between the + elements in the input iterable and the + sequential integers. + """ + mapping = {v: i+1 for i, v in enumerate(values)} + if inverse is True: + mapping = invert_dictionary(mapping) + + return mapping def multiprocessing_inputs(inputs, common_args, function): - """Create lists of inputs to process in parallel. - - Creates lists of inputs for the `map_async_parallelizer` - function from the `multiprocessing_operations` module. - - Parameters - ---------- - inputs : list - List with the distinct inputs. - common_args : list - List with the common arguments/inputs. - function : func - Function that will be parallelized by the - `map_async_parallelizer` function. - - Returns - ------- - input_groups : list - List with one sublist per distinct input. Each - sublist also contains the common arguments and - the function that will receive the arguments. 
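# Hedged sketch (editor's note, not part of the whitespace patch): packing
# per-input argument lists for map_async_parallelizer. The genome file names
# and the worker function below are hypothetical placeholders.
worker = lambda fasta, outdir, table: (fasta, outdir, table)
distinct_inputs = [['genome1.fasta'], ['genome2.fasta']]
common_args = ['output_dir', 11]
input_groups = multiprocessing_inputs(distinct_inputs, common_args, worker)
# [['genome1.fasta', 'output_dir', 11, worker],
#  ['genome2.fasta', 'output_dir', 11, worker]]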
- """ - input_groups = [] - # Create a list for each distinct input - for g in inputs: - new_input = g + common_args + [function] - input_groups.append(new_input) - - return input_groups + """Create lists of inputs to process in parallel. + + Creates lists of inputs for the `map_async_parallelizer` + function from the `multiprocessing_operations` module. + + Parameters + ---------- + inputs : list + List with the distinct inputs. + common_args : list + List with the common arguments/inputs. + function : func + Function that will be parallelized by the + `map_async_parallelizer` function. + + Returns + ------- + input_groups : list + List with one sublist per distinct input. Each + sublist also contains the common arguments and + the function that will receive the arguments. + """ + input_groups = [] + # Create a list for each distinct input + for g in inputs: + new_input = g + common_args + [function] + input_groups.append(new_input) + + return input_groups def aggregate_iterables(iterables): - """Aggregate elements with same index from several iterables. + """Aggregate elements with same index from several iterables. - Parameters - ---------- - iterables : list - List of iterables to aggreagate. + Parameters + ---------- + iterables : list + List of iterables to aggreagate. - Returns - ------- - aggregated_inputs : list - List with a sublist per group of elements - with same index that were aggregated. - """ - aggregated_inputs = [list(i) for i in zip(*iterables)] + Returns + ------- + aggregated_inputs : list + List with a sublist per group of elements + with same index that were aggregated. + """ + aggregated_inputs = [list(i) for i in zip(*iterables)] - return aggregated_inputs + return aggregated_inputs def mapping_function(values, function, args): - """Create mapping between input elements and function result. - - Parameters - ---------- - values : iter - Input iterable. - function : func - Funtion to apply to the elements in the - input iterable. - args : list - List of arguments to pass to the function. - - Returns - ------- - mapping : dict - Dictionary with the mapping between the - elements in the input iterable and the - result of applying the function to each - value. - """ - mapping = {v: function(v, *args) for v in values} - - return mapping + """Create mapping between input elements and function result. + + Parameters + ---------- + values : iter + Input iterable. + function : func + Funtion to apply to the elements in the + input iterable. + args : list + List of arguments to pass to the function. + + Returns + ------- + mapping : dict + Dictionary with the mapping between the + elements in the input iterable and the + result of applying the function to each + value. + """ + mapping = {v: function(v, *args) for v in values} + + return mapping def polyline_encoding(number_list, precision=0): - """Use the polyline algorithm to encode/compress a list of numbers. - - Parameters - ---------- - number_list : list - List with numbers to encode. - precision : int - Number of decimal places preserved by the encoding. - - Returns - ------- - compressed_values : str - A single string composed of ASCII characters - that represents the compressed list of numbers - to the desired level of precision. - - Notes - ----- - This implementation is based on the `numcompress` package - (https://github.com/amit1rrr/numcompress). 
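# Illustrative sketch (editor's note, not part of the whitespace patch):
# encoding a short list of integers with zero decimal places of precision
# should yield a compact ASCII string (something like '?EG?I' for the input below).
encoded = polyline_encoding([3, 7, 7, 12], precision=0)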
- """ - compressed_values = '' - # store the precision value - # to ensure proper character display, encoded values are summed with 63 - # (the ASCII character '?') - compressed_values += chr(precision+63) - previous_num = 0 - for number in number_list: - # encode the difference between numbers - difference = number - previous_num - # multiply and round to get integer that preserves decimal places - difference = int(round(difference*(10**precision))) - # left bitwise shift 1 position (0b11111 --> 0b111110) - # and invert the encoding if the original number was negative - # the bit added by the shift will be inverted and that allows - # to know that the number is negative - difference = ~(difference << 1) if difference < 0 else difference << 1 - - # process 5-bit chunks from right to left until we get a value that - # is smaller than 0b100000/0x20/32 - while difference >= 0x20: - # use 0x1f (0b11111) as bitmask to get the smallest 5-bit chunk - # and 0x20 is used as continuation bit to help decode - compressed_values += (chr((0x20 | (difference & 0x1f)) + 63)) - # right bitwise shift to exclude the 5-bit chunk that was encoded - difference >>= 5 - - # encode the last 5-bit chunk - compressed_values += (chr(difference + 63)) - # store number to subtract from next - previous_num = number - - return compressed_values + """Use the polyline algorithm to encode/compress a list of numbers. + + Parameters + ---------- + number_list : list + List with numbers to encode. + precision : int + Number of decimal places preserved by the encoding. + + Returns + ------- + compressed_values : str + A single string composed of ASCII characters + that represents the compressed list of numbers + to the desired level of precision. + + Notes + ----- + This implementation is based on the `numcompress` package + (https://github.com/amit1rrr/numcompress). + """ + compressed_values = '' + # store the precision value + # to ensure proper character display, encoded values are summed with 63 + # (the ASCII character '?') + compressed_values += chr(precision+63) + previous_num = 0 + for number in number_list: + # encode the difference between numbers + difference = number - previous_num + # multiply and round to get integer that preserves decimal places + difference = int(round(difference*(10**precision))) + # left bitwise shift 1 position (0b11111 --> 0b111110) + # and invert the encoding if the original number was negative + # the bit added by the shift will be inverted and that allows + # to know that the number is negative + difference = ~(difference << 1) if difference < 0 else difference << 1 + + # process 5-bit chunks from right to left until we get a value that + # is smaller than 0b100000/0x20/32 + while difference >= 0x20: + # use 0x1f (0b11111) as bitmask to get the smallest 5-bit chunk + # and 0x20 is used as continuation bit to help decode + compressed_values += (chr((0x20 | (difference & 0x1f)) + 63)) + # right bitwise shift to exclude the 5-bit chunk that was encoded + difference >>= 5 + + # encode the last 5-bit chunk + compressed_values += (chr(difference + 63)) + # store number to subtract from next + previous_num = number + + return compressed_values def decompress_number(text, index): - """Decode a single number from a string created with polyline encoding. - - Parameters - ---------- - text : str - String representing a compressed list of numbers. - index : int - Index of the first character to start decoding a number. 
- - Returns - ------- - Index to start decoding the next number and the number decoded - in the current function call. - """ - number = 0 - bitwise_shift = 0 - - while True: - # subtract 63 and remove bit from OR with 0x20 if 5-bit chunk - # is not the last to decode a number that was in the original list - n = (ord(text[index]) - 63) - index += 1 - # only continue if there is a continuation bit (0b100000) - # e.g.: 0b11111 only has 5 bits, meaning it is the last chunk - # that is necessary to decode the number - if n >= 0x20: - # subtract 0x20 (0b100000) to get the 5-bit chunk - n -= 0x20 - # contruct the binary number with biwise shift to add each - # 5-bit chunk to original position - number = number | (n << bitwise_shift) - # increment bitwise shift value for next 5-bit chunk - bitwise_shift += 5 - else: - break - - # add the last chunk, without continuation bit, to the leftmost position - number = number | (n << bitwise_shift) - - # invert bits to get negative number if sign bit is 1 - # remove sign bit and keep only bits for decoded value - return index, (~number >> 1) if (number & 1) != 0 else (number >> 1) + """Decode a single number from a string created with polyline encoding. + + Parameters + ---------- + text : str + String representing a compressed list of numbers. + index : int + Index of the first character to start decoding a number. + + Returns + ------- + Index to start decoding the next number and the number decoded + in the current function call. + """ + number = 0 + bitwise_shift = 0 + + while True: + # subtract 63 and remove bit from OR with 0x20 if 5-bit chunk + # is not the last to decode a number that was in the original list + n = (ord(text[index]) - 63) + index += 1 + # only continue if there is a continuation bit (0b100000) + # e.g.: 0b11111 only has 5 bits, meaning it is the last chunk + # that is necessary to decode the number + if n >= 0x20: + # subtract 0x20 (0b100000) to get the 5-bit chunk + n -= 0x20 + # contruct the binary number with biwise shift to add each + # 5-bit chunk to original position + number = number | (n << bitwise_shift) + # increment bitwise shift value for next 5-bit chunk + bitwise_shift += 5 + else: + break + + # add the last chunk, without continuation bit, to the leftmost position + number = number | (n << bitwise_shift) + + # invert bits to get negative number if sign bit is 1 + # remove sign bit and keep only bits for decoded value + return index, (~number >> 1) if (number & 1) != 0 else (number >> 1) def polyline_decoding(text): - """Decode a list of integers compressed with polyline encoding. - - Parameters - ---------- - text : str - String representing a compressed list of numbers. - - Returns - ------- - number_list : list - List with the decoded numbers. - """ - number_list = [] - index = last_num = 0 - # decode precision value - precision = ord(text[index]) - 63 - index += 1 - while index < len(text): - # decode a number and get index to start decoding next number - index, difference = decompress_number(text, index) - # add decoded difference to get next number in original list - last_num += difference - number_list.append(last_num) - - number_list = [round(item * (10 ** (-precision)), precision) for item in number_list] - - return number_list + """Decode a list of integers compressed with polyline encoding. + + Parameters + ---------- + text : str + String representing a compressed list of numbers. + + Returns + ------- + number_list : list + List with the decoded numbers. 
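# Round-trip sketch (editor's note, not part of the whitespace patch):
# decoding an encoded list should recover the original values at the
# precision stored in the first character of the string.
decoded = polyline_decoding(polyline_encoding([3, 7, 7, 12], precision=0))
# [3, 7, 7, 12]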
+ """ + number_list = [] + index = last_num = 0 + # decode precision value + precision = ord(text[index]) - 63 + index += 1 + while index < len(text): + # decode a number and get index to start decoding next number + index, difference = decompress_number(text, index) + # add decoded difference to get next number in original list + last_num += difference + number_list.append(last_num) + + number_list = [round(item * (10 ** (-precision)), precision) for item in number_list] + + return number_list def replace_list_values(input_list, replace_dict): - """Replace values in list based on provided substitutions. + """Replace values in list based on provided substitutions. - Parameters - ---------- - input_list : list - List with values to substitute. - replace_dict : dict - Mapping between the values to substitute - and the values to substitute by. + Parameters + ---------- + input_list : list + List with values to substitute. + replace_dict : dict + Mapping between the values to substitute + and the values to substitute by. - Returns - ------- - replaced_list : list - List with substituted values (values - that are not in `replace_dict` are kept - unchanged). - """ - replaced_list = [replace_dict.get(e, e) for e in input_list] + Returns + ------- + replaced_list : list + List with substituted values (values + that are not in `replace_dict` are kept + unchanged). + """ + replaced_list = [replace_dict.get(e, e) for e in input_list] - return replaced_list + return replaced_list def replace_chars(column, missing_replace='0'): - """Replace all non-numeric characters in a column with allele identifiers. - - Parameters - ---------- - column : pandas.core.series.Series - Pandas dataframe column. - - Returns - ------- - replace_missing : pandas.core.series.Series - Input column with cells that only contain - numeric characters. - """ - # remove 'INF-' from inferred alleles - replace_inf = column.replace(to_replace='INF-', - value='', regex=True) - # replace '*' in novel alleles from schemas in Chewie-NS - # before replacing missing data cases to avoid replacing '*' with '0' - replace_inf = replace_inf.replace(to_replace='\*', - value='', regex=True) - # replace missing data with - replace_missing = replace_inf.replace(to_replace='\D+.*', - value=missing_replace, regex=True) - - return replace_missing + """Replace all non-numeric characters in a column with allele identifiers. + + Parameters + ---------- + column : pandas.core.series.Series + Pandas dataframe column. + + Returns + ------- + replace_missing : pandas.core.series.Series + Input column with cells that only contain + numeric characters. 
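# Hedged example (editor's note, not part of the whitespace patch): cleaning
# a profile column so that only allele numbers remain; 'LNF' stands in for a
# typical missing-data tag.
import pandas as pd
column = pd.Series(['1', 'INF-2', '*3', 'LNF'])
replace_chars(column).tolist()
# ['1', '2', '3', '0']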
+ """ + # remove 'INF-' from inferred alleles + replace_inf = column.replace(to_replace='INF-', + value='', regex=True) + # replace '*' in novel alleles from schemas in Chewie-NS + # before replacing missing data cases to avoid replacing '*' with '0' + replace_inf = replace_inf.replace(to_replace='\*', + value='', regex=True) + # replace missing data with + replace_missing = replace_inf.replace(to_replace='\D+.*', + value=missing_replace, regex=True) + + return replace_missing def inclusive_range(start, stop, step): - """ - """ - i = start - while i < stop: - yield i - i += step - yield stop + """ + """ + i = start + while i < stop: + yield i + i += step + yield stop diff --git a/CHEWBBACA/utils/join_profiles.py b/CHEWBBACA/utils/join_profiles.py index 23bd2214..0cb37782 100755 --- a/CHEWBBACA/utils/join_profiles.py +++ b/CHEWBBACA/utils/join_profiles.py @@ -20,96 +20,96 @@ import pandas as pd try: - from utils import file_operations as fo + from utils import file_operations as fo except ModuleNotFoundError: - from CHEWBBACA.utils import file_operations as fo + from CHEWBBACA.utils import file_operations as fo def concatenate_profiles(files, loci_list, output_file): - """Concatenate allele calling results for a common set of loci. - - Parameters - ---------- - files : list - List with the paths to the TSV files with - allele calling results. - loci_list : list - List with the identifiers of common loci. - output_file : str - Path to the output file. - - Returns - ------- - total_profiles : int - Number of profiles written to the output file. - """ - total_profiles = 0 - for f in files: - # create dataframe with columns for loci in list - profiles = pd.read_csv(f, sep='\t', - usecols=loci_list, dtype=str) - # reorder columns to prevent concatenating results - # with same loci but with different loci/column order - profiles = profiles[loci_list] - total_profiles += len(profiles) - # save dataframe to file - profiles.to_csv(output_file, mode='a', - header=not os.path.exists(output_file), - sep='\t', index=False) - - return total_profiles + """Concatenate allele calling results for a common set of loci. + + Parameters + ---------- + files : list + List with the paths to the TSV files with + allele calling results. + loci_list : list + List with the identifiers of common loci. + output_file : str + Path to the output file. + + Returns + ------- + total_profiles : int + Number of profiles written to the output file. + """ + total_profiles = 0 + for f in files: + # create dataframe with columns for loci in list + profiles = pd.read_csv(f, sep='\t', + usecols=loci_list, dtype=str) + # reorder columns to prevent concatenating results + # with same loci but with different loci/column order + profiles = profiles[loci_list] + total_profiles += len(profiles) + # save dataframe to file + profiles.to_csv(output_file, mode='a', + header=not os.path.exists(output_file), + sep='\t', index=False) + + return total_profiles def main(profiles, output_file, common): - """Join files with allelic profiles. - - Parameters - ---------- - profiles : list - List with paths to TSV files with allelic profiles. - output_file : str - Path to the output file. - common : bool - If the process should join profile data only for shared loci - when the profiles do not share the same loci sets. - """ - if len(profiles) == 1: - sys.exit('Provided a single file. 
Nothing to do.') - - headers = [] - for file in profiles: - header = fo.read_lines(file, strip=True, num_lines=1) - headers.append(header[0].split('\t')) - - if common is False: - # check if headers are equal - if all([set(headers[0]) == set(h) for h in headers[1:]]) is True: - print('Profiles have {0} loci.'.format(len(headers[0])-1)) - total_profiles = concatenate_profiles(profiles, - headers[0], - output_file) - else: - sys.exit('Files have different sets of loci. Please provide ' - 'files with the results for the same set of loci or ' - 'provide the "--common" parameter to create a file ' - 'with the results for the set of common loci.') - else: - # determine set of common loci - common_loci = headers[0] - for h in headers[1:]: - # determine common loci ordered based on headers in first file - latest_common = [locus for locus in common_loci if locus in h] - # update set of common loci so far - common_loci = latest_common - - if len(latest_common) <= 1: - sys.exit('Profiles do not have loci in common.') - - print('Profiles have {0} loci in common.' - ''.format(len(latest_common)-1)) - total_profiles = concatenate_profiles(profiles, - latest_common, - output_file) - - print('Joined {0} files with a total of {1} ' - 'profiles.'.format(len(profiles), total_profiles)) + """Join files with allelic profiles. + + Parameters + ---------- + profiles : list + List with paths to TSV files with allelic profiles. + output_file : str + Path to the output file. + common : bool + If the process should join profile data only for shared loci + when the profiles do not share the same loci sets. + """ + if len(profiles) == 1: + sys.exit('Provided a single file. Nothing to do.') + + headers = [] + for file in profiles: + header = fo.read_lines(file, strip=True, num_lines=1) + headers.append(header[0].split('\t')) + + if common is False: + # check if headers are equal + if all([set(headers[0]) == set(h) for h in headers[1:]]) is True: + print('Profiles have {0} loci.'.format(len(headers[0])-1)) + total_profiles = concatenate_profiles(profiles, + headers[0], + output_file) + else: + sys.exit('Files have different sets of loci. Please provide ' + 'files with the results for the same set of loci or ' + 'provide the "--common" parameter to create a file ' + 'with the results for the set of common loci.') + else: + # determine set of common loci + common_loci = headers[0] + for h in headers[1:]: + # determine common loci ordered based on headers in first file + latest_common = [locus for locus in common_loci if locus in h] + # update set of common loci so far + common_loci = latest_common + + if len(latest_common) <= 1: + sys.exit('Profiles do not have loci in common.') + + print('Profiles have {0} loci in common.' + ''.format(len(latest_common)-1)) + total_profiles = concatenate_profiles(profiles, + latest_common, + output_file) + + print('Joined {0} files with a total of {1} ' + 'profiles.'.format(len(profiles), total_profiles)) diff --git a/CHEWBBACA/utils/mafft_wrapper.py b/CHEWBBACA/utils/mafft_wrapper.py index 9c754b54..e5323d9b 100644 --- a/CHEWBBACA/utils/mafft_wrapper.py +++ b/CHEWBBACA/utils/mafft_wrapper.py @@ -16,43 +16,43 @@ import subprocess try: - from utils import constants as ct + from utils import constants as ct except ModuleNotFoundError: - from CHEWBBACA.utils import constants as ct + from CHEWBBACA.utils import constants as ct def call_mafft(input_file, output_file): - """Call MAFFT to compute a MSA. 
- - Parameters - ---------- - input_file : str - Path to a FASTA file with the sequences to align. - output_file : str - Path to the output file created by MAFFT. - - Returns - ------- - output_file : str - Path to the output file. - outfile_exists : bool - True if the output file was created, False otherwise. - stdout : bytes - MAFFT stdout. - stderr : bytes - MAFFT stderr. - """ - mafft_cmd = [ct.MAFFT_ALIAS, '--thread', '1', '--treeout', '--retree', '1', - '--maxiterate', '0', input_file, '>', output_file] - mafft_cmd = ' '.join(mafft_cmd) - - # Must include subprocess.PIPE to get stdout and stderr - mafft_cmd = subprocess.Popen(mafft_cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - stdout, stderr = mafft_cmd.communicate() - - outfile_exists = os.path.exists(output_file) - - return [output_file, outfile_exists, stdout, stderr] + """Call MAFFT to compute a MSA. + + Parameters + ---------- + input_file : str + Path to a FASTA file with the sequences to align. + output_file : str + Path to the output file created by MAFFT. + + Returns + ------- + output_file : str + Path to the output file. + outfile_exists : bool + True if the output file was created, False otherwise. + stdout : bytes + MAFFT stdout. + stderr : bytes + MAFFT stderr. + """ + mafft_cmd = [ct.MAFFT_ALIAS, '--thread', '1', '--treeout', '--retree', '1', + '--maxiterate', '0', input_file, '>', output_file] + mafft_cmd = ' '.join(mafft_cmd) + + # Must include subprocess.PIPE to get stdout and stderr + mafft_cmd = subprocess.Popen(mafft_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + stdout, stderr = mafft_cmd.communicate() + + outfile_exists = os.path.exists(output_file) + + return [output_file, outfile_exists, stdout, stderr] diff --git a/CHEWBBACA/utils/multiprocessing_operations.py b/CHEWBBACA/utils/multiprocessing_operations.py index c6c0ae34..8f552a9f 100644 --- a/CHEWBBACA/utils/multiprocessing_operations.py +++ b/CHEWBBACA/utils/multiprocessing_operations.py @@ -17,229 +17,229 @@ import multiprocessing.pool try: - from utils import iterables_manipulation as im + from utils import iterables_manipulation as im except ModuleNotFoundError: - from CHEWBBACA.utils import iterables_manipulation as im + from CHEWBBACA.utils import iterables_manipulation as im def function_helper(input_args): - """Run function with provided inputs and capture exceptions. - - Parameters - ---------- - input_args : list - List with function inputs and function object to call - in the last index. - - Returns - ------- - results : list - List with the results returned by the function. - If an exception is raised it returns a list with - the name of the function and the exception traceback. - """ - try: - results = input_args[-1](*input_args[0:-1]) - except Exception as e: - func_name = (input_args[-1]).__name__ - traceback_lines = traceback.format_exc() - traceback_text = ''.join(traceback_lines) - print('\nError on {0}:\n{1}\n'.format(func_name, traceback_text), flush=True) - results = [func_name, traceback_text] - - return results + """Run function with provided inputs and capture exceptions. + + Parameters + ---------- + input_args : list + List with function inputs and function object to call + in the last index. + + Returns + ------- + results : list + List with the results returned by the function. + If an exception is raised it returns a list with + the name of the function and the exception traceback. 
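# Minimal sketch (editor's note, not part of the whitespace patch): the
# callable goes in the last position of the argument list; its return value
# is passed through, or the function name and traceback are returned on error.
result = function_helper([2, 3, pow])
# 8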
+ """ + try: + results = input_args[-1](*input_args[0:-1]) + except Exception as e: + func_name = (input_args[-1]).__name__ + traceback_lines = traceback.format_exc() + traceback_text = ''.join(traceback_lines) + print('\nError on {0}:\n{1}\n'.format(func_name, traceback_text), flush=True) + results = [func_name, traceback_text] + + return results def map_async_parallelizer(inputs, function, cpu, callback='extend', - chunksize=1, show_progress=False, pool_type='pool'): - """Run function in parallel. - - Parameters - ---------- - inputs : list - List with inputs to process. - function : func - Function to be parallelized. - cpu : int - Number of processes to create (based on the - number of CPU cores). - callback : str - Results can be appended, "append", to the - list that stores results or the list of results - can be extended, "extend". - chunksize : int - Size of input chunks that will be passed to - each process. The function will create groups - of inputs with this number of elements. - show_progress : bool - True to show a progress bar with the percentage - of inputs that have been processed, False - otherwise. - pool_type : str - The multiprocessing.pool object that will be used, - Pool or ThreadPool. - - Returns - ------- - results : list - List with the results returned for each function - call. - """ - if pool_type == 'pool': - multiprocessing_function = multiprocessing.pool.Pool - # Gene prediction uses ThreadPool because Pyrodigal might hang with Pool - elif pool_type == 'threadpool': - multiprocessing_function = multiprocessing.pool.ThreadPool - - results = [] - # Use context manager to join and close pool automatically - with multiprocessing_function(cpu) as pool: - if callback == 'extend': - rawr = pool.map_async(function, inputs, - callback=results.extend, chunksize=chunksize) - elif callback == 'append': - rawr = pool.map_async(function, inputs, - callback=results.append, chunksize=chunksize) - - if show_progress is True: - progress = None - while progress != 100: - progress = progress_bar(rawr._number_left, len(inputs), progress) - - rawr.wait() - - return results + chunksize=1, show_progress=False, pool_type='pool'): + """Run function in parallel. + + Parameters + ---------- + inputs : list + List with inputs to process. + function : func + Function to be parallelized. + cpu : int + Number of processes to create (based on the + number of CPU cores). + callback : str + Results can be appended, "append", to the + list that stores results or the list of results + can be extended, "extend". + chunksize : int + Size of input chunks that will be passed to + each process. The function will create groups + of inputs with this number of elements. + show_progress : bool + True to show a progress bar with the percentage + of inputs that have been processed, False + otherwise. + pool_type : str + The multiprocessing.pool object that will be used, + Pool or ThreadPool. + + Returns + ------- + results : list + List with the results returned for each function + call. 
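# Minimal sketch (editor's note, not part of the whitespace patch):
# parallelizing a simple worker over packed input groups with the
# function_helper defined above.
from operator import add
input_groups = [[1, 2, add], [3, 4, add]]
results = map_async_parallelizer(input_groups, function_helper, cpu=2)
# [3, 7]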
+ """ + if pool_type == 'pool': + multiprocessing_function = multiprocessing.pool.Pool + # Gene prediction uses ThreadPool because Pyrodigal might hang with Pool + elif pool_type == 'threadpool': + multiprocessing_function = multiprocessing.pool.ThreadPool + + results = [] + # Use context manager to join and close pool automatically + with multiprocessing_function(cpu) as pool: + if callback == 'extend': + rawr = pool.map_async(function, inputs, + callback=results.extend, chunksize=chunksize) + elif callback == 'append': + rawr = pool.map_async(function, inputs, + callback=results.append, chunksize=chunksize) + + if show_progress is True: + progress = None + while progress != 100: + progress = progress_bar(rawr._number_left, len(inputs), progress) + + rawr.wait() + + return results def progress_bar(remaining, total, previous, tickval=5, ticknum=20): - """Create and print a progress bar to the stdout. - - Parameters - ---------- - remaining : int - Number of remaining tasks to complete. - total : int - Total number of inputs that have to be processed. - previous : int - Percentage of tasks that had been completed in the - previous function call. - tickval : int - Progress completion percentage value for each - tick. - ticknum : int - Total number of ticks in progress bar. - - Returns - ------- - completed : bool - Boolean indicating if all inputs have been processed. - """ - # determine percentage of processed inputs - progress = int(100-(remaining/total)*100) - # only print if percentage has changed - if progress != previous: - progress_tick = progress//tickval - progress_bar = '[{0}{1}] {2}%'.format('='*progress_tick, - ' '*(ticknum-progress_tick), - progress) - print('\r', progress_bar, end='') - - time.sleep(0.1) - - return progress + """Create and print a progress bar to the stdout. + + Parameters + ---------- + remaining : int + Number of remaining tasks to complete. + total : int + Total number of inputs that have to be processed. + previous : int + Percentage of tasks that had been completed in the + previous function call. + tickval : int + Progress completion percentage value for each + tick. + ticknum : int + Total number of ticks in progress bar. + + Returns + ------- + completed : bool + Boolean indicating if all inputs have been processed. + """ + # determine percentage of processed inputs + progress = int(100-(remaining/total)*100) + # only print if percentage has changed + if progress != previous: + progress_tick = progress//tickval + progress_bar = '[{0}{1}] {2}%'.format('='*progress_tick, + ' '*(ticknum-progress_tick), + progress) + print('\r', progress_bar, end='') + + time.sleep(0.1) + + return progress def distribute_loci(inputs, cores, method): - """Create balanced lists of loci to efficiently parallelize function calls. - - Creates balanced lists of loci to distribute per number of - available cores. Loci lists can be created based on the number - of sequences per locus (seqcount), the mean length of the - sequences (length) in each locus or the product of both values - (seqcount+length). - - Parameters - ---------- - inputs : list - List with one sublist per locus. Each sublist has - a locus identifier, the total number of sequences - and sequence mean legth for that locus. - cores : int - The number of loci groups that should be created. - Based on the number of CPU cores that will be - used to process the inputs. 
- method : str - "seqcount" to create loci lists based on the total - number of sequences, "length" to split based - on mean length of sequences and "seqcount+length" to - split based on both criteria. - - Returns - ------- - splitted_ids : list - List with sublists that contain loci identifiers. - Sublists are balanced based on the chosen method. - """ - # initialize list with sublists to store inputs - splitted_ids = [[] for cpu in range(cores)] - # initialize list with chosen criterion values - # for each sublist of inputs - splitted_values = [0 for cpu in range(cores)] - i = 0 - for locus in inputs: - if method == 'seqcount': - splitted_values[i] += locus[1] - elif method == 'length': - splitted_values[i] += locus[4] - elif method == 'seqcount+length': - splitted_values[i] += locus[1] * locus[4] - splitted_ids[i].append(locus[0]) - # at the end of each iteration, choose the sublist - # with lowest criterion value - i = splitted_values.index(min(splitted_values)) - - return splitted_ids + """Create balanced lists of loci to efficiently parallelize function calls. + + Creates balanced lists of loci to distribute per number of + available cores. Loci lists can be created based on the number + of sequences per locus (seqcount), the mean length of the + sequences (length) in each locus or the product of both values + (seqcount+length). + + Parameters + ---------- + inputs : list + List with one sublist per locus. Each sublist has + a locus identifier, the total number of sequences + and sequence mean legth for that locus. + cores : int + The number of loci groups that should be created. + Based on the number of CPU cores that will be + used to process the inputs. + method : str + "seqcount" to create loci lists based on the total + number of sequences, "length" to split based + on mean length of sequences and "seqcount+length" to + split based on both criteria. + + Returns + ------- + splitted_ids : list + List with sublists that contain loci identifiers. + Sublists are balanced based on the chosen method. + """ + # initialize list with sublists to store inputs + splitted_ids = [[] for cpu in range(cores)] + # initialize list with chosen criterion values + # for each sublist of inputs + splitted_values = [0 for cpu in range(cores)] + i = 0 + for locus in inputs: + if method == 'seqcount': + splitted_values[i] += locus[1] + elif method == 'length': + splitted_values[i] += locus[4] + elif method == 'seqcount+length': + splitted_values[i] += locus[1] * locus[4] + splitted_ids[i].append(locus[0]) + # at the end of each iteration, choose the sublist + # with lowest criterion value + i = splitted_values.index(min(splitted_values)) + + return splitted_ids def parallelize_function(function, inputs, common_args=None, - cpu_cores=1, show_progress=False): - """Create list of inputs and parallelize function calls. - - Parameters - ---------- - function : func - Function to be parallelized. - inputs : list - List of inputs to divide into sublists. - common_args : list - List of common arguments to add to each sublist. - The common args are values that will be passed to the - function. - cpu_cores : int - Number of CPU cores used to parallelize the function. - show_progress : bool - True to show a progress bar for the percentage of - inputs that have been processed, False otherwise. - - Returns - ------- - results : list - List with the results returned by the function calls. 
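# Worked example of the greedy balancing above (hypothetical loci records): each record
# is assumed to hold the locus identifier at index 0, the sequence count at index 1 and
# the mean sequence length at index 4, matching the indices used in the function.
loci_records = [['locus1', 50, None, None, 300],
				['locus2', 10, None, None, 900],
				['locus3', 40, None, None, 250]]
# With method='seqcount' and cores=2, locus1 fills the first group (50 sequences) and
# locus2 plus locus3 fill the second (10+40), keeping the totals balanced.
distribute_loci(loci_records, 2, 'seqcount')   # [['locus1'], ['locus2', 'locus3']]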
- """ - # create chunks to distribute per cores - input_lists = im.divide_list_into_n_chunks(inputs, len(inputs)) - - if common_args is None: - common_args = [] - - # add common arguments to all sublists - input_lists = im.multiprocessing_inputs(input_lists, - common_args, - function) - - results = map_async_parallelizer(input_lists, - function_helper, - cpu_cores, - show_progress=show_progress) - - return results + cpu_cores=1, show_progress=False): + """Create list of inputs and parallelize function calls. + + Parameters + ---------- + function : func + Function to be parallelized. + inputs : list + List of inputs to divide into sublists. + common_args : list + List of common arguments to add to each sublist. + The common args are values that will be passed to the + function. + cpu_cores : int + Number of CPU cores used to parallelize the function. + show_progress : bool + True to show a progress bar for the percentage of + inputs that have been processed, False otherwise. + + Returns + ------- + results : list + List with the results returned by the function calls. + """ + # create chunks to distribute per cores + input_lists = im.divide_list_into_n_chunks(inputs, len(inputs)) + + if common_args is None: + common_args = [] + + # add common arguments to all sublists + input_lists = im.multiprocessing_inputs(input_lists, + common_args, + function) + + results = map_async_parallelizer(input_lists, + function_helper, + cpu_cores, + show_progress=show_progress) + + return results diff --git a/CHEWBBACA/utils/process_datetime.py b/CHEWBBACA/utils/process_datetime.py index fcb45304..70042ae6 100644 --- a/CHEWBBACA/utils/process_datetime.py +++ b/CHEWBBACA/utils/process_datetime.py @@ -18,139 +18,139 @@ def get_datetime(): - """Return datetime object for current date and hour.""" - current_datetime = dt.datetime.now() + """Return datetime object for current date and hour.""" + current_datetime = dt.datetime.now() - return current_datetime + return current_datetime def datetime_str(datetime_obj, date_format='%Y-%m-%dT%H:%M:%S'): - """Convert datetime module object to formatted string. + """Convert datetime module object to formatted string. - Parameters - ---------- - datetime_obj : datetime.datetime - Datetime module object. - date_format : str - Format for string representation of the date - object. + Parameters + ---------- + datetime_obj : datetime.datetime + Datetime module object. + date_format : str + Format for string representation of the date + object. - Returns - ------- - dt_str : str - String representation of the date according - to specified format. - """ - dt_str = dt.datetime.strftime(datetime_obj, date_format) + Returns + ------- + dt_str : str + String representation of the date according + to specified format. + """ + dt_str = dt.datetime.strftime(datetime_obj, date_format) - return dt_str + return dt_str def datetime_obj(datetime_str, date_format='%Y-%m-%dT%H:%M:%S'): - """Convert formatted string to datetime module object. + """Convert formatted string to datetime module object. - Parameters - ---------- - datetime_str : str - String to convert to datetime object. - date_format : str - Format of the string representation of the date - object. + Parameters + ---------- + datetime_str : str + String to convert to datetime object. + date_format : str + Format of the string representation of the date + object. - Returns - ------- - dt_obj : datetime.datetime - Datetime module object. 
- """ - dt_obj = dt.datetime.strptime(datetime_str, date_format) + Returns + ------- + dt_obj : datetime.datetime + Datetime module object. + """ + dt_obj = dt.datetime.strptime(datetime_str, date_format) - return dt_obj + return dt_obj def datetime_diff(sdate, edate): - """Return the difference in minutes and seconds between two dates. - - Parameters - ---------- - sdate : datetime.datetime - Datetime module object corresponding to - the oldest date. - edate : datetime.datetime - Datetime module object corresponding to - the most recent date. - - Returns - ------- - A list with the following elements: - minutes : float - Time difference between the two dates - in minutes. - seconds : float - The remaining time difference in seconds. - """ - delta = edate - sdate - minutes, seconds = divmod(delta.total_seconds(), 60) - - return [minutes, seconds] + """Return the difference in minutes and seconds between two dates. + + Parameters + ---------- + sdate : datetime.datetime + Datetime module object corresponding to + the oldest date. + edate : datetime.datetime + Datetime module object corresponding to + the most recent date. + + Returns + ------- + A list with the following elements: + minutes : float + Time difference between the two dates + in minutes. + seconds : float + The remaining time difference in seconds. + """ + delta = edate - sdate + minutes, seconds = divmod(delta.total_seconds(), 60) + + return [minutes, seconds] def validate_date(date, date_format='%Y-%m-%dT%H:%M:%S.%f'): - """Check if date is in specified format. - - Parameters - ---------- - date : str - String representing a date. - date_format : str - Date format. - - Returns - ------- - valid : str - The string representing the date if it - is in the specified format. - """ - valid = False - try: - date = dt.datetime.strptime(date, date_format) - valid = date - except ValueError: - date = dt.datetime.strptime(date+'.0', date_format) - valid = date - - return valid + """Check if date is in specified format. + + Parameters + ---------- + date : str + String representing a date. + date_format : str + Date format. + + Returns + ------- + valid : str + The string representing the date if it + is in the specified format. 
+ """ + valid = False + try: + date = dt.datetime.strptime(date, date_format) + valid = date + except ValueError: + date = dt.datetime.strptime(date+'.0', date_format) + valid = date + + return valid def process_header(process): - """Print a header with the name of the process.""" - header = f'chewBBACA - {process}' - hf = '='*(len(header)+4) - print(f'{hf}\n {header}\n{hf}') + """Print a header with the name of the process.""" + header = f'chewBBACA - {process}' + hf = '='*(len(header)+4) + print(f'{hf}\n {header}\n{hf}') # Decorator to time main processes def process_timer(func): - # Use functools to preserve info about wrapped function - @functools.wraps(func) - def wrapper(*args, **kwargs): - # Get process name and print header - process_header(sys.argv[1]) - - # Do not measure time if it is only needed to print the help message - if any([option in ['-h', '--help'] for option in sys.argv]) is False: - start = get_datetime() - start_str = datetime_str(start) - print(f'Started at: {start_str}\n') - - # Run function - func(*args, **kwargs) - - # Does not print elapsed time if the help message is printed - end = get_datetime() - end_str = datetime_str(end) - print(f'\nFinished at: {end_str}') - - minutes, seconds = datetime_diff(start, end) - print(f'Took {minutes: .0f}m{seconds: .0f}s.') - - return wrapper + # Use functools to preserve info about wrapped function + @functools.wraps(func) + def wrapper(*args, **kwargs): + # Get process name and print header + process_header(sys.argv[1]) + + # Do not measure time if it is only needed to print the help message + if any([option in ['-h', '--help'] for option in sys.argv]) is False: + start = get_datetime() + start_str = datetime_str(start) + print(f'Started at: {start_str}\n') + + # Run function + func(*args, **kwargs) + + # Does not print elapsed time if the help message is printed + end = get_datetime() + end_str = datetime_str(end) + print(f'\nFinished at: {end_str}') + + minutes, seconds = datetime_diff(start, end) + print(f'Took {minutes: .0f}m{seconds: .0f}s.') + + return wrapper diff --git a/CHEWBBACA/utils/profile_hasher.py b/CHEWBBACA/utils/profile_hasher.py index c2048fc1..da2ebceb 100644 --- a/CHEWBBACA/utils/profile_hasher.py +++ b/CHEWBBACA/utils/profile_hasher.py @@ -16,193 +16,193 @@ import pandas as pd try: - from utils import (file_operations as fo, - fasta_operations as fao, - iterables_manipulation as im, - multiprocessing_operations as mo) + from utils import (file_operations as fo, + fasta_operations as fao, + iterables_manipulation as im, + multiprocessing_operations as mo) except ModuleNotFoundError: - from CHEWBBACA.utils import (file_operations as fo, - fasta_operations as fao, - iterables_manipulation as im, - multiprocessing_operations as mo) + from CHEWBBACA.utils import (file_operations as fo, + fasta_operations as fao, + iterables_manipulation as im, + multiprocessing_operations as mo) def hash_column(column, locus_file, hashing_function): - """Substitute allele identifiers by allele sequence hashes. - - Parameters - ---------- - column : pandas.core.series.Series - Column with allele identifiers to substitute by the - hash values computed from each allele sequence. - locus_file : str - Path to the FASTA file that contains the locus alleles. - hashing_function : func - Hashing function used to hash each allele. - - Returns - ------- - hashed_column : pandas.core.series.Series - Column where each allele identifier was substituted by - the hash computed from the allele sequence. 
- """ - # read Fasta files with locus alleles - locus_alleles = {(rec.id).split('_')[-1].replace('*', ''): str(rec.seq) - for rec in fao.sequence_generator(locus_file[0])} - if len(locus_file) > 1: - novel_records = {(rec.id).split('_')[-1].replace('*', ''): str(rec.seq) - for rec in fao.sequence_generator(locus_file[1])} - locus_alleles = im.merge_dictionaries([locus_alleles, novel_records], True) - - hashed_alleles = {} - for seqid, seq in locus_alleles.items(): - # hash function does not accept string object, encode to get bytes object - hashed_seq = hashing_function(seq.encode()) - # bitwise operation to convert crc32 and adler32 hashes to unsigned - # integer and ensure the computed value is the same for Python 2 & 3 - if isinstance(hashed_seq, int): - hashed_seq &= 0xffffffff - else: - hashed_seq = hashed_seq.hexdigest() - hashed_alleles[seqid] = hashed_seq - - # faster than replace or map with update to avoid adding NaN - hashed_column = column.apply(lambda x: hashed_alleles.get(x, x)) - - return hashed_column + """Substitute allele identifiers by allele sequence hashes. + + Parameters + ---------- + column : pandas.core.series.Series + Column with allele identifiers to substitute by the + hash values computed from each allele sequence. + locus_file : str + Path to the FASTA file that contains the locus alleles. + hashing_function : func + Hashing function used to hash each allele. + + Returns + ------- + hashed_column : pandas.core.series.Series + Column where each allele identifier was substituted by + the hash computed from the allele sequence. + """ + # read Fasta files with locus alleles + locus_alleles = {(rec.id).split('_')[-1].replace('*', ''): str(rec.seq) + for rec in fao.sequence_generator(locus_file[0])} + if len(locus_file) > 1: + novel_records = {(rec.id).split('_')[-1].replace('*', ''): str(rec.seq) + for rec in fao.sequence_generator(locus_file[1])} + locus_alleles = im.merge_dictionaries([locus_alleles, novel_records], True) + + hashed_alleles = {} + for seqid, seq in locus_alleles.items(): + # hash function does not accept string object, encode to get bytes object + hashed_seq = hashing_function(seq.encode()) + # bitwise operation to convert crc32 and adler32 hashes to unsigned + # integer and ensure the computed value is the same for Python 2 & 3 + if isinstance(hashed_seq, int): + hashed_seq &= 0xffffffff + else: + hashed_seq = hashed_seq.hexdigest() + hashed_alleles[seqid] = hashed_seq + + # faster than replace or map with update to avoid adding NaN + hashed_column = column.apply(lambda x: hashed_alleles.get(x, x)) + + return hashed_column def hash_profiles(profiles_table, loci_ids, loci_files, hashing_function, - nrows, skiprows, output_directory): - """Hash a set of allelic profiles read from a TSV file. - - Parameters - ---------- - profiles_table : str - Path to the TSV file that contains the allelic profiles. - loci_ids : list - List with the loci identifiers. - loci_files : list - List with the paths to the FASTA files that contain the - loci alleles. - hashing_function : func - Hashing function used to hash each allele. - nrows : int - Number of rows/allelic profiles to read from the input - file. - skiprows : range - Range of rows to skip. - output_directory : str - Path to the output directory. - - Returns - ------- - output_file : str - Path to the output file with the hashed profiles. 
- """ - current_rows = pd.read_csv(profiles_table, delimiter='\t', dtype=str, - skiprows=skiprows, nrows=nrows, index_col=0) - - # Remove all 'INF-' prefixes, missing data and '*' from identifiers - current_rows = current_rows.apply(im.replace_chars, args=('-')) - - hashed_profiles = [] - for locus in loci_ids: - locus_column = current_rows[locus] - hashed_column = hash_column(locus_column, loci_files[locus], - hashing_function) - hashed_profiles.append(hashed_column) - - hashed_df = pd.concat(hashed_profiles, axis=1) - start = skiprows.stop - stop = skiprows.stop+len(hashed_df)-1 - input_basename = fo.file_basename(profiles_table, False) - output_file = fo.join_paths(output_directory, - ['{0}_{1}-{2}_hashed.tsv'.format(input_basename, start, stop)]) - hashed_df.to_csv(output_file, sep='\t', index=True, header=False) - - return output_file + nrows, skiprows, output_directory): + """Hash a set of allelic profiles read from a TSV file. + + Parameters + ---------- + profiles_table : str + Path to the TSV file that contains the allelic profiles. + loci_ids : list + List with the loci identifiers. + loci_files : list + List with the paths to the FASTA files that contain the + loci alleles. + hashing_function : func + Hashing function used to hash each allele. + nrows : int + Number of rows/allelic profiles to read from the input + file. + skiprows : range + Range of rows to skip. + output_directory : str + Path to the output directory. + + Returns + ------- + output_file : str + Path to the output file with the hashed profiles. + """ + current_rows = pd.read_csv(profiles_table, delimiter='\t', dtype=str, + skiprows=skiprows, nrows=nrows, index_col=0) + + # Remove all 'INF-' prefixes, missing data and '*' from identifiers + current_rows = current_rows.apply(im.replace_chars, args=('-')) + + hashed_profiles = [] + for locus in loci_ids: + locus_column = current_rows[locus] + hashed_column = hash_column(locus_column, loci_files[locus], + hashing_function) + hashed_profiles.append(hashed_column) + + hashed_df = pd.concat(hashed_profiles, axis=1) + start = skiprows.stop + stop = skiprows.stop+len(hashed_df)-1 + input_basename = fo.file_basename(profiles_table, False) + output_file = fo.join_paths(output_directory, + ['{0}_{1}-{2}_hashed.tsv'.format(input_basename, start, stop)]) + hashed_df.to_csv(output_file, sep='\t', index=True, header=False) + + return output_file def main(profiles_table, schema_directory, output_directory, hash_type, - cpu_cores, nrows, updated_files, no_inferred): - """Hash allele identifiers in a matrix of allelic profiles. - - Parameters - ---------- - profiles_table : str - Path to a TSV file with allelic profiles determined by the - AlleleCall module. - schema_directory : str - Path to the directory of the schema used to determine the - allelic profiles. - output_directory : str - Path to the output directory. - hash_type : str - Hashing algorithm to use. - cpu_cores : int - Number of CPU cores used by the process. - nrows : int - Divide input file into subsets to process more efficiently. - updated_files : dict - Dictionary with paths to schema FASTA files as keys and paths - to FASTA files updated by allele calling as values. Only used - if `no_inferred` is True. - no_inferred : bool - If the allele calling process did not add inferred alleles to - the schema. 
- """ - # Get hash function - hashing_function = getattr(hashlib, hash_type, None) - if hashing_function is None: - hashing_function = getattr(zlib, hash_type, None) - - if hashing_function is None: - print('{0} hash function is not available in ' - 'hashlib or zlib modules.'.format(hash_type)) - return False - - # Get loci identifiers - with open(profiles_table, 'r') as infile: - header = infile.readline() - loci_ids = header.split()[1:] - - loci_files = {} - for locus in loci_ids: - locus_file = fo.join_paths(schema_directory, [locus]) - # Add .fasta extension if file headers did not include it - if locus_file.endswith('.fasta') is False: - locus_file += '.fasta' - loci_files[locus] = [locus_file] - if locus_file in updated_files and no_inferred is True: - loci_files[locus].append(updated_files[locus_file][0]) - - # Get input/sample identifiers - sample_ids = pd.read_csv(profiles_table, delimiter='\t', - dtype=str, usecols=['FILE']) - - # Write file with header - header_basename = fo.file_basename(profiles_table).replace('.tsv', '_header.tsv') - header_file = fo.join_paths(output_directory, [header_basename]) - fo.write_to_file(header, header_file, 'w', '') - - # Create multiprocessing inputs - multi_inputs = [] - # Divide and process by row chunks - for i in range(0, len(sample_ids), nrows): - multi_inputs.append([profiles_table, loci_ids, loci_files, - hashing_function, nrows, range(1, i+1), - output_directory, hash_profiles]) - - hashed_files = mo.map_async_parallelizer(multi_inputs, mo.function_helper, - cpu_cores) - - # Concatenate all files - output_basename = fo.file_basename(profiles_table).replace('.tsv', '_hashed.tsv') - output_file = fo.join_paths(output_directory, [output_basename]) - fo.concatenate_files([header_file]+hashed_files, output_file) - - # Delete intermediate dataframes - fo.remove_files([header_file]+hashed_files) - - return output_file + cpu_cores, nrows, updated_files, no_inferred): + """Hash allele identifiers in a matrix of allelic profiles. + + Parameters + ---------- + profiles_table : str + Path to a TSV file with allelic profiles determined by the + AlleleCall module. + schema_directory : str + Path to the directory of the schema used to determine the + allelic profiles. + output_directory : str + Path to the output directory. + hash_type : str + Hashing algorithm to use. + cpu_cores : int + Number of CPU cores used by the process. + nrows : int + Divide input file into subsets to process more efficiently. + updated_files : dict + Dictionary with paths to schema FASTA files as keys and paths + to FASTA files updated by allele calling as values. Only used + if `no_inferred` is True. + no_inferred : bool + If the allele calling process did not add inferred alleles to + the schema. 
+ """ + # Get hash function + hashing_function = getattr(hashlib, hash_type, None) + if hashing_function is None: + hashing_function = getattr(zlib, hash_type, None) + + if hashing_function is None: + print('{0} hash function is not available in ' + 'hashlib or zlib modules.'.format(hash_type)) + return False + + # Get loci identifiers + with open(profiles_table, 'r') as infile: + header = infile.readline() + loci_ids = header.split()[1:] + + loci_files = {} + for locus in loci_ids: + locus_file = fo.join_paths(schema_directory, [locus]) + # Add .fasta extension if file headers did not include it + if locus_file.endswith('.fasta') is False: + locus_file += '.fasta' + loci_files[locus] = [locus_file] + if locus_file in updated_files and no_inferred is True: + loci_files[locus].append(updated_files[locus_file][0]) + + # Get input/sample identifiers + sample_ids = pd.read_csv(profiles_table, delimiter='\t', + dtype=str, usecols=['FILE']) + + # Write file with header + header_basename = fo.file_basename(profiles_table).replace('.tsv', '_header.tsv') + header_file = fo.join_paths(output_directory, [header_basename]) + fo.write_to_file(header, header_file, 'w', '') + + # Create multiprocessing inputs + multi_inputs = [] + # Divide and process by row chunks + for i in range(0, len(sample_ids), nrows): + multi_inputs.append([profiles_table, loci_ids, loci_files, + hashing_function, nrows, range(1, i+1), + output_directory, hash_profiles]) + + hashed_files = mo.map_async_parallelizer(multi_inputs, mo.function_helper, + cpu_cores) + + # Concatenate all files + output_basename = fo.file_basename(profiles_table).replace('.tsv', '_hashed.tsv') + output_file = fo.join_paths(output_directory, [output_basename]) + fo.concatenate_files([header_file]+hashed_files, output_file) + + # Delete intermediate dataframes + fo.remove_files([header_file]+hashed_files) + + return output_file diff --git a/CHEWBBACA/utils/profiles_sqlitedb.py b/CHEWBBACA/utils/profiles_sqlitedb.py index 4c99e060..cb4f603b 100755 --- a/CHEWBBACA/utils/profiles_sqlitedb.py +++ b/CHEWBBACA/utils/profiles_sqlitedb.py @@ -26,690 +26,690 @@ import sqlite3 try: - from utils import process_datetime as pd + from utils import process_datetime as pd except ModuleNotFoundError: - from CHEWBBACA.utils import process_datetime as pd + from CHEWBBACA.utils import process_datetime as pd def create_database_file(db_file): - """Create a SQLite database file. - - If the database file already exists, - it will establish and close connection. - - Parameters - ---------- - df_file : str - Path to the SQLite database file. - - Returns - ------- - error : None or sqlite3.OperationalError - None if the SQLite database file was - successfully created, OperationalError - if it could not create/establish connection - """ - conn = None - error = None - try: - # Creates db file if it does not exist - conn = sqlite3.connect(db_file) - except Exception as e: - error = e - finally: - if conn: - conn.close() - - return error + """Create a SQLite database file. + + If the database file already exists, + it will establish and close connection. + + Parameters + ---------- + df_file : str + Path to the SQLite database file. 
+ + Returns + ------- + error : None or sqlite3.OperationalError + None if the SQLite database file was + successfully created, OperationalError + if it could not create/establish connection + """ + conn = None + error = None + try: + # Creates db file if it does not exist + conn = sqlite3.connect(db_file) + except Exception as e: + error = e + finally: + if conn: + conn.close() + + return error def create_connection(db_file): - """Create a database connection to a SQLite database. + """Create a database connection to a SQLite database. - Parameters - ---------- - db_file: str - Path to the SQLite database file. + Parameters + ---------- + db_file: str + Path to the SQLite database file. - Returns - ------- - conn : sqlite3.Connection or sqlite3.OperationalError - SQLite Connection object if connection was - successfull or error if it was not possible - to connect to the database. - """ - try: - conn = sqlite3.connect(db_file) - except Exception as e: - conn = e + Returns + ------- + conn : sqlite3.Connection or sqlite3.OperationalError + SQLite Connection object if connection was + successfull or error if it was not possible + to connect to the database. + """ + try: + conn = sqlite3.connect(db_file) + except Exception as e: + conn = e - return conn + return conn def execute_statement(conn, statement): - """Execute a SQL statement. - - Parameters - ---------- - conn : sqlite3.Connection - SQLite Connection object. - statement : str - SQL statement to execute. - - Returns - ------- - error : None or sqlite3.OperationalError - None if the SQLite database file was - successfully created, OperationalError - if it could not create/establish connection - """ - error = None - try: - c = conn.cursor() - c.execute(statement) - return c - except Exception as e: - error = e - return error + """Execute a SQL statement. + + Parameters + ---------- + conn : sqlite3.Connection + SQLite Connection object. + statement : str + SQL statement to execute. + + Returns + ------- + error : None or sqlite3.OperationalError + None if the SQLite database file was + successfully created, OperationalError + if it could not create/establish connection + """ + error = None + try: + c = conn.cursor() + c.execute(statement) + return c + except Exception as e: + error = e + return error def select_all_rows(db_file, table): - """Retrieve all rows in a table. + """Retrieve all rows in a table. - Parameters - ---------- - db_file : str - Path to the SQLite database file. - table : str - Name of the table. + Parameters + ---------- + db_file : str + Path to the SQLite database file. + table : str + Name of the table. - Returns - ------- - rows : list of tup - List of all rows in the table. Each row - is represented by a tuple with the values - for all columns. - """ - conn = create_connection(db_file) - cur = conn.cursor() - cur.execute('SELECT * FROM {0}'.format(table)) + Returns + ------- + rows : list of tup + List of all rows in the table. Each row + is represented by a tuple with the values + for all columns. + """ + conn = create_connection(db_file) + cur = conn.cursor() + cur.execute('SELECT * FROM {0}'.format(table)) - result = cur.fetchall() + result = cur.fetchall() - rows = [r for r in result] + rows = [r for r in result] - conn.close() + conn.close() - return rows + return rows def create_insert_statement(table, columns, placeholders): - """Create a base SQL insert statement. - - Parameters - ---------- - table : str - Name of the table. 
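# Minimal usage sketch for the connection helpers above (in-memory database):
conn = create_connection(':memory:')
cursor = execute_statement(conn, 'CREATE TABLE IF NOT EXISTS loci (id INTEGER PRIMARY KEY, name TEXT);')
conn.commit()
conn.close()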
- columns : list - List with the names of the columns - that values will be inserted into. - - Returns - ------- - statement : str - SQL insert statement that can be - used to insert values into `columns` - of a `table`. - """ - statement = ('INSERT OR IGNORE INTO {0}({1}) ' - 'VALUES({2});'.format(table, ','.join(columns), - ','.join(placeholders))) - - return statement + """Create a base SQL insert statement. + + Parameters + ---------- + table : str + Name of the table. + columns : list + List with the names of the columns + that values will be inserted into. + + Returns + ------- + statement : str + SQL insert statement that can be + used to insert values into `columns` + of a `table`. + """ + statement = ('INSERT OR IGNORE INTO {0}({1}) ' + 'VALUES({2});'.format(table, ','.join(columns), + ','.join(placeholders))) + + return statement def insert_loci(db_file, matrix_file): - """Insert loci into the loci table. + """Insert loci into the loci table. - Parameters - ---------- - db_file : str - Path to the SQLite database file. - matrix_file : str - Path to the TSV file with a matrix - of allelic profiles. + Parameters + ---------- + db_file : str + Path to the SQLite database file. + matrix_file : str + Path to the TSV file with a matrix + of allelic profiles. - Returns - ------- - The number of loci that were insert - into the table. - """ - matrix_lines = read_matrix(matrix_file) - loci_list = [locus.rstrip('.fasta') for locus in matrix_lines[0][1:]] + Returns + ------- + The number of loci that were insert + into the table. + """ + matrix_lines = read_matrix(matrix_file) + loci_list = [locus.rstrip('.fasta') for locus in matrix_lines[0][1:]] - conn = create_connection(db_file) - locus_sql = create_insert_statement('loci', ['name'], - ['?']) + conn = create_connection(db_file) + locus_sql = create_insert_statement('loci', ['name'], + ['?']) - loci = [(locus,) for locus in loci_list] + loci = [(locus,) for locus in loci_list] - cur = conn.cursor() - cur.executemany(locus_sql, loci) + cur = conn.cursor() + cur.executemany(locus_sql, loci) - conn.commit() - conn.close() + conn.commit() + conn.close() - return len(loci_list) + return len(loci_list) def insert_multiple(db_file, base_statement, data): - """Execute several insert statements. - - Parameters - ---------- - df_file : str - Path to the SQLite database file. - base_statement : str - Base SQL insert statement to execute. - data : list of tup - A list with tuples that contain the - column values to insert for each row. - - Returns - ------- - error : None or sqlite3.OperationalError - None if the SQL statement was successfully - inserted, SQLite OperationalError otherwise. - """ - error = None - try: - conn = create_connection(db_file) - cur = conn.cursor() - cur.executemany(base_statement, data) - conn.commit() - conn.close() - except Exception as e: - error = e - - return error + """Execute several insert statements. + + Parameters + ---------- + df_file : str + Path to the SQLite database file. + base_statement : str + Base SQL insert statement to execute. + data : list of tup + A list with tuples that contain the + column values to insert for each row. + + Returns + ------- + error : None or sqlite3.OperationalError + None if the SQL statement was successfully + inserted, SQLite OperationalError otherwise. 
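# Worked example of the statement builder above: for the loci table it produces the
# parameterised statement that insert_loci and insert_multiple execute with executemany.
create_insert_statement('loci', ['name'], ['?'])
# 'INSERT OR IGNORE INTO loci(name) VALUES(?);'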
+ """ + error = None + try: + conn = create_connection(db_file) + cur = conn.cursor() + cur.executemany(base_statement, data) + conn.commit() + conn.close() + except Exception as e: + error = e + + return error def create_database(db_file): - """Create a SQLite database to store allelic profiles. - - Parameters - ---------- - db_file : str - Path to the SQLite database file. - - Returns - ------- - True if the SQLite database file and tables were - successfully created, SQLite OperationalError otherwise. - """ - message = create_database_file(db_file) - - # samples table - # date format YYYY-MM-DDTHH:MM:SS - sql_samples_table = ('CREATE TABLE IF NOT EXISTS samples (' - 'id INTEGER PRIMARY KEY,' - 'name TEXT NOT NULL,' - 'date TEXT NOT NULL,' - 'profile_id TEXT NOT NULL,' - 'FOREIGN KEY (profile_id) REFERENCES profiles (profile_id)' - ');') - - # loci table - sql_loci_table = ('CREATE TABLE IF NOT EXISTS loci (' - 'id INTEGER PRIMARY KEY,' - 'name TEXT,' - 'annotation TEXT,' - 'custom_annotation TEXT' - ');') - - # profiles table - sql_profiles_table = ('CREATE TABLE IF NOT EXISTS profiles (' - 'profile_id TEXT PRIMARY KEY,' - 'type TEXT,' - 'date TEXT,' - 'profile_json JSON,' - 'subschema_id TEXT,' - 'FOREIGN KEY (subschema_id) REFERENCES subschemas (subschema_id)' - ');') - - # subschemas table - sql_subschemas_table = ('CREATE TABLE IF NOT EXISTS subschemas (' - 'subschema_id TEXT PRIMARY KEY,' - 'loci JSON' # JSON with schema subset - ');') - - # create tables - conn = create_connection(db_file) - - if isinstance(conn, str) is True: - return conn - else: - row = execute_statement(conn, sql_samples_table) - row = execute_statement(conn, sql_loci_table) - row = execute_statement(conn, sql_profiles_table) - row = execute_statement(conn, sql_subschemas_table) - conn.commit() - conn.close() - - return True + """Create a SQLite database to store allelic profiles. + + Parameters + ---------- + db_file : str + Path to the SQLite database file. + + Returns + ------- + True if the SQLite database file and tables were + successfully created, SQLite OperationalError otherwise. + """ + message = create_database_file(db_file) + + # samples table + # date format YYYY-MM-DDTHH:MM:SS + sql_samples_table = ('CREATE TABLE IF NOT EXISTS samples (' + 'id INTEGER PRIMARY KEY,' + 'name TEXT NOT NULL,' + 'date TEXT NOT NULL,' + 'profile_id TEXT NOT NULL,' + 'FOREIGN KEY (profile_id) REFERENCES profiles (profile_id)' + ');') + + # loci table + sql_loci_table = ('CREATE TABLE IF NOT EXISTS loci (' + 'id INTEGER PRIMARY KEY,' + 'name TEXT,' + 'annotation TEXT,' + 'custom_annotation TEXT' + ');') + + # profiles table + sql_profiles_table = ('CREATE TABLE IF NOT EXISTS profiles (' + 'profile_id TEXT PRIMARY KEY,' + 'type TEXT,' + 'date TEXT,' + 'profile_json JSON,' + 'subschema_id TEXT,' + 'FOREIGN KEY (subschema_id) REFERENCES subschemas (subschema_id)' + ');') + + # subschemas table + sql_subschemas_table = ('CREATE TABLE IF NOT EXISTS subschemas (' + 'subschema_id TEXT PRIMARY KEY,' + 'loci JSON' # JSON with schema subset + ');') + + # create tables + conn = create_connection(db_file) + + if isinstance(conn, str) is True: + return conn + else: + row = execute_statement(conn, sql_samples_table) + row = execute_statement(conn, sql_loci_table) + row = execute_statement(conn, sql_profiles_table) + row = execute_statement(conn, sql_subschemas_table) + conn.commit() + conn.close() + + return True def read_matrix(matrix_file): - """Read a TSV file that contains a matrix with allelic profiles. 
+ """Read a TSV file that contains a matrix with allelic profiles. - Parameters - ---------- - matrix_file : str - Path to the TSV file with the matrix. + Parameters + ---------- + matrix_file : str + Path to the TSV file with the matrix. - Returns - ------- - matrix_lines : list of list - A list with all the lines in the TSV file. - """ - with open(matrix_file, 'r') as m: - matrix_lines = list(csv.reader(m, delimiter='\t')) + Returns + ------- + matrix_lines : list of list + A list with all the lines in the TSV file. + """ + with open(matrix_file, 'r') as m: + matrix_lines = list(csv.reader(m, delimiter='\t')) - return matrix_lines + return matrix_lines def get_loci_ids(matrix_lines): - """Extract loci identifiers from a list with allelic profiles. + """Extract loci identifiers from a list with allelic profiles. - Parameters - ---------- - matrix_lines : list of list - A list with all the lines read from a TSV - file that contains a matrix of allelic profiles. + Parameters + ---------- + matrix_lines : list of list + A list with all the lines read from a TSV + file that contains a matrix of allelic profiles. - Returns - ------- - loci_ids : list - List with the identifiers of all loci represented - in the allelic profiles. - """ - loci_ids = [locus.rstrip('.fasta') for locus in matrix_lines[0][1:]] + Returns + ------- + loci_ids : list + List with the identifiers of all loci represented + in the allelic profiles. + """ + loci_ids = [locus.rstrip('.fasta') for locus in matrix_lines[0][1:]] - return loci_ids + return loci_ids def get_sample_ids(matrix_lines): - """Extract sample identifiers from a list with allelic profiles. + """Extract sample identifiers from a list with allelic profiles. - Parameters - ---------- - matrix_lines : list of list - A list with all the lines read from a TSV - file that contains a matrix of allelic profiles. + Parameters + ---------- + matrix_lines : list of list + A list with all the lines read from a TSV + file that contains a matrix of allelic profiles. - Returns - ------- - sample_ids : list - List with the sample identifiers of all allelic - profiles. - """ - sample_ids = [l[0].rstrip('.fasta') for l in matrix_lines[1:]] + Returns + ------- + sample_ids : list + List with the sample identifiers of all allelic + profiles. + """ + sample_ids = [l[0].rstrip('.fasta') for l in matrix_lines[1:]] - return sample_ids + return sample_ids def get_profiles(matrix_lines): - """Extract profiles from a list with allelic profiles. + """Extract profiles from a list with allelic profiles. - Parameters - ---------- - matrix_lines : list of list - A list with all the lines read from a TSV - file that contains a matrix of allelic profiles. + Parameters + ---------- + matrix_lines : list of list + A list with all the lines read from a TSV + file that contains a matrix of allelic profiles. - Returns - ------- - profiles : list of dict - List with one dictionary per allelic profile. - """ - profiles = [] - loci_ids = matrix_lines[0][1:] - for l in matrix_lines[1:]: - lp = l[1:] - profile = {loci_ids[i].rstrip('.fasta'): lp[i] for i in range(len(lp))} - profiles.append(profile) + Returns + ------- + profiles : list of dict + List with one dictionary per allelic profile. + """ + profiles = [] + loci_ids = matrix_lines[0][1:] + for l in matrix_lines[1:]: + lp = l[1:] + profile = {loci_ids[i].rstrip('.fasta'): lp[i] for i in range(len(lp))} + profiles.append(profile) - return profiles + return profiles def remove_inf(profile): - """Remove the 'INF-' prefix from inferred alleles. 
+ """Remove the 'INF-' prefix from inferred alleles. - Parameters - ---------- - profile : list - List with the allele identifiers in an allelic - profile. + Parameters + ---------- + profile : list + List with the allele identifiers in an allelic + profile. - Returns - ------- - clean_profile : list - List with allele identifiers stripped of the - 'INF-' prefix. - """ - clean_profile = [a.lstrip('INF-') if 'INF-' in a else a for a in profile] + Returns + ------- + clean_profile : list + List with allele identifiers stripped of the + 'INF-' prefix. + """ + clean_profile = [a.lstrip('INF-') if 'INF-' in a else a for a in profile] - return clean_profile + return clean_profile def jsonify_profile(profile, loci): - """ - """ - json_profile = '' - for k, v in profile.items(): - # add first entry to JSON only if locus value is not LNF - locus = k - locus_id = loci[locus] - allele_id = v - if len(json_profile) == 0: - if allele_id != 'LNF': - json_profile += '{{"{0}":"{1}"'.format(locus_id, allele_id) - else: - if allele_id != 'LNF': - json_profile += ', "{0}":"{1}"'.format(locus_id, allele_id) - - json_profile += '}' - - return json_profile + """ + """ + json_profile = '' + for k, v in profile.items(): + # add first entry to JSON only if locus value is not LNF + locus = k + locus_id = loci[locus] + allele_id = v + if len(json_profile) == 0: + if allele_id != 'LNF': + json_profile += '{{"{0}":"{1}"'.format(locus_id, allele_id) + else: + if allele_id != 'LNF': + json_profile += ', "{0}":"{1}"'.format(locus_id, allele_id) + + json_profile += '}' + + return json_profile def store_allelecall_results(output_directory, schema_directory): - """ - """ - # add profiles to SQLite database - # parent results folder might have several results folders - results_folders = [os.path.join(output_directory, file) - for file in os.listdir(output_directory) if 'results' in file] - # create datetime objects and sort to get latest - insert_dates = [(file, pd.datetime_obj(file.split('_')[-1], '%Y%m%dT%H%M%S')) - for file in results_folders] - sorted_insert_dates = sorted(insert_dates, key=lambda x: x[1], reverse=True) - results_matrix = os.path.join(sorted_insert_dates[0][0], 'results_alleles.tsv') - insert_date = sorted_insert_dates[0][0].split('_')[-1] - - # verify that database directory exists - database_directory = os.path.join(schema_directory, 'profiles_database') - # create if it does not exist - if os.path.isdir(database_directory) is False: - os.mkdir(database_directory) - - # also need to check for database file - database_file = os.path.join(database_directory, 'profiles.db') - if os.path.isfile(database_file) is False: - print('\nCreating SQLite database to store profiles...', end='') - try: - create_database(database_file) - # insert loci list into loci table - total_loci = insert_loci(database_file, results_matrix) - print('done.') - print('Inserted {0} loci into database.'.format(total_loci)) - except Exception as e: - print(e) - print('WARNING: Could not create database file. 
Will not store profiles.') - - # insert whole matrix - updated = False - if os.path.isfile(database_file) is not False: - print('\nSending allelic profiles to SQLite database...', end='') - try: - total_profiles = insert_allelecall_matrix(results_matrix, database_file, insert_date) - print('done.') - print('Inserted {0} profiles ({1} total, {2} total unique).'.format(total_profiles[0], total_profiles[1], total_profiles[2])) - updated = True - except Exception: - print('WARNING: Could not store profiles in local database.') - - return updated + """ + """ + # add profiles to SQLite database + # parent results folder might have several results folders + results_folders = [os.path.join(output_directory, file) + for file in os.listdir(output_directory) if 'results' in file] + # create datetime objects and sort to get latest + insert_dates = [(file, pd.datetime_obj(file.split('_')[-1], '%Y%m%dT%H%M%S')) + for file in results_folders] + sorted_insert_dates = sorted(insert_dates, key=lambda x: x[1], reverse=True) + results_matrix = os.path.join(sorted_insert_dates[0][0], 'results_alleles.tsv') + insert_date = sorted_insert_dates[0][0].split('_')[-1] + + # verify that database directory exists + database_directory = os.path.join(schema_directory, 'profiles_database') + # create if it does not exist + if os.path.isdir(database_directory) is False: + os.mkdir(database_directory) + + # also need to check for database file + database_file = os.path.join(database_directory, 'profiles.db') + if os.path.isfile(database_file) is False: + print('\nCreating SQLite database to store profiles...', end='') + try: + create_database(database_file) + # insert loci list into loci table + total_loci = insert_loci(database_file, results_matrix) + print('done.') + print('Inserted {0} loci into database.'.format(total_loci)) + except Exception as e: + print(e) + print('WARNING: Could not create database file. Will not store profiles.') + + # insert whole matrix + updated = False + if os.path.isfile(database_file) is not False: + print('\nSending allelic profiles to SQLite database...', end='') + try: + total_profiles = insert_allelecall_matrix(results_matrix, database_file, insert_date) + print('done.') + print('Inserted {0} profiles ({1} total, {2} total unique).'.format(total_profiles[0], total_profiles[1], total_profiles[2])) + updated = True + except Exception: + print('WARNING: Could not store profiles in local database.') + + return updated def insert_allelecall_matrix(matrix_file, db_file, insert_date): - """Insert profile data from a TSV file into a SQLite db. - - Parameters - ---------- - matrix_file : str - Path to the TSV file with the matrix. - db_file : str - Path to the SQLite database file. - insert_date : str - Date in the name of the folder created - by the AlleleCall process to save results - (in the format Y-m-dTH:M:S). - - Returns - ------- - A list with the following elements: - - - Number of inserted profiles. - - Total number of profiles. - - Number of unique profiles. 
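# Example of how the latest results folder is selected above (hypothetical folder names):
folders = ['results_20240101T101010', 'results_20240711T191432']
dates = [(f, pd.datetime_obj(f.split('_')[-1], '%Y%m%dT%H%M%S')) for f in folders]
sorted(dates, key=lambda x: x[1], reverse=True)[0][0]   # 'results_20240711T191432'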
- """ - loci_list_db = select_all_rows(db_file, 'loci') - loci_map = {t[1]: t[0] for t in loci_list_db} - - # read matrix - matrix_lines = read_matrix(matrix_file) - matrix_lines = [remove_inf(l) for l in matrix_lines] - - # get sample identifiers - sample_ids = get_sample_ids(matrix_lines) - - # get profiles - profiles = get_profiles(matrix_lines) - - # insert profiles - # create JSON format of each profile - profiles_hashes = [] - subschemas_hashes = [] - subschemas_loci = [] - inserted_profiles = 0 - profile_data = [] - for p in profiles: - loci = [str(loci_map[locus]) for locus in p.keys()] - loci_join = ','.join(loci) - loci_hash = hashlib.sha256(loci_join.encode('utf-8')).hexdigest() - if loci_hash not in subschemas_hashes: - subschemas_loci.append(loci_join) - subschemas_hashes.append(loci_hash) - - json_profile = jsonify_profile(p, loci_map) - - profile_hash = hashlib.sha256(json_profile.encode('utf-8')).hexdigest() - profiles_hashes.append(profile_hash) - - profile_data.append((profile_hash, insert_date, - json_profile, loci_hash)) - - # insert all profiles - conn = create_connection(db_file) - profile_statement = create_insert_statement('profiles', - ['profile_id', 'date', - 'profile_json', 'subschema_id'], - ['?', '?', 'json(?)', '?']) - previous_rcount = execute_statement(conn, "SELECT COUNT(*) " - "FROM profiles;").fetchone()[0] - profiles_res = insert_multiple(db_file, profile_statement, profile_data) - - posterior_rcount = execute_statement(conn, "SELECT COUNT(*) " - "FROM profiles;").fetchone()[0] - inserted_profiles = posterior_rcount - previous_rcount - conn.close() - - # insert subschemas - subschema_statement = create_insert_statement('subschemas', - ['subschema_id', 'loci'], - ['?', '?']) - subschema_data = [(subschemas_hashes[i], subschemas_loci[i]) - for i in range(len(subschemas_hashes))] - - # insert all subschemas - subschema_res = insert_multiple(db_file, subschema_statement, - subschema_data) - - # insert samples - sample_statement = create_insert_statement('samples', - ['name', 'date', 'profile_id'], - ['?', '?', '?']) - samples_data = [(sample_ids[i], insert_date, profiles_hashes[i]) - for i in range(len(sample_ids))] - - # insert all samples - it checks PK uniqueness condition - samples_res = insert_multiple(db_file, sample_statement, samples_data) - - return [inserted_profiles, len(profiles), len(set(profiles_hashes)), - profiles_res, subschema_res, samples_res] + """Insert profile data from a TSV file into a SQLite db. + + Parameters + ---------- + matrix_file : str + Path to the TSV file with the matrix. + db_file : str + Path to the SQLite database file. + insert_date : str + Date in the name of the folder created + by the AlleleCall process to save results + (in the format Y-m-dTH:M:S). + + Returns + ------- + A list with the following elements: + + - Number of inserted profiles. + - Total number of profiles. + - Number of unique profiles. 
+ """ + loci_list_db = select_all_rows(db_file, 'loci') + loci_map = {t[1]: t[0] for t in loci_list_db} + + # read matrix + matrix_lines = read_matrix(matrix_file) + matrix_lines = [remove_inf(l) for l in matrix_lines] + + # get sample identifiers + sample_ids = get_sample_ids(matrix_lines) + + # get profiles + profiles = get_profiles(matrix_lines) + + # insert profiles + # create JSON format of each profile + profiles_hashes = [] + subschemas_hashes = [] + subschemas_loci = [] + inserted_profiles = 0 + profile_data = [] + for p in profiles: + loci = [str(loci_map[locus]) for locus in p.keys()] + loci_join = ','.join(loci) + loci_hash = hashlib.sha256(loci_join.encode('utf-8')).hexdigest() + if loci_hash not in subschemas_hashes: + subschemas_loci.append(loci_join) + subschemas_hashes.append(loci_hash) + + json_profile = jsonify_profile(p, loci_map) + + profile_hash = hashlib.sha256(json_profile.encode('utf-8')).hexdigest() + profiles_hashes.append(profile_hash) + + profile_data.append((profile_hash, insert_date, + json_profile, loci_hash)) + + # insert all profiles + conn = create_connection(db_file) + profile_statement = create_insert_statement('profiles', + ['profile_id', 'date', + 'profile_json', 'subschema_id'], + ['?', '?', 'json(?)', '?']) + previous_rcount = execute_statement(conn, "SELECT COUNT(*) " + "FROM profiles;").fetchone()[0] + profiles_res = insert_multiple(db_file, profile_statement, profile_data) + + posterior_rcount = execute_statement(conn, "SELECT COUNT(*) " + "FROM profiles;").fetchone()[0] + inserted_profiles = posterior_rcount - previous_rcount + conn.close() + + # insert subschemas + subschema_statement = create_insert_statement('subschemas', + ['subschema_id', 'loci'], + ['?', '?']) + subschema_data = [(subschemas_hashes[i], subschemas_loci[i]) + for i in range(len(subschemas_hashes))] + + # insert all subschemas + subschema_res = insert_multiple(db_file, subschema_statement, + subschema_data) + + # insert samples + sample_statement = create_insert_statement('samples', + ['name', 'date', 'profile_id'], + ['?', '?', '?']) + samples_data = [(sample_ids[i], insert_date, profiles_hashes[i]) + for i in range(len(sample_ids))] + + # insert all samples - it checks PK uniqueness condition + samples_res = insert_multiple(db_file, sample_statement, samples_data) + + return [inserted_profiles, len(profiles), len(set(profiles_hashes)), + profiles_res, subschema_res, samples_res] def select_outdated(loci, reassigned, cursor): - """Retrive the allelic profiles with outdated allele identifiers. - - Parameters - ---------- - loci : dict - A dictionary with the numeric identifiers - of loci as keys and the integer identifier - of each locus in the local SQLite database. - reassigned : dict - A dictionary with loci identifiers as keys. - Each key has a dictionary as value, with the - current alleles identifiers as keys and the - updated identifiers as values. - cursor : sqlite3.Cursor - SQLite cursor object. - - Returns - ------- - profiles : dict of list - A dictionary with profiles hashes as keys and - lists as values. Each list contains the profile - as str type and a variable number of tuples, - one tuple per allele identifier that must be - updated (tuples contain the locus identifier, - the outdated allele identifier and the updated - allele identifier). 
- """ - profiles = {} - for locus, alleles in reassigned.items(): - locus_id = loci[locus.split('-')[-1].rstrip('.fasta').lstrip('0')] - for a1, a2 in alleles.items(): - # good idea to implement way to run many queries at once - query = ("SELECT profiles.profile_id, profiles.profile_json " - "FROM profiles " - "WHERE json_extract(profiles.profile_json, '$.{0}') = '{1}';".format(locus_id, a1)) - cursor.execute(query) - rows = [r for r in cursor.fetchall()] - for r in rows: - profile_hash = r[0] - profile = r[1] - if profile_hash in profiles: - profiles[profile_hash].append((locus_id, a1, a2)) - else: - profiles[profile_hash] = [profile, (locus_id, a1, a2)] - - return profiles + """Retrive the allelic profiles with outdated allele identifiers. + + Parameters + ---------- + loci : dict + A dictionary with the numeric identifiers + of loci as keys and the integer identifier + of each locus in the local SQLite database. + reassigned : dict + A dictionary with loci identifiers as keys. + Each key has a dictionary as value, with the + current alleles identifiers as keys and the + updated identifiers as values. + cursor : sqlite3.Cursor + SQLite cursor object. + + Returns + ------- + profiles : dict of list + A dictionary with profiles hashes as keys and + lists as values. Each list contains the profile + as str type and a variable number of tuples, + one tuple per allele identifier that must be + updated (tuples contain the locus identifier, + the outdated allele identifier and the updated + allele identifier). + """ + profiles = {} + for locus, alleles in reassigned.items(): + locus_id = loci[locus.split('-')[-1].rstrip('.fasta').lstrip('0')] + for a1, a2 in alleles.items(): + # good idea to implement way to run many queries at once + query = ("SELECT profiles.profile_id, profiles.profile_json " + "FROM profiles " + "WHERE json_extract(profiles.profile_json, '$.{0}') = '{1}';".format(locus_id, a1)) + cursor.execute(query) + rows = [r for r in cursor.fetchall()] + for r in rows: + profile_hash = r[0] + profile = r[1] + if profile_hash in profiles: + profiles[profile_hash].append((locus_id, a1, a2)) + else: + profiles[profile_hash] = [profile, (locus_id, a1, a2)] + + return profiles def alter_profiles(profiles, cursor): - """Update allele identifiers in allelic profiles. - - Parameters - ---------- - profiles : dict - A dictionary with profiles hashes as keys and - lists as values. Each list contains the profile - as str type and a variable number of tuples, - one tuple per allele identifier that must be - updated (tuples contain the locus identifier, - the outdated allele identifier and the updated - allele identifier). - cursor : sqlite3.Cursor - SQLite cursor object. - - Returns - ------- - results : dict of str - A dictionary with profiles hashes as keys and - updated profiles as values. - """ - results = {} - for k, v in profiles.items(): - profile = v[0] - to_replace = ["'$.{0}', '{1}'".format(r[0], r[2]) for r in v[1:]] - # json_replace cannot receive a great number of arguments - # change 50 identifiers at a time - for i in range(0, len(to_replace), 50): - to_replace_txt = ','.join(to_replace[i:i+50]) - query = ("SELECT json_replace('{0}', {1}) " - "FROM profiles;".format(profile, to_replace_txt)) - cursor.execute(query) - res = cursor.fetchall() - profile = res[0][0] - results[k] = profile - - return results + """Update allele identifiers in allelic profiles. + + Parameters + ---------- + profiles : dict + A dictionary with profiles hashes as keys and + lists as values. 
Each list contains the profile + as str type and a variable number of tuples, + one tuple per allele identifier that must be + updated (tuples contain the locus identifier, + the outdated allele identifier and the updated + allele identifier). + cursor : sqlite3.Cursor + SQLite cursor object. + + Returns + ------- + results : dict of str + A dictionary with profiles hashes as keys and + updated profiles as values. + """ + results = {} + for k, v in profiles.items(): + profile = v[0] + to_replace = ["'$.{0}', '{1}'".format(r[0], r[2]) for r in v[1:]] + # json_replace cannot receive a great number of arguments + # change 50 identifiers at a time + for i in range(0, len(to_replace), 50): + to_replace_txt = ','.join(to_replace[i:i+50]) + query = ("SELECT json_replace('{0}', {1}) " + "FROM profiles;".format(profile, to_replace_txt)) + cursor.execute(query) + res = cursor.fetchall() + profile = res[0][0] + results[k] = profile + + return results def update_profiles(schema_directory, reassigned): - """Update allele identifiers in allelic profiles. - - Parameters - ---------- - schema_directory : str - Path to the directory with the schema files. - reassigned : dict of dict - A dictionary with loci identifiers as keys. - Each key has a dictionary as value, with the - current alleles identifiers as keys and the - updated identifiers as values. - - Returns - ------- - The number of profiles that were altered. - """ - - db_file = os.path.join(schema_directory, 'profiles_database', - 'profiles.db') - - if os.path.isfile(db_file) is False: - return None - - # create connection to db - conn = create_connection(db_file) - cursor = conn.cursor() - # get list of loci - loci_list = select_all_rows(db_file, 'loci') - ################ - ################ - ################ - ################ - # this does not seem right! loci identifiers are being truncated? - # maybe because it was designed to work with identifiers from Chewie-NS? -_- - # profiles should only be updated for schemas downloaded from the NS anyway... - loci = {l[1].split('-')[-1].lstrip('0'): l[0] for l in loci_list} - # get profiles with identifiers that have to be changed - profiles = select_outdated(loci, reassigned, cursor) - - # change identifiers in profiles - updated_profiles = alter_profiles(profiles, cursor) - - conn.close() - - # change values in profiles table - query = "update profiles set profile_json = ? where profile_id = ?;" - params = [[p, h] for h, p in updated_profiles.items()] - insert_multiple(db_file, query, params) - - return len(profiles) + """Update allele identifiers in allelic profiles. + + Parameters + ---------- + schema_directory : str + Path to the directory with the schema files. + reassigned : dict of dict + A dictionary with loci identifiers as keys. + Each key has a dictionary as value, with the + current alleles identifiers as keys and the + updated identifiers as values. + + Returns + ------- + The number of profiles that were altered. + """ + + db_file = os.path.join(schema_directory, 'profiles_database', + 'profiles.db') + + if os.path.isfile(db_file) is False: + return None + + # create connection to db + conn = create_connection(db_file) + cursor = conn.cursor() + # get list of loci + loci_list = select_all_rows(db_file, 'loci') + ################ + ################ + ################ + ################ + # this does not seem right! loci identifiers are being truncated? + # maybe because it was designed to work with identifiers from Chewie-NS? 
-_- + # profiles should only be updated for schemas downloaded from the NS anyway... + loci = {l[1].split('-')[-1].lstrip('0'): l[0] for l in loci_list} + # get profiles with identifiers that have to be changed + profiles = select_outdated(loci, reassigned, cursor) + + # change identifiers in profiles + updated_profiles = alter_profiles(profiles, cursor) + + conn.close() + + # change values in profiles table + query = "update profiles set profile_json = ? where profile_id = ?;" + params = [[p, h] for h, p in updated_profiles.items()] + insert_multiple(db_file, query, params) + + return len(profiles) #schema_directory = '/home/rfm/Desktop/rfm/Lab_Software/senterica_schema/senterica_INNUENDO_cgMLST' diff --git a/CHEWBBACA/utils/sequence_clustering.py b/CHEWBBACA/utils/sequence_clustering.py index 6934046e..5cf90823 100644 --- a/CHEWBBACA/utils/sequence_clustering.py +++ b/CHEWBBACA/utils/sequence_clustering.py @@ -16,510 +16,510 @@ from collections import Counter try: - from utils import (file_operations as fo, - iterables_manipulation as im, - blast_wrapper as bw, - fasta_operations as fao) + from utils import (file_operations as fo, + iterables_manipulation as im, + blast_wrapper as bw, + fasta_operations as fao) except ModuleNotFoundError: - from CHEWBBACA.utils import (file_operations as fo, - iterables_manipulation as im, - blast_wrapper as bw, - fasta_operations as fao) + from CHEWBBACA.utils import (file_operations as fo, + iterables_manipulation as im, + blast_wrapper as bw, + fasta_operations as fao) def select_representatives(kmers, reps_groups, clustering_sim): - """Determine the clusters a sequence can be added to. - - Determines the set of clusters that a sequence can be - added to based on the decimal proportion of shared - distinct k-mers. - - Parameters - ---------- - kmers : list or set - Set of k-mers determined by decomposing a single - sequence. - reps_groups : dict - Dictionary with k-mers as keys and sequence - identifiers of sequences that contain each - k-mer as values. - clustering_sim : float - Sequences are added to clusters if they - share a minimum decimal proportion of - distinct k-mers with a cluster representative. - - Returns - ------- - selected_reps : list - List with a tuple per cluster/representative - that the sequence can be added to. Each tuple - has the identifier of the cluster representative - and the decimal proportion of shared distinct - kmers. - """ - current_reps = [reps_groups[k] for k in kmers if k in reps_groups] - current_reps = im.flatten_list(current_reps) - - # count number of kmer hits per representative - counts = Counter(current_reps) - selected_reps = [(k, v/len(kmers)) - for k, v in counts.items() - if v/len(kmers) >= clustering_sim] - - # sort by identifier and then by similarity to always get same order - selected_reps = sorted(selected_reps, key=lambda x: x[0]) - selected_reps = sorted(selected_reps, key=lambda x: x[1], reverse=True) - - return selected_reps + """Determine the clusters a sequence can be added to. + + Determines the set of clusters that a sequence can be + added to based on the decimal proportion of shared + distinct k-mers. + + Parameters + ---------- + kmers : list or set + Set of k-mers determined by decomposing a single + sequence. + reps_groups : dict + Dictionary with k-mers as keys and sequence + identifiers of sequences that contain each + k-mer as values. 
+ clustering_sim : float + Sequences are added to clusters if they + share a minimum decimal proportion of + distinct k-mers with a cluster representative. + + Returns + ------- + selected_reps : list + List with a tuple per cluster/representative + that the sequence can be added to. Each tuple + has the identifier of the cluster representative + and the decimal proportion of shared distinct + kmers. + """ + current_reps = [reps_groups[k] for k in kmers if k in reps_groups] + current_reps = im.flatten_list(current_reps) + + # count number of kmer hits per representative + counts = Counter(current_reps) + selected_reps = [(k, v/len(kmers)) + for k, v in counts.items() + if v/len(kmers) >= clustering_sim] + + # sort by identifier and then by similarity to always get same order + selected_reps = sorted(selected_reps, key=lambda x: x[0]) + selected_reps = sorted(selected_reps, key=lambda x: x[1], reverse=True) + + return selected_reps def intra_cluster_sim(clusters, sequences, word_size, intra_filter): - """Identify clustered sequences that are similar. - - Determines the percentage of shared k-mers/minimizers - between sequences in the same cluster and excludes - sequences that are similar to other sequences in the - cluster. - - Parameters - ---------- - clusters : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - a list with tuples as values. Each tuple has - the identifier of a sequence that was added to - the cluster, the percentage of shared - kmers/minimizers and the length of the clustered - sequence. - sequences : dict - Dictionary with sequence identifiers as keys - and sequences as values. - word_size : int - Value k for the kmer size. - intra_filter : float - Similarity threshold value. If two sequences in - the same cluster have a similarity value equal - or greater to this value, the shorter sequence - will be excluded from the cluster. - - Returns - ------- - excluded_seqids : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - a list with the sequence identifiers of sequences - that were excluded from the cluster as values. - excluded_sims : dict - A dictionary with the identifiers of sequences - that are cluster representatives as keys - and a list with tuples as values. Each tuple - contains the sequence identifier and the - similarity value for a match that led to - an exclusion. 
- """ - excluded_seqids = {} - excluded_sims = {} - for representative, clustered in clusters.items(): - # get identifiers of sequences in the cluster - clustered_ids = [c[0] for c in clustered] - clustered_seqs = {seqid: sequences[seqid] for seqid in clustered_ids} - - # create kmer index - kmers_mapping = im.kmer_index(clustered_seqs, word_size, fasta=False) - - excluded = [] - similarities = [] - # for each sequence in the cluster - for seqid, sequence in clustered_seqs.items(): - if seqid not in excluded: - query_kmers = im.determine_minimizers(sequence, word_size, - word_size, position=False) - - # select sequences with same kmers - sims = select_representatives(query_kmers, - kmers_mapping, - intra_filter) - - if len(sims) > 1: - # exclude current sequence - candidates = [s for s in sims if s[0] != seqid] - for c in candidates: - # if query sequence is longer, keep it and exclude candidate - if len(clustered_seqs[seqid]) >= len(clustered_seqs[c[0]]): - picked = (c[0], seqid, c[1]) - # otherwise, exclude query sequence - elif len(clustered_seqs[c[0]]) > len(clustered_seqs[seqid]): - picked = (seqid, c[0], c[1]) - similarities.append(picked) - excluded.append(picked[0]) - - # convert into set first to remove possible duplicates - excluded_seqids[representative] = list(set(excluded)) - excluded_sims[representative] = similarities - - return [excluded_seqids, excluded_sims] + """Identify clustered sequences that are similar. + + Determines the percentage of shared k-mers/minimizers + between sequences in the same cluster and excludes + sequences that are similar to other sequences in the + cluster. + + Parameters + ---------- + clusters : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + a list with tuples as values. Each tuple has + the identifier of a sequence that was added to + the cluster, the percentage of shared + kmers/minimizers and the length of the clustered + sequence. + sequences : dict + Dictionary with sequence identifiers as keys + and sequences as values. + word_size : int + Value k for the kmer size. + intra_filter : float + Similarity threshold value. If two sequences in + the same cluster have a similarity value equal + or greater to this value, the shorter sequence + will be excluded from the cluster. + + Returns + ------- + excluded_seqids : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + a list with the sequence identifiers of sequences + that were excluded from the cluster as values. + excluded_sims : dict + A dictionary with the identifiers of sequences + that are cluster representatives as keys + and a list with tuples as values. Each tuple + contains the sequence identifier and the + similarity value for a match that led to + an exclusion. 
+ """ + excluded_seqids = {} + excluded_sims = {} + for representative, clustered in clusters.items(): + # get identifiers of sequences in the cluster + clustered_ids = [c[0] for c in clustered] + clustered_seqs = {seqid: sequences[seqid] for seqid in clustered_ids} + + # create kmer index + kmers_mapping = im.kmer_index(clustered_seqs, word_size, fasta=False) + + excluded = [] + similarities = [] + # for each sequence in the cluster + for seqid, sequence in clustered_seqs.items(): + if seqid not in excluded: + query_kmers = im.determine_minimizers(sequence, word_size, + word_size, position=False) + + # select sequences with same kmers + sims = select_representatives(query_kmers, + kmers_mapping, + intra_filter) + + if len(sims) > 1: + # exclude current sequence + candidates = [s for s in sims if s[0] != seqid] + for c in candidates: + # if query sequence is longer, keep it and exclude candidate + if len(clustered_seqs[seqid]) >= len(clustered_seqs[c[0]]): + picked = (c[0], seqid, c[1]) + # otherwise, exclude query sequence + elif len(clustered_seqs[c[0]]) > len(clustered_seqs[seqid]): + picked = (seqid, c[0], c[1]) + similarities.append(picked) + excluded.append(picked[0]) + + # convert into set first to remove possible duplicates + excluded_seqids[representative] = list(set(excluded)) + excluded_sims[representative] = similarities + + return [excluded_seqids, excluded_sims] # change name and add parameter to accept k-mer sampling used to cluster? def minimizer_clustering(sorted_sequences, word_size, window_size, position, - offset, clusters, reps_sequences, reps_groups, - seq_num_cluster, clustering_sim, grow): - """Cluster sequences based on shared distinct minimizers. - - Parameters - ---------- - sorted_sequences : dict - Dictionary with sequence identifiers as keys and - sequences as values. Sorted by decreasing sequence - length. - word_size : int - Value k for the kmer size. - window_size : int - Window size used to determine minimizers. - position : bool - If minimizer sequence position should be stored. - offset : int - Value to indicate offset of consecutive kmers. - clusters : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - a list with tuples as values. Each tuple has - the identifier of a sequence that was added to - the cluster, the decimal proportion of shared - distinct minimizers and the length of the clustered - sequence. This dictionary should be empty at the - start of the clustering step during the CreateSchema - process. For the AlleleCall process, the dictionary - should contain the identifiers of the loci - representatives. - reps_sequences : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - their sequences as values. This dictionary should - be empty at the start of the clustering step - during the CreateSchema process. For the AlleleCall - process, the dictionary should contain the - identifiers of the loci representatives as keys - and their sequences as values. - reps_groups : dict - Dictionary with kmers as keys and a list with - identifiers of sequences that contain that kmer - as values. This dictionary should be empty at the - start of the clustering step during the CreateSchema - process. For the AlleleCall process, the dictionary - should contain all kmers contained in the set of - the schema's representatives sequences. - seq_num_cluster : int - Maximum number of clusters that a sequence can be - added to. 
- clustering_sim : float - Similarity threshold to cluster a sequence into - a cluster. - - Returns - ------- - A list with the following elements: - clusters : dict - Dictionary with the identifiers of sequences - that are clusters representatives as keys and - a list with tuples as values. Each tuple has - the identifier of a sequence that was added to - the cluster, the decimal proportion of shared - distinct minimizers and the length of the clustered - sequence. - reps_sequences : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - their sequences as values. - reps_groups : dict - Dictionary with kmers as keys and a list with - identifiers of sequences that contain that kmer - as values. - """ - # several = {} - for protid, protein in sorted_sequences.items(): - - minimizers = im.determine_minimizers(protein, window_size, - word_size, offset=offset, - position=position) - - distinct_minimizers = set(minimizers) - - selected_reps = select_representatives(distinct_minimizers, - reps_groups, - clustering_sim) - - top = (len(selected_reps) - if len(selected_reps) < seq_num_cluster - else seq_num_cluster) - - # Sort to get most similar at index 0 - if len(selected_reps) > 0: - for i in range(0, top): - clusters[selected_reps[i][0]].append((protid, - selected_reps[i][1], - len(protein), - len(minimizers), - len(distinct_minimizers))) + offset, clusters, reps_sequences, reps_groups, + seq_num_cluster, clustering_sim, grow): + """Cluster sequences based on shared distinct minimizers. + + Parameters + ---------- + sorted_sequences : dict + Dictionary with sequence identifiers as keys and + sequences as values. Sorted by decreasing sequence + length. + word_size : int + Value k for the kmer size. + window_size : int + Window size used to determine minimizers. + position : bool + If minimizer sequence position should be stored. + offset : int + Value to indicate offset of consecutive kmers. + clusters : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + a list with tuples as values. Each tuple has + the identifier of a sequence that was added to + the cluster, the decimal proportion of shared + distinct minimizers and the length of the clustered + sequence. This dictionary should be empty at the + start of the clustering step during the CreateSchema + process. For the AlleleCall process, the dictionary + should contain the identifiers of the loci + representatives. + reps_sequences : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + their sequences as values. This dictionary should + be empty at the start of the clustering step + during the CreateSchema process. For the AlleleCall + process, the dictionary should contain the + identifiers of the loci representatives as keys + and their sequences as values. + reps_groups : dict + Dictionary with kmers as keys and a list with + identifiers of sequences that contain that kmer + as values. This dictionary should be empty at the + start of the clustering step during the CreateSchema + process. For the AlleleCall process, the dictionary + should contain all kmers contained in the set of + the schema's representatives sequences. + seq_num_cluster : int + Maximum number of clusters that a sequence can be + added to. + clustering_sim : float + Similarity threshold to cluster a sequence into + a cluster. 
+ + Returns + ------- + A list with the following elements: + clusters : dict + Dictionary with the identifiers of sequences + that are clusters representatives as keys and + a list with tuples as values. Each tuple has + the identifier of a sequence that was added to + the cluster, the decimal proportion of shared + distinct minimizers and the length of the clustered + sequence. + reps_sequences : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + their sequences as values. + reps_groups : dict + Dictionary with kmers as keys and a list with + identifiers of sequences that contain that kmer + as values. + """ + # several = {} + for protid, protein in sorted_sequences.items(): + + minimizers = im.determine_minimizers(protein, window_size, + word_size, offset=offset, + position=position) + + distinct_minimizers = set(minimizers) + + selected_reps = select_representatives(distinct_minimizers, + reps_groups, + clustering_sim) + + top = (len(selected_reps) + if len(selected_reps) < seq_num_cluster + else seq_num_cluster) + + # Sort to get most similar at index 0 + if len(selected_reps) > 0: + for i in range(0, top): + clusters[selected_reps[i][0]].append((protid, + selected_reps[i][1], + len(protein), + len(minimizers), + len(distinct_minimizers))) #### - # if len(selected_reps) > 1: - # for i in range(0, top): - # several.setdefault(protid, []).append(selected_reps[i][0]) + # if len(selected_reps) > 1: + # for i in range(0, top): + # several.setdefault(protid, []).append(selected_reps[i][0]) #### - else: - if grow is True: - for k in distinct_minimizers: - reps_groups.setdefault(k, []).append(protid) + else: + if grow is True: + for k in distinct_minimizers: + reps_groups.setdefault(k, []).append(protid) - clusters[protid] = [(protid, 1.0, len(protein), - len(minimizers), len(distinct_minimizers))] - reps_sequences[protid] = protein + clusters[protid] = [(protid, 1.0, len(protein), + len(minimizers), len(distinct_minimizers))] + reps_sequences[protid] = protein - # print(several) + # print(several) - return [clusters, reps_sequences, reps_groups] + return [clusters, reps_sequences, reps_groups] def clusterer(sorted_sequences, word_size, window_size, - clustering_sim, representatives, grow, - offset, position, seq_num_cluster): - """Cluster sequences based on shared distinct minimizers. - - Parameters - ---------- - sorted_sequences : dict - Dictionary with sequence identifiers as keys and - sequences as values. Sorted by decreasing sequence - length. - word_size : int - Value k for the kmer size. - window_size : int - Window size used to determine minimizers. - clustering_sim : float - Similarity threshold to cluster a sequence into - a cluster. - representatives : dict - Dictionary with kmers as keys and a list with - identifiers of sequences that contain that kmer - as values. - grow : bool - If it is allowed to create new clusters. - offset : int - Value to indicate offset of consecutive kmers. - position : bool - If minimizer sequence position should be stored. - seq_num_cluster : int - Maximum number of clusters that a sequence can be - added to. - - Returns - ------- - A list with the following elements: - clusters : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - a list with tuples as values. Each tuple has - the identifier of a sequence that was added to - the cluster, the decimal proportion of shared - distinct minimizers and the length of the clustered - sequence. 
- reps_sequences : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - their sequences as values. - """ - clusters = {} - reps_sequences = {} - if representatives is None: - reps_groups = {} - else: - reps_groups = representatives - clusters_ids = set(im.flatten_list(list(reps_groups.values()))) - clusters = {rep: [] for rep in clusters_ids} - - cluster_results = minimizer_clustering(sorted_sequences, word_size, - window_size, position, - offset, clusters, - reps_sequences, reps_groups, - seq_num_cluster, clustering_sim, - grow) - - return cluster_results[0:2] + clustering_sim, representatives, grow, + offset, position, seq_num_cluster): + """Cluster sequences based on shared distinct minimizers. + + Parameters + ---------- + sorted_sequences : dict + Dictionary with sequence identifiers as keys and + sequences as values. Sorted by decreasing sequence + length. + word_size : int + Value k for the kmer size. + window_size : int + Window size used to determine minimizers. + clustering_sim : float + Similarity threshold to cluster a sequence into + a cluster. + representatives : dict + Dictionary with kmers as keys and a list with + identifiers of sequences that contain that kmer + as values. + grow : bool + If it is allowed to create new clusters. + offset : int + Value to indicate offset of consecutive kmers. + position : bool + If minimizer sequence position should be stored. + seq_num_cluster : int + Maximum number of clusters that a sequence can be + added to. + + Returns + ------- + A list with the following elements: + clusters : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + a list with tuples as values. Each tuple has + the identifier of a sequence that was added to + the cluster, the decimal proportion of shared + distinct minimizers and the length of the clustered + sequence. + reps_sequences : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + their sequences as values. + """ + clusters = {} + reps_sequences = {} + if representatives is None: + reps_groups = {} + else: + reps_groups = representatives + clusters_ids = set(im.flatten_list(list(reps_groups.values()))) + clusters = {rep: [] for rep in clusters_ids} + + cluster_results = minimizer_clustering(sorted_sequences, word_size, + window_size, position, + offset, clusters, + reps_sequences, reps_groups, + seq_num_cluster, clustering_sim, + grow) + + return cluster_results[0:2] def write_clusters(clusters, outfile): - """Write information about clusters to file. - - Parameters - ---------- - clusters : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - a list with tuples as values. Each tuple has - the identifier of a sequence that was added to - the cluster, the decimal proportion of shared - distinct kmers/minimizers and the length of the - clustered sequence. - outfile : str - Path to the file that will be created to save - information about clusters. 
- """ - cluster_lines = [] - for rep, seqids in clusters.items(): - current_cluster = [] - current_cluster.append('>{0}'.format(rep)) - clustered = [', '.join(['{}']*len(s)).format(*s) - for s in seqids] - current_cluster.extend(clustered) - cluster_lines.append(current_cluster) - - # sort by number of lines to get clusters with more sequences first - cluster_lines = im.sort_iterable(cluster_lines, - sort_key=lambda x: len(x), reverse=True) - cluster_lines = im.flatten_list(cluster_lines) - cluster_text = im.join_list(cluster_lines, '\n') - - fo.write_to_file(cluster_text, outfile, 'w', '\n') + """Write information about clusters to file. + + Parameters + ---------- + clusters : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + a list with tuples as values. Each tuple has + the identifier of a sequence that was added to + the cluster, the decimal proportion of shared + distinct kmers/minimizers and the length of the + clustered sequence. + outfile : str + Path to the file that will be created to save + information about clusters. + """ + cluster_lines = [] + for rep, seqids in clusters.items(): + current_cluster = [] + current_cluster.append('>{0}'.format(rep)) + clustered = [', '.join(['{}']*len(s)).format(*s) + for s in seqids] + current_cluster.extend(clustered) + cluster_lines.append(current_cluster) + + # sort by number of lines to get clusters with more sequences first + cluster_lines = im.sort_iterable(cluster_lines, + sort_key=lambda x: len(x), reverse=True) + cluster_lines = im.flatten_list(cluster_lines) + cluster_text = im.join_list(cluster_lines, '\n') + + fo.write_to_file(cluster_text, outfile, 'w', '\n') def representative_pruner(clusters, sim_cutoff): - """Remove sequences from clusters based on a similarity threshold. - - Parameters - ---------- - clusters : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - a list with tuples as values. Each tuple has - the identifier of a sequence that was added to - the cluster, the percentage of shared - kmers/minimizers and the length of the clustered - sequence. - sim_cutoff : float - Similarity threshold value. Sequences with - equal or greater similarity value are excluded - from clusters. - - Returns - ------- - A list with the following elements: - pruned_clusters : dict - Input dictionary without values for the - sequences that had similarity values - equal or greater that defined threshold. - excluded : list - List with a list per cluster. Each list - has the sequences that were excluded from - a cluster. - """ - excluded = [] - pruned_clusters = {} - for rep, seqids in clusters.items(): - pruned_clusters[rep] = [seqid - for seqid in seqids - if seqid[1] < sim_cutoff] - # do not exclude cluster representatives - excluded.extend([seqid - for seqid in seqids - if seqid[1] >= sim_cutoff and seqid[0] != rep]) - - return [pruned_clusters, excluded] + """Remove sequences from clusters based on a similarity threshold. + + Parameters + ---------- + clusters : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + a list with tuples as values. Each tuple has + the identifier of a sequence that was added to + the cluster, the percentage of shared + kmers/minimizers and the length of the clustered + sequence. + sim_cutoff : float + Similarity threshold value. Sequences with + equal or greater similarity value are excluded + from clusters. 
+
+ Returns
+ -------
+ A list with the following elements:
+ pruned_clusters : dict
+ Input dictionary without values for the
+ sequences that had similarity values
+ equal to or greater than the defined threshold.
+ excluded : list
+ List with a list per cluster. Each list
+ has the sequences that were excluded from
+ a cluster.
+ """
+ excluded = []
+ pruned_clusters = {}
+ for rep, seqids in clusters.items():
+ pruned_clusters[rep] = [seqid
+ for seqid in seqids
+ if seqid[1] < sim_cutoff]
+ # do not exclude cluster representatives
+ excluded.extend([seqid
+ for seqid in seqids
+ if seqid[1] >= sim_cutoff and seqid[0] != rep])
+
+ return [pruned_clusters, excluded]


def cluster_blaster(seqids, sequences, output_directory,
- blast_path, blastdb_path, blastdb_aliastool_path, only_rep=False):
- """Align sequences in the same cluster with BLAST.
-
- Parameters
- ----------
- seqids : list
- List with cluster identifiers.
- sequences : str
- Path to the FASTA file with the protein sequences
- in all clusters.
- output_directory : str
- Path to the directory where FASTA files with
- the sequences in each cluster and files with
- BLAST results will be written to.
- blast_path : str
- Path to BLAST executables
- blastdb_path : str
- Path to a BLAST database.
-
- Returns
- -------
- out_files : list
- List with the paths to the files with the BLAST
- results for each cluster.
- """
- indexed_fasta = fao.index_fasta(sequences)
-
- out_files = []
- for cluster in seqids:
- cluster_id = cluster
- ids_file = os.path.join(output_directory,
- '{0}_ids.txt'.format(cluster_id))
-
- fasta_file = os.path.join(output_directory,
- '{0}_protein.fasta'.format(cluster_id))
-
- if only_rep is False:
- with open(ids_file, 'r') as clstr:
- cluster_ids = [l.strip() for l in clstr.readlines()]
- # create file with protein sequences
- fao.get_sequences_by_id(indexed_fasta, cluster_ids, fasta_file)
- else:
- fao.get_sequences_by_id(indexed_fasta, [cluster_id], fasta_file)
-
- blast_output = os.path.join(output_directory,
- '{0}_blastout.tsv'.format(cluster_id))
+ blast_path, blastdb_path, blastdb_aliastool_path, only_rep=False):
+ """Align sequences in the same cluster with BLAST.
+
+ Parameters
+ ----------
+ seqids : list
+ List with cluster identifiers.
+ sequences : str
+ Path to the FASTA file with the protein sequences
+ in all clusters.
+ output_directory : str
+ Path to the directory where FASTA files with
+ the sequences in each cluster and files with
+ BLAST results will be written to.
+ blast_path : str
+ Path to BLAST executables
+ blastdb_path : str
+ Path to a BLAST database.
+
+ Returns
+ -------
+ out_files : list
+ List with the paths to the files with the BLAST
+ results for each cluster.
+ """ + indexed_fasta = fao.index_fasta(sequences) + + out_files = [] + for cluster in seqids: + cluster_id = cluster + ids_file = os.path.join(output_directory, + '{0}_ids.txt'.format(cluster_id)) + + fasta_file = os.path.join(output_directory, + '{0}_protein.fasta'.format(cluster_id)) + + if only_rep is False: + with open(ids_file, 'r') as clstr: + cluster_ids = [l.strip() for l in clstr.readlines()] + # create file with protein sequences + fao.get_sequences_by_id(indexed_fasta, cluster_ids, fasta_file) + else: + fao.get_sequences_by_id(indexed_fasta, [cluster_id], fasta_file) + + blast_output = os.path.join(output_directory, + '{0}_blastout.tsv'.format(cluster_id)) binary_file = f'{ids_file}.bin' blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path, ids_file, binary_file) ids_file = binary_file - # Use subprocess to capture errors and warnings - blast_std = bw.run_blast(blast_path, blastdb_path, fasta_file, - blast_output, 1, 1, - ids_file) + # Use subprocess to capture errors and warnings + blast_std = bw.run_blast(blast_path, blastdb_path, fasta_file, + blast_output, 1, 1, + ids_file) - out_files.append(blast_output) + out_files.append(blast_output) - return out_files + return out_files def blast_seqids(clusters, output_directory, only_rep): - """Create files with the identifiers of the sequences in each cluster. - - Parameters - ---------- - clusters : dict - Dictionary with the identifiers of cluster - representatives as keys and a list with tuples - as values (each tuple has the identifier of a - sequence that is in the cluster, the decimal - proportion of shared minimizers and the length - of that sequence). - output_directory : str - Path to the directory where files with identifiers - will be created. - - Returns - ------- - ids_to_blast : list - List with the identifiers of all clusters. - """ - ids_to_blast = [] - for rep in clusters: - cluster_file = os.path.join(output_directory, f'{rep}_ids.txt') - if only_rep is False: - cluster_ids = [rep] + [seqid[0] for seqid in clusters[rep]] - else: - cluster_ids = [seqid[0] for seqid in clusters[rep]] - - cluster_lines = im.join_list(cluster_ids, '\n') - fo.write_to_file(cluster_lines, cluster_file, 'w', '') - ids_to_blast.append((rep, len(cluster_ids))) - - return ids_to_blast + """Create files with the identifiers of the sequences in each cluster. + + Parameters + ---------- + clusters : dict + Dictionary with the identifiers of cluster + representatives as keys and a list with tuples + as values (each tuple has the identifier of a + sequence that is in the cluster, the decimal + proportion of shared minimizers and the length + of that sequence). + output_directory : str + Path to the directory where files with identifiers + will be created. + + Returns + ------- + ids_to_blast : list + List with the identifiers of all clusters. 
+ """ + ids_to_blast = [] + for rep in clusters: + cluster_file = os.path.join(output_directory, f'{rep}_ids.txt') + if only_rep is False: + cluster_ids = [rep] + [seqid[0] for seqid in clusters[rep]] + else: + cluster_ids = [seqid[0] for seqid in clusters[rep]] + + cluster_lines = im.join_list(cluster_ids, '\n') + fo.write_to_file(cluster_lines, cluster_file, 'w', '') + ids_to_blast.append((rep, len(cluster_ids))) + + return ids_to_blast diff --git a/CHEWBBACA/utils/sequence_manipulation.py b/CHEWBBACA/utils/sequence_manipulation.py index 2f38b6e3..240061cb 100644 --- a/CHEWBBACA/utils/sequence_manipulation.py +++ b/CHEWBBACA/utils/sequence_manipulation.py @@ -16,632 +16,632 @@ from collections import Counter try: - from utils import (constants as ct, - file_operations as fo, - iterables_manipulation as im, - fasta_operations as fao) + from utils import (constants as ct, + file_operations as fo, + iterables_manipulation as im, + fasta_operations as fao) except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - file_operations as fo, - iterables_manipulation as im, - fasta_operations as fao) + from CHEWBBACA.utils import (constants as ct, + file_operations as fo, + iterables_manipulation as im, + fasta_operations as fao) def translate_sequence(dna_str, table_id): - """Translate a DNA sequence using the BioPython package. + """Translate a DNA sequence using the BioPython package. - Parameters - ---------- - dna_str : str - String representing a DNA sequence. - table_id : int - Translation table identifier. + Parameters + ---------- + dna_str : str + String representing a DNA sequence. + table_id : int + Translation table identifier. - Returns - ------- - protseq : Bio.Seq.Seq - Protein sequence created by translating the - input DNA sequence. - """ - myseq_obj = Seq(dna_str) - protseq = Seq.translate(myseq_obj, table=table_id, cds=True) + Returns + ------- + protseq : Bio.Seq.Seq + Protein sequence created by translating the + input DNA sequence. + """ + myseq_obj = Seq(dna_str) + protseq = Seq.translate(myseq_obj, table=table_id, cds=True) - return protseq + return protseq def translate_dna_aux(dna_sequence, method, table_id): - """Attempt to translate a DNA sequence in specified orientation. - - Attempts to translate an input DNA sequence in specified - orientation and stores exceptions when the input sequence - cannot be translated. - - Parameters - ---------- - dna_sequence : str - String representing a DNA sequence. - method : str - Sequence orientation to attempt translation. - table_id : int - Translation table identifier. - - Returns - ------- - If the sequence can be translated: - protseq : Bio.Seq.Seq - Translated DNA sequence. - myseq : str - String representing the DNA sequence in the - orientation used to translate it. - Otherwise, returns string with the description of the - exception that was raised. 
- """ - myseq = dna_sequence - # try to translate original sequence - if method == 'original': - try: - protseq = translate_sequence(myseq, table_id) - except Exception as argh: - return argh - # try to translate the reverse complement - elif method == 'revcomp': - try: - myseq = im.reverse_complement(myseq, ct.DNA_BASES) - protseq = translate_sequence(myseq, table_id) - except Exception as argh: - return argh - # try to translate the reverse - elif method == 'rev': - try: - myseq = im.reverse_str(myseq) - protseq = translate_sequence(myseq, table_id) - except Exception as argh: - return argh - # try to translate the reverse reverse complement - elif method == 'revrevcomp': - try: - myseq = im.reverse_str(myseq) - myseq = im.reverse_complement(myseq, ct.DNA_BASES) - protseq = translate_sequence(myseq, table_id) - except Exception as argh: - return argh - - return [protseq, myseq] + """Attempt to translate a DNA sequence in specified orientation. + + Attempts to translate an input DNA sequence in specified + orientation and stores exceptions when the input sequence + cannot be translated. + + Parameters + ---------- + dna_sequence : str + String representing a DNA sequence. + method : str + Sequence orientation to attempt translation. + table_id : int + Translation table identifier. + + Returns + ------- + If the sequence can be translated: + protseq : Bio.Seq.Seq + Translated DNA sequence. + myseq : str + String representing the DNA sequence in the + orientation used to translate it. + Otherwise, returns string with the description of the + exception that was raised. + """ + myseq = dna_sequence + # try to translate original sequence + if method == 'original': + try: + protseq = translate_sequence(myseq, table_id) + except Exception as argh: + return argh + # try to translate the reverse complement + elif method == 'revcomp': + try: + myseq = im.reverse_complement(myseq, ct.DNA_BASES) + protseq = translate_sequence(myseq, table_id) + except Exception as argh: + return argh + # try to translate the reverse + elif method == 'rev': + try: + myseq = im.reverse_str(myseq) + protseq = translate_sequence(myseq, table_id) + except Exception as argh: + return argh + # try to translate the reverse reverse complement + elif method == 'revrevcomp': + try: + myseq = im.reverse_str(myseq) + myseq = im.reverse_complement(myseq, ct.DNA_BASES) + protseq = translate_sequence(myseq, table_id) + except Exception as argh: + return argh + + return [protseq, myseq] def translate_dna(dna_sequence, table_id, min_len): - """Determine if a DNA sequence is valid and translate it. - - Checks if sequence is valid and attempts to translate - it, calling several functions to ensure that the sequence - only has 'ACTG', is multiple of 3 and that it can be - translated in any of 4 different orientations. Stores - exceptions so that it is possible to understand why the - sequence could not be translated. - - Parameters - ---------- - dna_sequence : str - String representing a DNA sequence. - table_id : int - Translation table identifier. - min_len : int - Minimum sequence length. Sequences shorter - than this value are not translated. - - Returns - ------- - If the sequence can be translated: - sequence : list - List with two elemets, the protein sequence - and the DNA sequence in the correct orientation. - coding_strand : str - The sequence orientation that codes for the - protein. - Otherwise: - exception_str : str - A string containing the exceptions that - explain why the the sequence could not be - translated. 
- """ - original_seq = dna_sequence.upper() - exception_collector = [] - strands = ['sense', 'antisense', 'revsense', 'revantisense'] - translating_methods = ['original', 'revcomp', 'rev', 'revrevcomp'] - - # check if the sequence has ambiguous bases - valid_dna = im.check_str_alphabet(original_seq, ct.DNA_BASES) - if valid_dna is not True: - return 'ambiguous or invalid characters' - - # check if sequence size is multiple of three - valid_length = im.check_str_multiple(original_seq, 3) - if valid_length is not True: - return 'sequence length is not a multiple of 3' - - # check if sequence is not shorter than the accepted minimum length - if len(original_seq) < min_len: - return 'sequence shorter than {0} nucleotides'.format(min_len) - - # try to translate in 4 different orientations - # or reach the conclusion that the sequence cannot be translated - i = 0 - translated = False - while translated is False: - translated_seq = translate_dna_aux(original_seq, translating_methods[i], table_id) - if not isinstance(translated_seq, list): - exception_collector.append('{0}({1})'.format(strands[i], - translated_seq.args[0])) - - i += 1 - if i == len(strands) or isinstance(translated_seq, list) is True: - translated = True - - coding_strand = strands[i-1] - - # if the sequence could be translated, return list with protein and DNA - # sequence in correct orientation - if isinstance(translated_seq, list): - return [translated_seq, coding_strand] - # if it could not be translated, return the string with all exception - # that were collected - else: - exception_str = ','.join(exception_collector) - return exception_str + """Determine if a DNA sequence is valid and translate it. + + Checks if sequence is valid and attempts to translate + it, calling several functions to ensure that the sequence + only has 'ACTG', is multiple of 3 and that it can be + translated in any of 4 different orientations. Stores + exceptions so that it is possible to understand why the + sequence could not be translated. + + Parameters + ---------- + dna_sequence : str + String representing a DNA sequence. + table_id : int + Translation table identifier. + min_len : int + Minimum sequence length. Sequences shorter + than this value are not translated. + + Returns + ------- + If the sequence can be translated: + sequence : list + List with two elemets, the protein sequence + and the DNA sequence in the correct orientation. + coding_strand : str + The sequence orientation that codes for the + protein. + Otherwise: + exception_str : str + A string containing the exceptions that + explain why the the sequence could not be + translated. 
+ """ + original_seq = dna_sequence.upper() + exception_collector = [] + strands = ['sense', 'antisense', 'revsense', 'revantisense'] + translating_methods = ['original', 'revcomp', 'rev', 'revrevcomp'] + + # check if the sequence has ambiguous bases + valid_dna = im.check_str_alphabet(original_seq, ct.DNA_BASES) + if valid_dna is not True: + return 'ambiguous or invalid characters' + + # check if sequence size is multiple of three + valid_length = im.check_str_multiple(original_seq, 3) + if valid_length is not True: + return 'sequence length is not a multiple of 3' + + # check if sequence is not shorter than the accepted minimum length + if len(original_seq) < min_len: + return 'sequence shorter than {0} nucleotides'.format(min_len) + + # try to translate in 4 different orientations + # or reach the conclusion that the sequence cannot be translated + i = 0 + translated = False + while translated is False: + translated_seq = translate_dna_aux(original_seq, translating_methods[i], table_id) + if not isinstance(translated_seq, list): + exception_collector.append('{0}({1})'.format(strands[i], + translated_seq.args[0])) + + i += 1 + if i == len(strands) or isinstance(translated_seq, list) is True: + translated = True + + coding_strand = strands[i-1] + + # if the sequence could be translated, return list with protein and DNA + # sequence in correct orientation + if isinstance(translated_seq, list): + return [translated_seq, coding_strand] + # if it could not be translated, return the string with all exception + # that were collected + else: + exception_str = ','.join(exception_collector) + return exception_str def determine_duplicated_seqs(sequences): - """Create mapping between sequences and sequence identifiers. - - Parameters - ---------- - sequences : dict - Dictionary with sequence identifiers as keys and - sequences as values. - - Returns - ------- - equal_seqs : dict - Dictionary with sequences as keys and sequence - identifiers that are associated with each - sequence as values. - """ - equal_seqs = {} - for seqid, seq in sequences.items(): - # if protein sequence was already added as key - if seq in equal_seqs: - # append new protid - equal_seqs[seq].append(seqid) - # else add new protein sequence as key and protid - # as value - else: - equal_seqs[seq] = [seqid] - - return equal_seqs + """Create mapping between sequences and sequence identifiers. + + Parameters + ---------- + sequences : dict + Dictionary with sequence identifiers as keys and + sequences as values. + + Returns + ------- + equal_seqs : dict + Dictionary with sequences as keys and sequence + identifiers that are associated with each + sequence as values. + """ + equal_seqs = {} + for seqid, seq in sequences.items(): + # if protein sequence was already added as key + if seq in equal_seqs: + # append new protid + equal_seqs[seq].append(seqid) + # else add new protein sequence as key and protid + # as value + else: + equal_seqs[seq] = [seqid] + + return equal_seqs def determine_longest(seqids, sequences): - """Find the longest sequence in a set of sequences. + """Find the longest sequence in a set of sequences. - Parameters - ---------- - seqids : list - List with sequence identifiers. - sequences : dict - Dictionary with sequence identifiers as keys - and sequences as values. + Parameters + ---------- + seqids : list + List with sequence identifiers. + sequences : dict + Dictionary with sequence identifiers as keys + and sequences as values. - Returns - ------- - chosen : str - Sequence identifier of the longest sequence. 
- """ - seqids_tups = [(seqid, sequences[seqid]) for seqid in seqids] - sorted_tups = sorted(seqids_tups, key=lambda x: len(x[1]), reverse=True) - chosen = sorted_tups[0][0] + Returns + ------- + chosen : str + Sequence identifier of the longest sequence. + """ + seqids_tups = [(seqid, sequences[seqid]) for seqid in seqids] + sorted_tups = sorted(seqids_tups, key=lambda x: len(x[1]), reverse=True) + chosen = sorted_tups[0][0] - return chosen + return chosen def determine_mode(values): - """Compute the mode based on a list of integers. + """Compute the mode based on a list of integers. - Parameters - ---------- - values : list - List with integer values. + Parameters + ---------- + values : list + List with integer values. - Returns - ------- - modes : list - The most frequent integer values. - """ - # Determine frequency of each value - counts = Counter(values) - # Order by frequency - most_common = counts.most_common() - # Get first value and any other value with same frequency - modes = [m[0] for m in most_common - if m[1] == most_common[0][1]] + Returns + ------- + modes : list + The most frequent integer values. + """ + # Determine frequency of each value + counts = Counter(values) + # Order by frequency + most_common = counts.most_common() + # Get first value and any other value with same frequency + modes = [m[0] for m in most_common + if m[1] == most_common[0][1]] - return modes + return modes def mode_filter(sequences, size_threshold): - """Find sequences with length that deviates from the mode value. - - Determines the mode from a set of input sequences and - identifies sequences that have a length value smaller - or greater than the mode based on a threshold. - - Parameters - ---------- - sequences : dict - Dictionary with sequence identifiers as keys and - sequences as values. - size_threshold : float - Sequences with +/- this value * mode will be - reported as above or below the mode. - - Returns - ------- - A list with the following variables: - modes : list - List with mode values determined based on the - length of input sequences. - alm : list - List with the sequence identifiers of the - sequences that are above the mode value by - mode*size_threshold. - asm : list - List with the sequence identifiers of the - sequences that are below the mode value by - mode*size_threshold. - seqs_lengths : dict - Dictionary with sequence identifiers as keys - and sequence lengths as values. - """ - # determine length value of all sequences - seqs_lengths = {seqid: len(seq) for seqid, seq in sequences.items()} - - # determine mode/s - modes = determine_mode(list(seqs_lengths.values())) - - # determine top and bot length value limits - max_mode = max(modes) - top_limit = max_mode + (max_mode*size_threshold) - min_mode = min(modes) - bot_limit = min_mode - (min_mode*size_threshold) - - # determine sequences that are below or above limits - alm = [seqid for seqid, length in seqs_lengths.items() - if length > top_limit] - asm = [seqid for seqid, length in seqs_lengths.items() - if length < bot_limit] - - return [modes, alm, asm, seqs_lengths] + """Find sequences with length that deviates from the mode value. + + Determines the mode from a set of input sequences and + identifies sequences that have a length value smaller + or greater than the mode based on a threshold. + + Parameters + ---------- + sequences : dict + Dictionary with sequence identifiers as keys and + sequences as values. 
+ size_threshold : float + Sequences with +/- this value * mode will be + reported as above or below the mode. + + Returns + ------- + A list with the following variables: + modes : list + List with mode values determined based on the + length of input sequences. + alm : list + List with the sequence identifiers of the + sequences that are above the mode value by + mode*size_threshold. + asm : list + List with the sequence identifiers of the + sequences that are below the mode value by + mode*size_threshold. + seqs_lengths : dict + Dictionary with sequence identifiers as keys + and sequence lengths as values. + """ + # determine length value of all sequences + seqs_lengths = {seqid: len(seq) for seqid, seq in sequences.items()} + + # determine mode/s + modes = determine_mode(list(seqs_lengths.values())) + + # determine top and bot length value limits + max_mode = max(modes) + top_limit = max_mode + (max_mode*size_threshold) + min_mode = min(modes) + bot_limit = min_mode - (min_mode*size_threshold) + + # determine sequences that are below or above limits + alm = [seqid for seqid, length in seqs_lengths.items() + if length > top_limit] + asm = [seqid for seqid, length in seqs_lengths.items() + if length < bot_limit] + + return [modes, alm, asm, seqs_lengths] def get_seqs_dicts(fasta_path, gene_id, table_id, min_len, size_threshold): - """Translate DNA sequences in a FASTA file. - - Translates the set of alleles from a gene. Identifies - sequences that cannot be translated according to the - criteria enforced by chewBBACA. - - Parameters - ---------- - fasta_path : str - Path to the FASTA file with DNA sequences. - gene_id : str - Gene identifier. - table_id : int - Translation table identifier. - min_len : int - Minimum sequence length. Sequences shorter - than this value are not translated. - size_threshold : float - Sequences with +/- this value * mode will be - reported as above or below the mode. - - Returns - ------- - List with following elements: - dna_seqs : dict - Dictionary with sequence identifiers as keys - and DNA sequences as values. - prot_seqs : dict - Dictionary with protein identifiers as keys - and Protein sequences as values. Keys are - consecutive integers to enable alignment - with BLASTp without getting exceptions due - to long sequence identifiers. - invalid : list - List with sequence identifiers of alleles - that are not valid because they could not be - translated. - seqids_map : dict - Dictionary with consecutive integers as keys - and original allele identifiers as values. - total_seqs : int - Total number of sequences that was processed - (including invalid alleles). 
- """ - sequences = fao.import_sequences(fasta_path) - - # Translate sequences - translated_seqs = {k: translate_dna(v, table_id, min_len) - for k, v in sequences.items()} - - # Add locus identifier to headers - # Some headers might only have the allele identifier - seqids = list(translated_seqs.keys()) - new_seqids = {} - for i in seqids: - new_id = '{0}_{1}'.format(gene_id, i.split('_')[-1]) - new_seqids[i] = new_id - - # Switch sequence ids - sequences = {new_seqids[k]: v - for k, v in translated_seqs.items()} - - valid = {k: v - for k, v in sequences.items() - if isinstance(v, list) is True} - invalid = [[k, v] - for k, v in sequences.items() - if isinstance(v, list) is False] - - seqid = 1 - seqids_map = {} - dna_seqs = {} - prot_seqs = {} - for k, v in valid.items(): - seqids_map[str(seqid)] = k - dna_seqs[k] = v[0][1] - prot_seqs[str(seqid)] = str(v[0][0]) - seqid += 1 - - if size_threshold is not None and len(prot_seqs) > 0: - # Remove alleles based on length mode and size threshold - modes, alm, asm, alleles_lengths = mode_filter(dna_seqs, size_threshold) - excluded = set(asm + alm) - - dna_seqs = {seqid: seq - for seqid, seq in dna_seqs.items() - if seqid not in excluded} - prot_seqs = {seqid: seq - for seqid, seq in prot_seqs.items() - if seqids_map[seqid] not in excluded} - - modes_concat = ':'.join(map(str, modes)) - st_percentage = int(size_threshold*100) - invalid += [[s, 'allele greater than {0}% locus length mode ' - '({1}>{2})'.format(st_percentage, - alleles_lengths[s], - modes_concat)] - for s in alm] - invalid += [[s, 'allele smaller than {0}% locus length mode ' - '({1}<{2})'.format(st_percentage, - alleles_lengths[s], - modes_concat)] - for s in asm] - - total_seqs = len(translated_seqs) - - return [dna_seqs, prot_seqs, invalid, seqids_map, total_seqs] + """Translate DNA sequences in a FASTA file. + + Translates the set of alleles from a gene. Identifies + sequences that cannot be translated according to the + criteria enforced by chewBBACA. + + Parameters + ---------- + fasta_path : str + Path to the FASTA file with DNA sequences. + gene_id : str + Gene identifier. + table_id : int + Translation table identifier. + min_len : int + Minimum sequence length. Sequences shorter + than this value are not translated. + size_threshold : float + Sequences with +/- this value * mode will be + reported as above or below the mode. + + Returns + ------- + List with following elements: + dna_seqs : dict + Dictionary with sequence identifiers as keys + and DNA sequences as values. + prot_seqs : dict + Dictionary with protein identifiers as keys + and Protein sequences as values. Keys are + consecutive integers to enable alignment + with BLASTp without getting exceptions due + to long sequence identifiers. + invalid : list + List with sequence identifiers of alleles + that are not valid because they could not be + translated. + seqids_map : dict + Dictionary with consecutive integers as keys + and original allele identifiers as values. + total_seqs : int + Total number of sequences that was processed + (including invalid alleles). 
+ """ + sequences = fao.import_sequences(fasta_path) + + # Translate sequences + translated_seqs = {k: translate_dna(v, table_id, min_len) + for k, v in sequences.items()} + + # Add locus identifier to headers + # Some headers might only have the allele identifier + seqids = list(translated_seqs.keys()) + new_seqids = {} + for i in seqids: + new_id = '{0}_{1}'.format(gene_id, i.split('_')[-1]) + new_seqids[i] = new_id + + # Switch sequence ids + sequences = {new_seqids[k]: v + for k, v in translated_seqs.items()} + + valid = {k: v + for k, v in sequences.items() + if isinstance(v, list) is True} + invalid = [[k, v] + for k, v in sequences.items() + if isinstance(v, list) is False] + + seqid = 1 + seqids_map = {} + dna_seqs = {} + prot_seqs = {} + for k, v in valid.items(): + seqids_map[str(seqid)] = k + dna_seqs[k] = v[0][1] + prot_seqs[str(seqid)] = str(v[0][0]) + seqid += 1 + + if size_threshold is not None and len(prot_seqs) > 0: + # Remove alleles based on length mode and size threshold + modes, alm, asm, alleles_lengths = mode_filter(dna_seqs, size_threshold) + excluded = set(asm + alm) + + dna_seqs = {seqid: seq + for seqid, seq in dna_seqs.items() + if seqid not in excluded} + prot_seqs = {seqid: seq + for seqid, seq in prot_seqs.items() + if seqids_map[seqid] not in excluded} + + modes_concat = ':'.join(map(str, modes)) + st_percentage = int(size_threshold*100) + invalid += [[s, 'allele greater than {0}% locus length mode ' + '({1}>{2})'.format(st_percentage, + alleles_lengths[s], + modes_concat)] + for s in alm] + invalid += [[s, 'allele smaller than {0}% locus length mode ' + '({1}<{2})'.format(st_percentage, + alleles_lengths[s], + modes_concat)] + for s in asm] + + total_seqs = len(translated_seqs) + + return [dna_seqs, prot_seqs, invalid, seqids_map, total_seqs] def translate_coding_sequences(seqids, protein_file, sequences_file, - translation_table, minimum_length): - """Translate coding sequences. - - Parameters - ---------- - seqids : list - List with the sequence identifiers of the sequences - to be translated. - protein_file : str - Path to a file to save protein sequences. - sequences_file : str - Path to the FASTA file that contains the DNA sequences. - translation_table : int - Translation table identifier. - minimum_length : int - The minimum sequence length value. - - Returns - ------- - A list with following elements: - invalid_alleles : list - List with one sublist per invalid allele. - Each sublist contains a sequence identifer - and the exception message returned after - attempting translation. - total_seqs : int - Total number of DNA sequences that were - translated. 
- """ - # define limit of records to keep in memory - total_seqs = 0 - prot_lines = [] - line_limit = 10000 - invalid_alleles = [] - cds_index = fao.index_fasta(sequences_file) - for i, seqid in enumerate(seqids): - sequence = str(cds_index.get(seqid).seq) - - translation = translate_dna(sequence, translation_table, minimum_length) - if isinstance(translation, list): - prot_lines.append('>{0}'.format(seqid)) - prot_lines.append(str(translation[0][0])) - total_seqs += 1 - # if returned value is a string, translation failed and - # string contains exceptions - elif isinstance(translation, str): - invalid_alleles.append([seqid, translation]) - - if len(prot_lines)//2 == line_limit or i+1 == len(seqids): - prot_lines = im.join_list(prot_lines, '\n') - fo.write_to_file(prot_lines, protein_file, 'a', '\n') - prot_lines = [] - - return [invalid_alleles, total_seqs] + translation_table, minimum_length): + """Translate coding sequences. + + Parameters + ---------- + seqids : list + List with the sequence identifiers of the sequences + to be translated. + protein_file : str + Path to a file to save protein sequences. + sequences_file : str + Path to the FASTA file that contains the DNA sequences. + translation_table : int + Translation table identifier. + minimum_length : int + The minimum sequence length value. + + Returns + ------- + A list with following elements: + invalid_alleles : list + List with one sublist per invalid allele. + Each sublist contains a sequence identifer + and the exception message returned after + attempting translation. + total_seqs : int + Total number of DNA sequences that were + translated. + """ + # define limit of records to keep in memory + total_seqs = 0 + prot_lines = [] + line_limit = 10000 + invalid_alleles = [] + cds_index = fao.index_fasta(sequences_file) + for i, seqid in enumerate(seqids): + sequence = str(cds_index.get(seqid).seq) + + translation = translate_dna(sequence, translation_table, minimum_length) + if isinstance(translation, list): + prot_lines.append('>{0}'.format(seqid)) + prot_lines.append(str(translation[0][0])) + total_seqs += 1 + # if returned value is a string, translation failed and + # string contains exceptions + elif isinstance(translation, str): + invalid_alleles.append([seqid, translation]) + + if len(prot_lines)//2 == line_limit or i+1 == len(seqids): + prot_lines = im.join_list(prot_lines, '\n') + fo.write_to_file(prot_lines, protein_file, 'a', '\n') + prot_lines = [] + + return [invalid_alleles, total_seqs] def determine_distinct(sequences_file, unique_fasta, map_ids): - """Identify duplicated sequences in a FASTA file. - - Parameters - ---------- - sequences_file : str - Path to a FASTA file. - unique_fasta : str - Path to a FASTA file that will be created to - store distinct sequences. - map_ids : dict - Dictionary with mapping between genome string - identifiers and genome integer identifiers. - - Returns - ------- - pickle_out : str - Pickled file that contains a dictionary with sequence - hashes as keys and a list with pairs of protein - identifiers and genome identifiers as values - """ - out_seqs = [] - duplicates = {} - exhausted = False - # limit of 10000 Fasta records in memory - out_limit = 10000 - seq_generator = fao.sequence_generator(sequences_file) - while exhausted is False: - record = next(seq_generator, None) - if record is not None: - # seq object has to be converted to string - seqid = record.id - sequence = str(record.seq.upper()) - - # use digest() instead of hexdigest() to reduce memory usage? 
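
On the digest()/hexdigest() question raised in the comment above, a small self-contained comparison (this assumes im.hash_sequence wraps hashlib's SHA-256; the actual helper may use a different algorithm):

import hashlib
import sys

sequence = 'ATGACCGAGTTCCTGAAA'
hex_hash = hashlib.sha256(sequence.encode()).hexdigest()  # 64-character str
raw_hash = hashlib.sha256(sequence.encode()).digest()     # 32 raw bytes
print(sys.getsizeof(hex_hash), sys.getsizeof(raw_hash))   # the bytes object is noticeably smaller
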
- seq_hash = im.hash_sequence(sequence) - - # add unseen sequence to Fasta file with distinct sequences - if seq_hash not in duplicates: - recout = fao.fasta_str_record(ct.FASTA_RECORD_TEMPLATE, [seqid, sequence]) - out_seqs.append(recout) - - # add CDS hash as key - # add genome integer identifier and protein identifier to values list - # genome identifier and protein identifier can be used to fetch sequences - genome_id, protid = seqid.split('-protein') - genome_id = map_ids[genome_id] - duplicates.setdefault(seq_hash, []).extend([int(protid), int(genome_id)]) - else: - exhausted = True - - # write Fasta records to file - if len(out_seqs) == out_limit or exhausted is True: - if len(out_seqs) > 0: - out_seqs = im.join_list(out_seqs, '\n') - fo.write_to_file(out_seqs, unique_fasta, 'a', '\n') - # reset list to avoid writing same records multiple times - out_seqs = [] - - # save dictionary with genome integer identifiers per distinct sequence - # to pickle and only return file path to avoid keeping all dicts from - # parallel processes in memory - pickle_out = unique_fasta.replace('.fasta', '.duplicates') - fo.pickle_dumper(duplicates, pickle_out) - - return pickle_out + """Identify duplicated sequences in a FASTA file. + + Parameters + ---------- + sequences_file : str + Path to a FASTA file. + unique_fasta : str + Path to a FASTA file that will be created to + store distinct sequences. + map_ids : dict + Dictionary with mapping between genome string + identifiers and genome integer identifiers. + + Returns + ------- + pickle_out : str + Pickled file that contains a dictionary with sequence + hashes as keys and a list with pairs of protein + identifiers and genome identifiers as values + """ + out_seqs = [] + duplicates = {} + exhausted = False + # limit of 10000 Fasta records in memory + out_limit = 10000 + seq_generator = fao.sequence_generator(sequences_file) + while exhausted is False: + record = next(seq_generator, None) + if record is not None: + # seq object has to be converted to string + seqid = record.id + sequence = str(record.seq.upper()) + + # use digest() instead of hexdigest() to reduce memory usage? + seq_hash = im.hash_sequence(sequence) + + # add unseen sequence to Fasta file with distinct sequences + if seq_hash not in duplicates: + recout = fao.fasta_str_record(ct.FASTA_RECORD_TEMPLATE, [seqid, sequence]) + out_seqs.append(recout) + + # add CDS hash as key + # add genome integer identifier and protein identifier to values list + # genome identifier and protein identifier can be used to fetch sequences + genome_id, protid = seqid.split('-protein') + genome_id = map_ids[genome_id] + duplicates.setdefault(seq_hash, []).extend([int(protid), int(genome_id)]) + else: + exhausted = True + + # write Fasta records to file + if len(out_seqs) == out_limit or exhausted is True: + if len(out_seqs) > 0: + out_seqs = im.join_list(out_seqs, '\n') + fo.write_to_file(out_seqs, unique_fasta, 'a', '\n') + # reset list to avoid writing same records multiple times + out_seqs = [] + + # save dictionary with genome integer identifiers per distinct sequence + # to pickle and only return file path to avoid keeping all dicts from + # parallel processes in memory + pickle_out = unique_fasta.replace('.fasta', '.duplicates') + fo.pickle_dumper(duplicates, pickle_out) + + return pickle_out def determine_small(sequences_file, minimum_length, variation=0): - """Find protein sequences that are shorter than a specified length. - - Parameters - ---------- - sequences_file : str - Path to a FASTA file. 
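
To make the pickled output of determine_distinct easier to interpret, a hypothetical reader for the .duplicates file (the path is made up, and fo.pickle_dumper is assumed to be a thin wrapper around pickle.dump):

import pickle

# hypothetical path; determine_distinct derives it from the unique FASTA file name
with open('temp/distinct_dna.duplicates', 'rb') as infile:
	duplicates = pickle.load(infile)

# each value is a flat list of alternating protein and genome integer identifiers
for seq_hash, ids in duplicates.items():
	pairs = list(zip(ids[0::2], ids[1::2]))  # [(protid, genome_id), ...]
	print(seq_hash, 'seen in', len(pairs), 'CDSs')
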
- minimum_length : int - Sequences with a length value below this value - are considered small. - variation : float - Accept sequences with length variation of up to - minus (`minimum_length`*`variation`). - - Returns - ------- - small_seqids : list - List with the identifiers of small sequences. - """ - variation = minimum_length - (minimum_length*variation) - seq_generator = fao.sequence_generator(sequences_file) - small_seqids = [] - for record in seq_generator: - if len(record.seq) < variation: - small_seqids.append(record.id) - - return small_seqids + """Find protein sequences that are shorter than a specified length. + + Parameters + ---------- + sequences_file : str + Path to a FASTA file. + minimum_length : int + Sequences with a length value below this value + are considered small. + variation : float + Accept sequences with length variation of up to + minus (`minimum_length`*`variation`). + + Returns + ------- + small_seqids : list + List with the identifiers of small sequences. + """ + variation = minimum_length - (minimum_length*variation) + seq_generator = fao.sequence_generator(sequences_file) + small_seqids = [] + for record in seq_generator: + if len(record.seq) < variation: + small_seqids.append(record.id) + + return small_seqids def apply_bsr(blast_results, fasta_file, bsr): - """Find similar sequences based on the BLAST Score Ratio. - - Parameters - ---------- - blast_results : list - List with the path to a file with BLAST - results in tabular format. - fasta_file : str - Path to a FASTA file that contains the - sequences that were aligned. - bsr : float - The BSR value to use as threshold - - Returns - ------- - excluded_alleles : list - List with the identifiers of the sequences - that were highly similar to other sequences. - """ - # Separate self-results from other results - self_scores = {r[0]: r[-1] for r in blast_results if r[0] == r[4]} - blast_results = [r for r in blast_results if r[0] != r[4]] - - # Determine sequence lengths - lengths = {} - for k in self_scores: - record = fasta_file.get(k) - sequence = str(record.seq) - lengths[k] = len(sequence) - - # Exclude based on BSR - excluded_alleles = [] - for result in blast_results: - query = result[0] - target = result[4] - score = result[-1] - if query not in excluded_alleles: - # Determine sequence to exclude based on BSR - try: - self_blast_score = self_scores[query] - query_length = lengths[query] - target_length = lengths[target] - blast_score_ratio = float(score)/float(self_blast_score) - - # Only proceed if target has not been excluded - if blast_score_ratio >= bsr and target not in excluded_alleles: - # Exclude query if target is bigger - if target_length > query_length and query not in excluded_alleles: - excluded_alleles.append(query) - # Exclude target if query is bigger - elif target_length <= query_length: - excluded_alleles.append(target) - # It might not be possible to determine the self-score for some sequences - # This might be related with composition-based stats being enabled - except Exception: - excluded_alleles.append(query) - - return excluded_alleles + """Find similar sequences based on the BLAST Score Ratio. + + Parameters + ---------- + blast_results : list + List with the path to a file with BLAST + results in tabular format. + fasta_file : str + Path to a FASTA file that contains the + sequences that were aligned. 
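
A short worked example of the BLAST Score Ratio test that apply_bsr performs below; the scores are made up for illustration:

# raw self-alignment score of the query and its score against another allele
self_score = 980.0
alignment_score = 850.0
blast_score_ratio = alignment_score / self_score  # ~0.87

bsr_threshold = 0.6
if blast_score_ratio >= bsr_threshold:
	# the pair is treated as highly similar; the longer sequence is kept
	# and the shorter one goes into the exclusion list
	print('exclude the shorter allele (BSR={0:.2f})'.format(blast_score_ratio))
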
+ bsr : float + The BSR value to use as threshold + + Returns + ------- + excluded_alleles : list + List with the identifiers of the sequences + that were highly similar to other sequences. + """ + # Separate self-results from other results + self_scores = {r[0]: r[-1] for r in blast_results if r[0] == r[4]} + blast_results = [r for r in blast_results if r[0] != r[4]] + + # Determine sequence lengths + lengths = {} + for k in self_scores: + record = fasta_file.get(k) + sequence = str(record.seq) + lengths[k] = len(sequence) + + # Exclude based on BSR + excluded_alleles = [] + for result in blast_results: + query = result[0] + target = result[4] + score = result[-1] + if query not in excluded_alleles: + # Determine sequence to exclude based on BSR + try: + self_blast_score = self_scores[query] + query_length = lengths[query] + target_length = lengths[target] + blast_score_ratio = float(score)/float(self_blast_score) + + # Only proceed if target has not been excluded + if blast_score_ratio >= bsr and target not in excluded_alleles: + # Exclude query if target is bigger + if target_length > query_length and query not in excluded_alleles: + excluded_alleles.append(query) + # Exclude target if query is bigger + elif target_length <= query_length: + excluded_alleles.append(target) + # It might not be possible to determine the self-score for some sequences + # This might be related with composition-based stats being enabled + except Exception: + excluded_alleles.append(query) + + return excluded_alleles diff --git a/CHEWBBACA/utils/uniprot_requests.py b/CHEWBBACA/utils/uniprot_requests.py index ce375dce..96d85509 100644 --- a/CHEWBBACA/utils/uniprot_requests.py +++ b/CHEWBBACA/utils/uniprot_requests.py @@ -17,320 +17,320 @@ from SPARQLWrapper import SPARQLWrapper, JSON try: - from utils import (constants as ct, - file_operations as fo, - fasta_operations as fao, - sequence_manipulation as sm) + from utils import (constants as ct, + file_operations as fo, + fasta_operations as fao, + sequence_manipulation as sm) except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - file_operations as fo, - fasta_operations as fao, - sequence_manipulation as sm) + from CHEWBBACA.utils import (constants as ct, + file_operations as fo, + fasta_operations as fao, + sequence_manipulation as sm) UNIPROT_SERVER = SPARQLWrapper(ct.UNIPROT_SPARQL) def website_availability(url): - """Determine if a website is reachable. + """Determine if a website is reachable. - Parameters - ---------- - url : str - URL to a website. + Parameters + ---------- + url : str + URL to a website. - Returns - ------- - reponse : http.client.HTTPResponse or urllib.error.HTTPError - Response with status code 200 if website is reachable - or a different status code if the website was not reachable. - """ - try: - response = urlopen(url) - except Exception as e: - response = e + Returns + ------- + reponse : http.client.HTTPResponse or urllib.error.HTTPError + Response with status code 200 if website is reachable + or a different status code if the website was not reachable. + """ + try: + response = urlopen(url) + except Exception as e: + response = e - return response + return response def select_name(result): - """Extract an annotation from the result of a query to UniProt. - - Parameters - ---------- - result : dict - A dictionary with the results from querying - the UniProt SPARQL endpoint. - - Returns - ------- - A list with the following elements: - selected_name : str - The annotation description. 
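
For context on the parsing in select_name below, the SPARQL endpoint returns JSON that roughly follows the shape mocked here (all values are fabricated, and the import assumes chewBBACA is installed):

from CHEWBBACA.utils.uniprot_requests import select_name

# minimal mock of a SPARQLWrapper JSON result; real responses carry more fields
result = {
	'results': {
		'bindings': [
			{'fname': {'value': 'Chaperone protein DnaK'},
			 'seq': {'value': 'http://purl.uniprot.org/uniprot/P12345'},
			 'label': {'value': 'dnaK'}},
			{'sname2': {'value': 'Uncharacterized protein'},
			 'seq': {'value': 'http://purl.uniprot.org/uniprot/P67890'}}
		]
	}
}

name, url, label = select_name(result)
print(name, url, label)  # the first informative description is selected
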
- selected_url : str - The URL to the UniProt page about the protein record. - selected_label : str - A label that has descriptive value. - """ - url = '' - name = '' - label = '' - selected_name = '' - selected_url = '' - selected_label = '' - - i = 0 - found = False - # Get the entries with results - try: - aux = result['results']['bindings'] - # Response does not contain annotation data - except Exception as e: - aux = {} - - # only check results that are not empty - if len(aux) > 0: - # iterate over all results to find suitable - while found is False: - current_res = aux[i] - res_keys = aux[i].keys() - - # annotation name can be associated - # to different keys - if 'fname' in res_keys: - name = str(current_res['fname']['value']) - elif 'sname2' in res_keys: - name = str(current_res['sname2']['value']) - elif 'label' in res_keys: - name = str(current_res['label']['value']) - - if 'label' in res_keys: - label = str(current_res['label']['value']) - else: - label = name - - # get UniProt URL - if 'uri' in res_keys: - url = str(current_res['seq']['value']) - elif 'seq' in res_keys: - url = str(current_res['seq']['value']) - - if any([term in name for term in ct.UNIPROT_UNINFORMATIVE]) is False: - selected_name = name - selected_url = url - selected_label = label - found = True - else: - if selected_name == '': - selected_name = name - selected_url = url - selected_label = label - - i += 1 - if i == len(aux): - found = True - - return [selected_name, selected_url, selected_label] + """Extract an annotation from the result of a query to UniProt. + + Parameters + ---------- + result : dict + A dictionary with the results from querying + the UniProt SPARQL endpoint. + + Returns + ------- + A list with the following elements: + selected_name : str + The annotation description. + selected_url : str + The URL to the UniProt page about the protein record. + selected_label : str + A label that has descriptive value. + """ + url = '' + name = '' + label = '' + selected_name = '' + selected_url = '' + selected_label = '' + + i = 0 + found = False + # Get the entries with results + try: + aux = result['results']['bindings'] + # Response does not contain annotation data + except Exception as e: + aux = {} + + # only check results that are not empty + if len(aux) > 0: + # iterate over all results to find suitable + while found is False: + current_res = aux[i] + res_keys = aux[i].keys() + + # annotation name can be associated + # to different keys + if 'fname' in res_keys: + name = str(current_res['fname']['value']) + elif 'sname2' in res_keys: + name = str(current_res['sname2']['value']) + elif 'label' in res_keys: + name = str(current_res['label']['value']) + + if 'label' in res_keys: + label = str(current_res['label']['value']) + else: + label = name + + # get UniProt URL + if 'uri' in res_keys: + url = str(current_res['seq']['value']) + elif 'seq' in res_keys: + url = str(current_res['seq']['value']) + + if any([term in name for term in ct.UNIPROT_UNINFORMATIVE]) is False: + selected_name = name + selected_url = url + selected_label = label + found = True + else: + if selected_name == '': + selected_name = name + selected_url = url + selected_label = label + + i += 1 + if i == len(aux): + found = True + + return [selected_name, selected_url, selected_label] def uniprot_query(sequence): - """Construct a SPARQL query to search for exact matches in UniProt. - - Parameters - ---------- - sequence : str - The Protein sequence that will be added to - the query/searched for. 
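
The preference logic above depends on a list of uninformative terms (ct.UNIPROT_UNINFORMATIVE, defined in constants.py and not shown in this diff); a self-contained sketch of the same idea with a hypothetical term list:

# hypothetical terms; the real list lives in CHEWBBACA/utils/constants.py
UNINFORMATIVE = ['uncharacterized', 'hypothetical', 'duf']

candidates = ['Uncharacterized protein', 'DNA gyrase subunit B']
selected = ''
for name in candidates:
	if not any(term in name.lower() for term in UNINFORMATIVE):
		selected = name   # informative description, stop searching
		break
	elif selected == '':
		selected = name   # keep the first hit as a fallback
print(selected)  # DNA gyrase subunit B
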
- - Returns - ------- - query : str - The SPARQL query that will allow to search for - exact matches in the UniProt database. - """ - query = ('PREFIX rdf: ' - 'PREFIX up: ' - 'select ?seq ?fname ?sname2 ?label where {' - '{?b a up:Simple_Sequence; rdf:value ' - '"'+sequence+'". ?seq up:sequence ?b. ' - 'OPTIONAL{?seq up:submittedName ?sname. ?sname up:fullName ?sname2} ' - 'OPTIONAL{?seq up:recommendedName ?rname.?rname up:fullName ?fname} }' - 'UNION{?seq a up:Sequence; rdf:value "'+sequence+'"; ' - 'rdfs:label ?label. }}') - - return query + """Construct a SPARQL query to search for exact matches in UniProt. + + Parameters + ---------- + sequence : str + The Protein sequence that will be added to + the query/searched for. + + Returns + ------- + query : str + The SPARQL query that will allow to search for + exact matches in the UniProt database. + """ + query = ('PREFIX rdf: ' + 'PREFIX up: ' + 'select ?seq ?fname ?sname2 ?label where {' + '{?b a up:Simple_Sequence; rdf:value ' + '"'+sequence+'". ?seq up:sequence ?b. ' + 'OPTIONAL{?seq up:submittedName ?sname. ?sname up:fullName ?sname2} ' + 'OPTIONAL{?seq up:recommendedName ?rname.?rname up:fullName ?fname} }' + 'UNION{?seq a up:Sequence; rdf:value "'+sequence+'"; ' + 'rdfs:label ?label. }}') + + return query def get_data(sparql_query): - """Send requests to query UniProts's SPARQL endpoint. - - Parameters - ---------- - sparql_query : str - SPARQL query. - - Returns - ------- - result : dict - Dictionary with data retrieved from UniProt. - """ - tries = 0 - failed = [] - max_tries = ct.MAX_RETRIES - success = False - while success is False and tries < max_tries: - try: - UNIPROT_SERVER.setQuery(sparql_query) - UNIPROT_SERVER.setReturnFormat(JSON) - UNIPROT_SERVER.setTimeout(60) - result = UNIPROT_SERVER.query().convert() - success = True - # Might fail if sequence is too long for GET method(URITooLong) - except Exception as e: - failed.append(e) - tries += 1 - time.sleep(1) - - if success is False: - result = {} - - return [result, failed] + """Send requests to query UniProts's SPARQL endpoint. + + Parameters + ---------- + sparql_query : str + SPARQL query. + + Returns + ------- + result : dict + Dictionary with data retrieved from UniProt. + """ + tries = 0 + failed = [] + max_tries = ct.MAX_RETRIES + success = False + while success is False and tries < max_tries: + try: + UNIPROT_SERVER.setQuery(sparql_query) + UNIPROT_SERVER.setReturnFormat(JSON) + UNIPROT_SERVER.setTimeout(60) + result = UNIPROT_SERVER.query().convert() + success = True + # Might fail if sequence is too long for GET method(URITooLong) + except Exception as e: + failed.append(e) + tries += 1 + time.sleep(1) + + if success is False: + result = {} + + return [result, failed] def get_proteomes(proteome_ids, output_dir): - """Download reference proteomes from UniProt's FTP. - - Parameters - ---------- - proteomes : list - List with a sublist per proteome to download. - Each sublist has the information about a proteome - that was contained in the README file with the list - of UniProt's reference proteomes. - output_dir : str - Path to the output directory where downloaded - proteomes will be saved to. - - Returns - ------- - proteomes_files : list - Local paths to the downloaded proteomes. 
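
The two PREFIX declarations in the query strings above presumably expand to the standard rdf and UniProt core namespaces; a hedged, standalone sketch of an equivalent query sent through SPARQLWrapper (the endpoint URL and namespace IRIs are assumptions, since ct.UNIPROT_SPARQL is not shown in this diff):

from SPARQLWrapper import SPARQLWrapper, JSON

endpoint = SPARQLWrapper('https://sparql.uniprot.org/sparql')  # assumed value of ct.UNIPROT_SPARQL
sequence = 'MASNKLDT'  # placeholder protein sequence

query = (
	'PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> '
	'PREFIX up:<http://purl.uniprot.org/core/> '
	'SELECT ?seq ?fname ?sname2 ?label WHERE {'
	'{?b a up:Simple_Sequence; rdf:value "' + sequence + '". ?seq up:sequence ?b. '
	'OPTIONAL{?seq up:submittedName ?sname. ?sname up:fullName ?sname2} '
	'OPTIONAL{?seq up:recommendedName ?rname. ?rname up:fullName ?fname} }'
	'UNION{?seq a up:Sequence; rdf:value "' + sequence + '"; rdfs:label ?label.}}')

endpoint.setQuery(query)
endpoint.setReturnFormat(JSON)
endpoint.setTimeout(60)
result = endpoint.query().convert()  # dict with a 'results' -> 'bindings' list
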
- """ - print('Downloading reference proteomes...') - # construct FTP URLs for each proteome - downloaded = 0 - proteomes_files = [] - for pid in proteome_ids: - domain = '{0}{1}'.format(pid[3][0].upper(), pid[3][1:]) - proteome_id = '{0}_{1}'.format(pid[0], pid[1]) - proteome_file = '{0}.fasta.gz'.format(proteome_id) - local_proteome_file = fo.join_paths(output_dir, [proteome_file]) - proteome_url = fo.join_paths(ct.UNIPROT_PROTEOMES_FTP, [domain, pid[0], proteome_file]) - res = fo.download_file(proteome_url, local_proteome_file) - proteomes_files.append(local_proteome_file) - downloaded += 1 - print('\r', 'Downloaded {0}/{1}'.format(downloaded, len(proteome_ids)), end='') - time.sleep(0.1) - - return proteomes_files + """Download reference proteomes from UniProt's FTP. + + Parameters + ---------- + proteomes : list + List with a sublist per proteome to download. + Each sublist has the information about a proteome + that was contained in the README file with the list + of UniProt's reference proteomes. + output_dir : str + Path to the output directory where downloaded + proteomes will be saved to. + + Returns + ------- + proteomes_files : list + Local paths to the downloaded proteomes. + """ + print('Downloading reference proteomes...') + # construct FTP URLs for each proteome + downloaded = 0 + proteomes_files = [] + for pid in proteome_ids: + domain = '{0}{1}'.format(pid[3][0].upper(), pid[3][1:]) + proteome_id = '{0}_{1}'.format(pid[0], pid[1]) + proteome_file = '{0}.fasta.gz'.format(proteome_id) + local_proteome_file = fo.join_paths(output_dir, [proteome_file]) + proteome_url = fo.join_paths(ct.UNIPROT_PROTEOMES_FTP, [domain, pid[0], proteome_file]) + res = fo.download_file(proteome_url, local_proteome_file) + proteomes_files.append(local_proteome_file) + downloaded += 1 + print('\r', 'Downloaded {0}/{1}'.format(downloaded, len(proteome_ids)), end='') + time.sleep(0.1) + + return proteomes_files def get_annotation(gene, translation_table): - """Retrieve and select best annotation for a locus. - - Parameters - ---------- - gene : str - Path to a FASTA file with the DNA sequences for - the alleles of the locus. - translation_table : int - Translation table used to translate DNA sequences. - - Returns - ------- - gene : str - Path to a FASTA file with the DNA sequences for - the alleles of the locus. - selected_name : str - Product name selected from the terms retrieved from UniProt. - selected_url : str - URL for the page with information about the selected terms. - """ - selected_url = '' - selected_name = '' - # Import locus alleles - sequences = fao.import_sequences(gene) - queried = [] - for seqid, sequence in sequences.items(): - # Translate allele - protein_sequence = str(sm.translate_sequence(sequence, - table_id=translation_table)) - - if protein_sequence not in queried: - query = uniprot_query(protein_sequence) - result, failed = get_data(query) - queried.append(protein_sequence) - if len(result) > 0: - name, url, label = select_name(result) - - lowercase_name = name.lower() - if any([term in lowercase_name for term in ct.UNIPROT_UNINFORMATIVE]) is True: - if selected_name == '': - selected_name = name - selected_url = url - continue - elif name == '': - continue - else: - selected_name = name - selected_url = url - break - - return [gene, selected_name, selected_url, failed] + """Retrieve and select best annotation for a locus. + + Parameters + ---------- + gene : str + Path to a FASTA file with the DNA sequences for + the alleles of the locus. 
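
To illustrate how get_proteomes assembles each download URL, a worked example with a fabricated proteome record; the base URL stands in for ct.UNIPROT_PROTEOMES_FTP, which is not part of this diff:

# fields follow the layout used above: [proteome id, taxonomy id, ..., superkingdom]
pid = ['UP000000625', '83333', '4403', 'bacteria']
base_url = ('https://ftp.uniprot.org/pub/databases/uniprot/'
            'current_release/knowledgebase/reference_proteomes')  # assumed

domain = '{0}{1}'.format(pid[3][0].upper(), pid[3][1:])            # 'Bacteria'
proteome_file = '{0}_{1}.fasta.gz'.format(pid[0], pid[1])          # 'UP000000625_83333.fasta.gz'
proteome_url = '/'.join([base_url, domain, pid[0], proteome_file])
print(proteome_url)
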
+ translation_table : int + Translation table used to translate DNA sequences. + + Returns + ------- + gene : str + Path to a FASTA file with the DNA sequences for + the alleles of the locus. + selected_name : str + Product name selected from the terms retrieved from UniProt. + selected_url : str + URL for the page with information about the selected terms. + """ + selected_url = '' + selected_name = '' + # Import locus alleles + sequences = fao.import_sequences(gene) + queried = [] + for seqid, sequence in sequences.items(): + # Translate allele + protein_sequence = str(sm.translate_sequence(sequence, + table_id=translation_table)) + + if protein_sequence not in queried: + query = uniprot_query(protein_sequence) + result, failed = get_data(query) + queried.append(protein_sequence) + if len(result) > 0: + name, url, label = select_name(result) + + lowercase_name = name.lower() + if any([term in lowercase_name for term in ct.UNIPROT_UNINFORMATIVE]) is True: + if selected_name == '': + selected_name = name + selected_url = url + continue + elif name == '': + continue + else: + selected_name = name + selected_url = url + break + + return [gene, selected_name, selected_url, failed] def extract_proteome_terms(header_items): - """Extract information from the header of a proteome record. - - Extracts the sequence identifier, product name, gene name - and species name fields from the sequence header of a - reference proteome from Uniprot. - - Parameters - ---------- - header_items : dict - Dictionary with the keys and values from a Biopython - Bio.SeqRecord.SeqRecord object. Created by passing the - Biopython Bio.SeqRecord.SeqRecord object to the vars - function. - - Returns - ------- - seqid : str - Sequence identifier of the record. Composed - of the db, UniqueIdentifier and EntryName fields. - record_product : str - ProteinName field value in the sequence header. - record_gene_name : str - GeneName field (GN) in the sequence header. - record_species : str - OrganismName field (OS) in the sequence header. - """ - # some tags might be missing - seqid = header_items.get('id', 'not_found') - record_description = header_items.get('description', 'not_found') - if record_description != '': - if 'OS=' in record_description: - # get organism name - record_species = (record_description.split('OS=')[1]).split(' OX=')[0] - record_product = (record_description.split(seqid+' ')[1]).split(' OS=')[0] - else: - record_species = 'not_found' - record_product = 'not_found' - - if 'GN=' in record_description: - record_gene_name = (record_description.split('GN=')[1]).split(' PE=')[0] - else: - record_gene_name = 'not_found' - - return [seqid, record_product, record_gene_name, record_species] + """Extract information from the header of a proteome record. + + Extracts the sequence identifier, product name, gene name + and species name fields from the sequence header of a + reference proteome from Uniprot. + + Parameters + ---------- + header_items : dict + Dictionary with the keys and values from a Biopython + Bio.SeqRecord.SeqRecord object. Created by passing the + Biopython Bio.SeqRecord.SeqRecord object to the vars + function. + + Returns + ------- + seqid : str + Sequence identifier of the record. Composed + of the db, UniqueIdentifier and EntryName fields. + record_product : str + ProteinName field value in the sequence header. + record_gene_name : str + GeneName field (GN) in the sequence header. + record_species : str + OrganismName field (OS) in the sequence header. 
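
For reference, a typical UniProt proteome FASTA header and the substrings that the field checks described here capture (the record itself is fabricated):

header = ('sp|P12345|EXAMPLE_ECOLI Example protein name '
          'OS=Escherichia coli OX=562 GN=exaA PE=1 SV=1')

seqid = header.split(' ')[0]                               # 'sp|P12345|EXAMPLE_ECOLI'
species = header.split('OS=')[1].split(' OX=')[0]          # 'Escherichia coli'
product = header.split(seqid + ' ')[1].split(' OS=')[0]    # 'Example protein name'
gene_name = header.split('GN=')[1].split(' PE=')[0]        # 'exaA'
print(seqid, product, gene_name, species)
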
+ """ + # some tags might be missing + seqid = header_items.get('id', 'not_found') + record_description = header_items.get('description', 'not_found') + if record_description != '': + if 'OS=' in record_description: + # get organism name + record_species = (record_description.split('OS=')[1]).split(' OX=')[0] + record_product = (record_description.split(seqid+' ')[1]).split(' OS=')[0] + else: + record_species = 'not_found' + record_product = 'not_found' + + if 'GN=' in record_description: + record_gene_name = (record_description.split('GN=')[1]).split(' PE=')[0] + else: + record_gene_name = 'not_found' + + return [seqid, record_product, record_gene_name, record_species]
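
A minimal end-to-end usage sketch, assuming chewBBACA and Biopython are installed and a decompressed reference proteome sits at the hypothetical path below:

from Bio import SeqIO
from CHEWBBACA.utils.uniprot_requests import extract_proteome_terms

# hypothetical path to a decompressed reference proteome FASTA file
for record in SeqIO.parse('proteomes/UP000000625_83333.fasta', 'fasta'):
	seqid, product, gene_name, species = extract_proteome_terms(vars(record))
	print(seqid, product, gene_name, species)
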