From 3be2401884ab7add8e8ee8b4b6b5a21e153f4c9d Mon Sep 17 00:00:00 2001 From: rfm-targa Date: Thu, 11 Jul 2024 19:14:32 +0100 Subject: [PATCH] Converting to tabs. --- CHEWBBACA/utils/chewiens_requests.py | 864 ++++----- CHEWBBACA/utils/constants.py | 316 ++-- CHEWBBACA/utils/distance_matrix.py | 738 ++++---- CHEWBBACA/utils/fasttree_wrapper.py | 38 +- CHEWBBACA/utils/file_operations.py | 1050 +++++------ CHEWBBACA/utils/gene_prediction.py | 422 ++--- CHEWBBACA/utils/iterables_manipulation.py | 1646 ++++++++--------- CHEWBBACA/utils/join_profiles.py | 172 +- CHEWBBACA/utils/mafft_wrapper.py | 72 +- CHEWBBACA/utils/multiprocessing_operations.py | 420 ++--- CHEWBBACA/utils/process_datetime.py | 220 +-- CHEWBBACA/utils/profile_hasher.py | 360 ++-- CHEWBBACA/utils/profiles_sqlitedb.py | 1190 ++++++------ CHEWBBACA/utils/sequence_clustering.py | 936 +++++----- CHEWBBACA/utils/sequence_manipulation.py | 1166 ++++++------ CHEWBBACA/utils/uniprot_requests.py | 578 +++--- 16 files changed, 5094 insertions(+), 5094 deletions(-) diff --git a/CHEWBBACA/utils/chewiens_requests.py b/CHEWBBACA/utils/chewiens_requests.py index 6a546217..a798d56a 100644 --- a/CHEWBBACA/utils/chewiens_requests.py +++ b/CHEWBBACA/utils/chewiens_requests.py @@ -19,480 +19,480 @@ from urllib.parse import urlparse, urlencode, urlsplit, parse_qs try: - from utils import constants as ct + from utils import constants as ct except ModuleNotFoundError: - from CHEWBBACA.utils import constants as ct + from CHEWBBACA.utils import constants as ct def simple_get_request(url, headers, endpoint_list=None, - parameters=None, verify=False, timeout=30): - """Request data through a GET method sent to an endpoint. - - Construct an URI for an endpoint and request data through - a GET method sent to that endpoint. - - Parameters - ---------- - url : str - The base URL for a Chewie-NS instance. - headers : dict - HTTP headers for GET requests. - endpoint_list : list - List with elements that will be concatenated to the - base URI to create the URI for the API endpoint. - parameters : dict or list or bytes - Dictionary, list of tuples or bytes to send in the - query string for the Request. - verify : bool - If the SSL certificates should be verified in HTTPS - requests (False for no verification, True otherwise). - timeout : int - Maximum time in seconds to wait for response. - - Returns - ------- - url : str - Complete URI for the API endpoint. - res : requests.models.Response - The response returned through the GET method. - """ - # unpack list of sequential endpoints and pass to create URI - if endpoint_list is not None: - url = make_url(url, *endpoint_list) - - res = requests.get(url, headers=headers, params=parameters, - timeout=timeout, verify=verify) - - # return queried endpoint URI and response - return [url, res] + parameters=None, verify=False, timeout=30): + """Request data through a GET method sent to an endpoint. + + Construct an URI for an endpoint and request data through + a GET method sent to that endpoint. + + Parameters + ---------- + url : str + The base URL for a Chewie-NS instance. + headers : dict + HTTP headers for GET requests. + endpoint_list : list + List with elements that will be concatenated to the + base URI to create the URI for the API endpoint. + parameters : dict or list or bytes + Dictionary, list of tuples or bytes to send in the + query string for the Request. + verify : bool + If the SSL certificates should be verified in HTTPS + requests (False for no verification, True otherwise). 
+ timeout : int + Maximum time in seconds to wait for response. + + Returns + ------- + url : str + Complete URI for the API endpoint. + res : requests.models.Response + The response returned through the GET method. + """ + # unpack list of sequential endpoints and pass to create URI + if endpoint_list is not None: + url = make_url(url, *endpoint_list) + + res = requests.get(url, headers=headers, params=parameters, + timeout=timeout, verify=verify) + + # return queried endpoint URI and response + return [url, res] def simple_post_request(url, headers, endpoint_list=None, - data=None, files=None, verify=False): - """Send data through a POST method. - - Construct an URI for an endpoint and send data through - a POST method. - - Parameters - ---------- - url : str - The base URL for a Chewie-NS instance. - headers : dict - HTTP headers for POST requests. - endpoint_list : list - List with elements that will be concatenated to the - base URI to create the URI for the API endpoint. - data - Data to send. Must be an object that can be JSON - serialized. - files : dict - Dictionary with file data to send (see: - https://requests.readthedocs.io/en/master/user - /quickstart/#post-a-multipart-encoded-file). - verify : bool - If the SSL certificates should be verified in HTTPS - requests (False for no verification, True otherwise). - - Returns - ------- - url : str - Complete URI for the API endpoint. - res : requests.models.Response - The response returned through the POST method. - """ - # unpack list of sequential endpoints and pass to create URI - if endpoint_list is not None: - url = make_url(url, *endpoint_list) - - # if the content is defined as 'application/json' - # the data has to be passed after being processed with - # json.dumps() - res = requests.post(url, headers=headers, data=data, - files=files, verify=verify) - - return [url, res] + data=None, files=None, verify=False): + """Send data through a POST method. + + Construct an URI for an endpoint and send data through + a POST method. + + Parameters + ---------- + url : str + The base URL for a Chewie-NS instance. + headers : dict + HTTP headers for POST requests. + endpoint_list : list + List with elements that will be concatenated to the + base URI to create the URI for the API endpoint. + data + Data to send. Must be an object that can be JSON + serialized. + files : dict + Dictionary with file data to send (see: + https://requests.readthedocs.io/en/master/user + /quickstart/#post-a-multipart-encoded-file). + verify : bool + If the SSL certificates should be verified in HTTPS + requests (False for no verification, True otherwise). + + Returns + ------- + url : str + Complete URI for the API endpoint. + res : requests.models.Response + The response returned through the POST method. + """ + # unpack list of sequential endpoints and pass to create URI + if endpoint_list is not None: + url = make_url(url, *endpoint_list) + + # if the content is defined as 'application/json' + # the data has to be passed after being processed with + # json.dumps() + res = requests.post(url, headers=headers, data=data, + files=files, verify=verify) + + return [url, res] def check_connection(url, headers=ct.HEADERS_GET_JSON): - """Verify connection to a Chewie-NS instance. - - Perform a GET request to the stats/summary endpoint of a - Chewie-NS instance to determine if it is possible to - establish a connection. - - Parameters - ---------- - url : str - The base URL for a Chewie-NS instance. - headers : dict - HTTP headers for GET requests. 
- - Returns - ------- - connection : bool - True if the GET request to the stats/summary - endpoint was successful (HTTP response status - code 200 or 201), False otherwise. - """ - try: - url, res = simple_get_request(url, headers, ['stats', 'summary']) - server_status = res.status_code - if server_status in [200, 201]: - connection = True - else: - connection = False - except Exception: - connection = False - - return connection + """Verify connection to a Chewie-NS instance. + + Perform a GET request to the stats/summary endpoint of a + Chewie-NS instance to determine if it is possible to + establish a connection. + + Parameters + ---------- + url : str + The base URL for a Chewie-NS instance. + headers : dict + HTTP headers for GET requests. + + Returns + ------- + connection : bool + True if the GET request to the stats/summary + endpoint was successful (HTTP response status + code 200 or 201), False otherwise. + """ + try: + url, res = simple_get_request(url, headers, ['stats', 'summary']) + server_status = res.status_code + if server_status in [200, 201]: + connection = True + else: + connection = False + except Exception: + connection = False + + return connection def user_info(base_url, headers_get): - """Get the information about a registered user in Chewie-NS. - - Get the user identifier, role and permission level for a - user registered in a Chewie-NS instance. The permission - level is used to determine if the user can upload schemas - to Chewie-NS. - - Parameters - ---------- - base_url : str - The base URL for a Chewie-NS instance. - headers_get : dict - HTTP headers for GET requests. - - Returns - ------- - user_id : str - User identifier in the Chewie-NS instance. - user_role : str - User role in the Chewie-NS instance. - permission : bool - True if the user has Admin or Contributor privileges, - False otherwise. - """ - # verify user role to check permission - user_url, user_info = simple_get_request(base_url, headers_get, - ['user', 'current_user']) - user_info = user_info.json() - - user_id = str(user_info['id']) - user_role = user_info['roles'].split(' ')[-1][:-1] - permission = any(role in user_role for role in ['Admin', 'Contributor']) - - return [user_id, user_role, permission] + """Get the information about a registered user in Chewie-NS. + + Get the user identifier, role and permission level for a + user registered in a Chewie-NS instance. The permission + level is used to determine if the user can upload schemas + to Chewie-NS. + + Parameters + ---------- + base_url : str + The base URL for a Chewie-NS instance. + headers_get : dict + HTTP headers for GET requests. + + Returns + ------- + user_id : str + User identifier in the Chewie-NS instance. + user_role : str + User role in the Chewie-NS instance. + permission : bool + True if the user has Admin or Contributor privileges, + False otherwise. + """ + # verify user role to check permission + user_url, user_info = simple_get_request(base_url, headers_get, + ['user', 'current_user']) + user_info = user_info.json() + + user_id = str(user_info['id']) + user_role = user_info['roles'].split(' ')[-1][:-1] + permission = any(role in user_role for role in ['Admin', 'Contributor']) + + return [user_id, user_role, permission] def species_ids(species_id, base_url, headers_get): - """Get a species identifier or name in Chewie-NS. - - Parameters - ---------- - species_id : str - The identifier of the species in Chewie-NS or the - name of the species. - base_url : str - The base URL for a Chewie-NS instance. 
- headers_get : dict - HTTP headers for GET requests. - - Returns - ------- - A list with the species identifier and name if the species - exists in Chewie-NS or a 404 HTTP status code if the species - was not found. - """ - try: - # check if species unique integer identifier was provided - int(species_id) - species_url, species_info = simple_get_request(base_url, headers_get, - ['species', species_id]) - if species_info.status_code in [200, 201]: - species_name = species_info.json()[0]['name']['value'] - return [species_id, species_name] - else: - return 404 - except ValueError: - # check if the scientific name for species was provided - species_name = species_id - # get the list of species added to Chewie-NS - ns_species = species_list(base_url, headers_get, ['species', 'list']) - # try to get the species unique integer identifier - species_id = ns_species.get(species_name, 'not_found') - if species_id != 'not_found': - return [species_id, species_name] - else: - return 404 + """Get a species identifier or name in Chewie-NS. + + Parameters + ---------- + species_id : str + The identifier of the species in Chewie-NS or the + name of the species. + base_url : str + The base URL for a Chewie-NS instance. + headers_get : dict + HTTP headers for GET requests. + + Returns + ------- + A list with the species identifier and name if the species + exists in Chewie-NS or a 404 HTTP status code if the species + was not found. + """ + try: + # check if species unique integer identifier was provided + int(species_id) + species_url, species_info = simple_get_request(base_url, headers_get, + ['species', species_id]) + if species_info.status_code in [200, 201]: + species_name = species_info.json()[0]['name']['value'] + return [species_id, species_name] + else: + return 404 + except ValueError: + # check if the scientific name for species was provided + species_name = species_id + # get the list of species added to Chewie-NS + ns_species = species_list(base_url, headers_get, ['species', 'list']) + # try to get the species unique integer identifier + species_id = ns_species.get(species_name, 'not_found') + if species_id != 'not_found': + return [species_id, species_name] + else: + return 404 def species_list(base_url, headers_get, endpoint_list): - """Get the list of species in a Chewie-NS instance. - - Parameters - ---------- - base_url : str - The base URL for a Chewie Nomenclature Server - instance. - headers_get : dict - HTTP headers for GET requests. - endpoint_list : list - List with elements that will be concatenated - to the base URI to create the URI for the API endpoint. - - Returns - ------- - species_lst : dict - Dictionary with species names as keys and - species identifiers as values. - """ - species_url, res = simple_get_request(base_url, headers_get, endpoint_list) - res = res.json() - species_lst = {} - for sp in res: - # get species scientific name - species = sp['name']['value'] - # get species unique integer identifier from URl - species_url = sp['species']['value'] - species_id = species_url.split('/')[-1] - species_lst[species] = species_id - - return species_lst + """Get the list of species in a Chewie-NS instance. + + Parameters + ---------- + base_url : str + The base URL for a Chewie Nomenclature Server + instance. + headers_get : dict + HTTP headers for GET requests. + endpoint_list : list + List with elements that will be concatenated + to the base URI to create the URI for the API endpoint. 
+ + Returns + ------- + species_lst : dict + Dictionary with species names as keys and + species identifiers as values. + """ + species_url, res = simple_get_request(base_url, headers_get, endpoint_list) + res = res.json() + species_lst = {} + for sp in res: + # get species scientific name + species = sp['name']['value'] + # get species unique integer identifier from URl + species_url = sp['species']['value'] + species_id = species_url.split('/')[-1] + species_lst[species] = species_id + + return species_lst def is_url(url): - """Check if a URL is valid. + """Check if a URL is valid. - Parameters - ---------- - url : str - The URL to be validated. + Parameters + ---------- + url : str + The URL to be validated. - Returns - ------- - True if the URL is valid, False otherwise. - """ - try: - result = urlparse(url) - return all([result.scheme, result.netloc, result.path]) - except Exception: - return False + Returns + ------- + True if the URL is valid, False otherwise. + """ + try: + result = urlparse(url) + return all([result.scheme, result.netloc, result.path]) + except Exception: + return False def make_url(base_url, *res, **params): - """Create a URL. - - Parameters - ---------- - base_url : str - The base URL. - res : list - List with elements that will be concatenated to the - base URI to create the URI for the API endpoint. - params : str - Addtional parameters (WIP). - - Returns - ------- - url : str - URL constructed by adding endpoints and additional - parameters. If the provided base URL is invalid - returns 'An invalid URL was provided.' - """ - url = base_url - # Check if the url is valid - if is_url(url): - - if url[-1] == '/': - url = url[:-1] - - # Add the endpoints - for r in res: - url = '{0}/{1}'.format(url, r) - - # Add params if they are provided - if params: - url = '{0}?{1}'.format(url, urlencode(params)) - - return url - else: - return 'An invalid URL was provided.' + """Create a URL. + + Parameters + ---------- + base_url : str + The base URL. + res : list + List with elements that will be concatenated to the + base URI to create the URI for the API endpoint. + params : str + Addtional parameters (WIP). + + Returns + ------- + url : str + URL constructed by adding endpoints and additional + parameters. If the provided base URL is invalid + returns 'An invalid URL was provided.' + """ + url = base_url + # Check if the url is valid + if is_url(url): + + if url[-1] == '/': + url = url[:-1] + + # Add the endpoints + for r in res: + url = '{0}/{1}'.format(url, r) + + # Add params if they are provided + if params: + url = '{0}?{1}'.format(url, urlencode(params)) + + return url + else: + return 'An invalid URL was provided.' def get_sequence_from_url(url): - """Extract sequence from URL query component. + """Extract sequence from URL query component. - Parameters - ---------- - url : str - URL that contains the sequence. + Parameters + ---------- + url : str + URL that contains the sequence. - Returns - ------- - seq : str - Sequence extracted from the URL. - """ - seq = parse_qs(urlsplit(url).query)['sequence'][0] + Returns + ------- + seq : str + Sequence extracted from the URL. + """ + seq = parse_qs(urlsplit(url).query)['sequence'][0] - return seq + return seq def login_user_to_NS(server_url, email, password): - """Login user to a Chewie-NS instance. - - Parameters - ---------- - server_url : str - The base URL for a Chewie-NS instance. - email : str - Email linked to user's account in Chewie-NS. - password : str - Password for the user's account in Chewie-NS. 
- - Returns - ------- - token : str - Authorization token to perform requests to Chewie-NS - instance if login is successful. Otherwise, returns - False. - """ - auth_params = {} - auth_params['email'] = email - auth_params['password'] = password - - auth_url, auth_r = simple_post_request(server_url, - ct.HEADERS_POST_JSON, - ['auth', 'login'], - data=json.dumps(auth_params)) - - auth_result = auth_r.json() - if auth_result['status'] == 'success': - token = auth_result['access_token'] - else: - token = False - - return token + """Login user to a Chewie-NS instance. + + Parameters + ---------- + server_url : str + The base URL for a Chewie-NS instance. + email : str + Email linked to user's account in Chewie-NS. + password : str + Password for the user's account in Chewie-NS. + + Returns + ------- + token : str + Authorization token to perform requests to Chewie-NS + instance if login is successful. Otherwise, returns + False. + """ + auth_params = {} + auth_params['email'] = email + auth_params['password'] = password + + auth_url, auth_r = simple_post_request(server_url, + ct.HEADERS_POST_JSON, + ['auth', 'login'], + data=json.dumps(auth_params)) + + auth_result = auth_r.json() + if auth_result['status'] == 'success': + token = auth_result['access_token'] + else: + token = False + + return token def capture_login_credentials(base_url): - """Capture login credentials to login user into Chewie-NS. - - Parameters - ---------- - base_url : str - The base URL for a Chewie-NS instance. - - Returns - ------- - token : str - Authorization token to perform requests to Chewie-NS. - """ - print('\nPlease provide login credentials:') - user = input('USERNAME: ') - password = getpass('PASSWORD: ') - print() - # get token - token = login_user_to_NS(base_url, user, password) - # if login was not successful, stop the program - if token is False: - sys.exit('Invalid credentials.') - - return token + """Capture login credentials to login user into Chewie-NS. + + Parameters + ---------- + base_url : str + The base URL for a Chewie-NS instance. + + Returns + ------- + token : str + Authorization token to perform requests to Chewie-NS. + """ + print('\nPlease provide login credentials:') + user = input('USERNAME: ') + password = getpass('PASSWORD: ') + print() + # get token + token = login_user_to_NS(base_url, user, password) + # if login was not successful, stop the program + if token is False: + sys.exit('Invalid credentials.') + + return token def get_species_schemas(schema_id, species_id, base_url, headers_get): - """Determine if there is a schema with specified identifier. - - Retrieves the list of schemas for a species in a Chewie-NS - instance and determines if it has a schema with the - specified identifier. - - Parameters - ---------- - schema_id : str - The identifier of the schema in Chewie-NS. - species_id : str - The identifier of the schema's species in Chewie-NS. - base_url : str - The base URL for the Chewie-NS instance. - headers_get : dict - HTTP headers for GET requests. - - Returns - ------- - A list with the schema identifier, the schema URI, the - schema name and a dictionary with all properties for - the schema in Chewie-NS. - - Raises - ------ - SystemExit - - If the schema with the specified identifier does - not exist. - - If the process cannot retrieve the list of schemas - for the species. 
- """ - # get the list of schemas for the species - schemas_url, schema_get = simple_get_request(base_url, headers_get, - ['species', species_id, - 'schemas']) - schema_get_status = schema_get.status_code - if schema_get_status in [200, 201]: - species_schemas = schema_get.json() - - # extract schemas identifiers, URIs and names from response - schemas_info = [] - for s in species_schemas: - schema_uri = s['schemas']['value'] - scid = schema_uri.split('/')[-1] - schema_name = s['name']['value'] - schemas_info.append([scid, schema_uri, schema_name]) - - # select schema with specified identifier - schema = [s for s in schemas_info if schema_id in s] - if len(schema) > 0: - # get schema parameters - schema = schema[0] - url, schema_params = simple_get_request(schema[1], headers_get) - schema_params = schema_params.json()[0] - schema.append(schema_params) - return schema - else: - sys.exit('Could not find a schema with provided identifier.') - else: - sys.exit('Could not retrieve schemas for current species.') + """Determine if there is a schema with specified identifier. + + Retrieves the list of schemas for a species in a Chewie-NS + instance and determines if it has a schema with the + specified identifier. + + Parameters + ---------- + schema_id : str + The identifier of the schema in Chewie-NS. + species_id : str + The identifier of the schema's species in Chewie-NS. + base_url : str + The base URL for the Chewie-NS instance. + headers_get : dict + HTTP headers for GET requests. + + Returns + ------- + A list with the schema identifier, the schema URI, the + schema name and a dictionary with all properties for + the schema in Chewie-NS. + + Raises + ------ + SystemExit + - If the schema with the specified identifier does + not exist. + - If the process cannot retrieve the list of schemas + for the species. + """ + # get the list of schemas for the species + schemas_url, schema_get = simple_get_request(base_url, headers_get, + ['species', species_id, + 'schemas']) + schema_get_status = schema_get.status_code + if schema_get_status in [200, 201]: + species_schemas = schema_get.json() + + # extract schemas identifiers, URIs and names from response + schemas_info = [] + for s in species_schemas: + schema_uri = s['schemas']['value'] + scid = schema_uri.split('/')[-1] + schema_name = s['name']['value'] + schemas_info.append([scid, schema_uri, schema_name]) + + # select schema with specified identifier + schema = [s for s in schemas_info if schema_id in s] + if len(schema) > 0: + # get schema parameters + schema = schema[0] + url, schema_params = simple_get_request(schema[1], headers_get) + schema_params = schema_params.json()[0] + schema.append(schema_params) + return schema + else: + sys.exit('Could not find a schema with provided identifier.') + else: + sys.exit('Could not retrieve schemas for current species.') def upload_file(file, filename, url, headers, verify_ssl): - """Upload a file to a Chewie-NS instance. - - Parameters - ---------- - file : str - Path to the file to upload. - filename : str - Name used to save the file in Chewie-NS. - url : str - Endpoint URL that receives the POST request. - headers : dict - HTTP POST request headers. - verify_sll : bool - If the SSL certificates should be verified in - HTTPS requests (False for no verification, True - otherwise). - - Returns - ------- - response : requests.models.Response - Response object from the 'requests' module. 
- """ - file_handle = open(file, 'rb') - files = {'file': (filename, file_handle)} - up_url, response = simple_post_request(url, headers, - files=files, - verify=verify_ssl) - - file_handle.close() - - return response + """Upload a file to a Chewie-NS instance. + + Parameters + ---------- + file : str + Path to the file to upload. + filename : str + Name used to save the file in Chewie-NS. + url : str + Endpoint URL that receives the POST request. + headers : dict + HTTP POST request headers. + verify_sll : bool + If the SSL certificates should be verified in + HTTPS requests (False for no verification, True + otherwise). + + Returns + ------- + response : requests.models.Response + Response object from the 'requests' module. + """ + file_handle = open(file, 'rb') + files = {'file': (filename, file_handle)} + up_url, response = simple_post_request(url, headers, + files=files, + verify=verify_ssl) + + file_handle.close() + + return response diff --git a/CHEWBBACA/utils/constants.py b/CHEWBBACA/utils/constants.py index 08648b00..1ba6bf84 100755 --- a/CHEWBBACA/utils/constants.py +++ b/CHEWBBACA/utils/constants.py @@ -80,12 +80,12 @@ GENETIC_CODES = {1: 'The Standard Code', 2: 'The Vertebrate Mitochondrial Code', 3: 'The Yeast Mitochondrial Code', - 4: 'The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code', + 4: 'The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code', 5: 'The Invertebrate Mitochondrial Code', 6: 'The Ciliate, Dasycladacean and Hexamita Nuclear Code', 9: 'The Echinoderm and Flatworm Mitochondrial Code', 10: 'The Euplotid Nuclear Code', - 11: 'The Bacterial, Archaeal and Plant Plastid Code', + 11: 'The Bacterial, Archaeal and Plant Plastid Code', 12: 'The Alternative Yeast Nuclear Code', 13: 'The Ascidian Mitochondrial Code', 14: 'The Alternative Flatworm Mitochondrial Code', @@ -95,7 +95,7 @@ 22: 'Scenedesmus obliquus Mitochondrial Code', 23: 'Thraustochytrium Mitochondrial Code', 24: 'Rhabdopleuridae Mitochondrial Code', - 25: 'Candidate Division SR1 and Gracilibacteria Code'} + 25: 'Candidate Division SR1 and Gracilibacteria Code'} GENETIC_CODES_DEFAULT = 11 @@ -111,23 +111,23 @@ # Chewie-NS related constants HEADERS_GET_ = {'Authorization': None, - 'accept': 'application/octet-stream'} + 'accept': 'application/octet-stream'} HEADERS_GET_JSON = {'Authorization': None, - 'accept': 'application/json'} + 'accept': 'application/json'} HEADERS_POST = {'Authorization': None, - 'user_id': None} + 'user_id': None} HEADERS_POST_JSON = {'Authorization': None, - 'Content-type': 'application/json', - 'accept': 'application/json', - 'user_id': None} + 'Content-type': 'application/json', + 'accept': 'application/json', + 'user_id': None} # List of Chewie-NS instance identifiers and URLs HOST_NS = {'main': 'https://chewbbaca.online/api/NS/api/', - 'tutorial': 'https://tutorial.chewbbaca.online/api/NS/api/', - 'local': 'http://127.0.0.1:5000/NS/api/'} + 'tutorial': 'https://tutorial.chewbbaca.online/api/NS/api/', + 'local': 'http://127.0.0.1:5000/NS/api/'} # Authors, GitHub repository, documentation, tutorial and contacts AUTHORS = 'Rafael Mamede, Pedro Cerqueira, Mickael Silva, João Carriço, Mário Ramirez' @@ -152,18 +152,18 @@ # To avoid this warning the TXT file must be converted to binary with the blastdb_aliastool # Performance can be severely affected if the TXT is not converted to binary IGNORE_RAISED = ['Warning: [blastp] To obtain better run time ' - 'performance, please run blastdb_aliastool ' - 
'-seqid_file_in -seqid_file_out ' - ' and use as the ' - 'argument to -seqidlist'] + 'performance, please run blastdb_aliastool ' + '-seqid_file_in -seqid_file_out ' + ' and use as the ' + 'argument to -seqidlist'] # Path to MAFFT executable MAFFT_ALIAS = shutil.which('mafft') # Replacements for genome and loci identifiers CHAR_REPLACEMENTS = [("|", "_"), ("_", "-"), ("(", ""), - (")", ""), ("'", ""), ("\"", ""), - (":", "")] + (")", ""), ("'", ""), ("\"", ""), + (":", "")] # Minimum Python version MIN_PYTHON = [(3, 6, 0), '3.6.0'] @@ -185,11 +185,11 @@ # AlleleCall logfile basename and content template LOGFILE_BASENAME = 'logging_info.txt' LOGFILE_TEMPLATE = ('Started script at: {0}\n' - 'Finished script at: {1}\n' - 'Number of inputs: {2}\n' - 'Number of loci: {3}\n' - 'Used this number of CPU cores: {4}\n' - 'Used a BSR of: {5}\n') + 'Finished script at: {1}\n' + 'Number of inputs: {2}\n' + 'Number of loci: {3}\n' + 'Used this number of CPU cores: {4}\n' + 'Used a BSR of: {5}\n') # Basename for files created by the AlleleCall module RESULTS_COORDINATES_BASENAME = 'results_contigsInfo.tsv' @@ -208,7 +208,7 @@ GENE_LIST_BASENAME = '.genes_list' # Header for TSV file with loci stats LOCI_STATS_HEADER = ('Locus\tEXC\tINF\tPLOT3\tPLOT5\tLOTSC\tNIPH\t' - 'NIPHEM\tALM\tASM\tPAMA\tLNF\tTotal_CDS') + 'NIPHEM\tALM\tASM\tPAMA\tLNF\tTotal_CDS') # Header for TSV file with information about extracted CDSs CDS_TABLE_HEADER = 'Genome\tContig\tStart\tStop\tProtein_ID\tCoding_Strand\n' # Headers for TSV files with paralogous loci count and per genome @@ -219,8 +219,8 @@ # Allele calling classifications ALLELECALL_CLASSIFICATIONS = ['EXC', 'INF', 'PLOT3', 'PLOT5', - 'LOTSC', 'NIPH', 'NIPHEM', 'ALM', - 'ASM', 'PAMA', 'LNF'] + 'LOTSC', 'NIPH', 'NIPHEM', 'ALM', + 'ASM', 'PAMA', 'LNF'] # PLNF classificaton for modes {1,2,3} PROBABLE_LNF = 'PLNF' @@ -245,17 +245,17 @@ # Dictionary template to map variables returned by AlleleCall ALLELECALL_DICT = {'classification_files': None, - 'basename_map': None, - 'cds_coordinates': None, - 'cds_counts': None, - 'dna_fasta': None, - 'protein_fasta': None, - 'dna_hashtable': None, - 'protein_hashtable': None, - 'invalid_alleles': None, - 'unclassified_ids': None, - 'self_scores': None, - 'representatives': None} + 'basename_map': None, + 'cds_coordinates': None, + 'cds_counts': None, + 'dna_fasta': None, + 'protein_fasta': None, + 'dna_hashtable': None, + 'protein_hashtable': None, + 'invalid_alleles': None, + 'unclassified_ids': None, + 'self_scores': None, + 'representatives': None} GENOME_LIST = 'listGenomes2Call.txt' LOCI_LIST = 'listGenes2Call.txt' @@ -282,7 +282,7 @@ # File header for file with summary statistics created by PrepExternalSchema PREPEXTERNAL_SUMMARY_STATS_HEADER = ('Gene\tTotal_alleles\tValid_alleles\t' - 'Number_representatives') + 'Number_representatives') # Default loci presence thresholds used to compute the cgMLST CGMLST_THRESHOLDS = [0.95, 0.99, 1] @@ -293,15 +293,15 @@ - - - Schema Evaluator - React Edition + + + Schema Evaluator - React Edition - -
[HTML report-template hunks: the embedded markup was stripped during text extraction. Recoverable details: templates titled "Schema Evaluator - React Edition", "Schema Evaluator - Individual Analysis" (hunk @@ -311,19 +311,19 @@, LOCUS_REPORT_HTML) and "AlleleCall Report - React Edition" (hunk @@ -332,15 +332,15 @@); as in the rest of the patch, these hunks only convert the indentation of the template lines from spaces to tabs.]
+ + """) @@ -377,107 +377,107 @@ # Table header for the Schema report summary data SCHEMA_SUMMARY_TABLE_HEADERS = ('Loci\tAlleles\tValid Alleles\tInvalid ' - 'alleles\tIncomplete ORF\tAmbiguous Bases\t' - 'Missing Start/Stop Codon\tIn-frame Stop ' - 'Codon\tAlleles < {0}bp\tAlleles below ' - 'threshold\tAlleles above threshold') + 'alleles\tIncomplete ORF\tAmbiguous Bases\t' + 'Missing Start/Stop Codon\tIn-frame Stop ' + 'Codon\tAlleles < {0}bp\tAlleles below ' + 'threshold\tAlleles above threshold') # Column headers for the Loci Analysis Table in the Schema report LOCI_ANALYSIS_COLUMNS = ('Locus\tTotal Alleles\tValid Alleles\tInvalid ' - 'Alleles\tProportion of Validated Alleles\tDistinct ' - 'Protein Alleles\tIncomplete ' - 'ORF\tAmbiguous Bases\tMissing ' - 'Start/Stop Codon\tIn-frame Stop Codon\tAlleles ' - '< {0}bp\tAlleles below threshold\tAlleles above ' - 'threshold\tMissing Allele IDs') + 'Alleles\tProportion of Validated Alleles\tDistinct ' + 'Protein Alleles\tIncomplete ' + 'ORF\tAmbiguous Bases\tMissing ' + 'Start/Stop Codon\tIn-frame Stop Codon\tAlleles ' + '< {0}bp\tAlleles below threshold\tAlleles above ' + 'threshold\tMissing Allele IDs') # Column headers for the Summary Table in the Loci reports LOCUS_COLUMNS = ('Locus\tTotal Alleles\tValid Alleles\tInvalid ' - 'Alleles\tProportion of Validated Alleles\tDistinct ' - 'Protein Alleles\t' - 'Incomplete ORF\tAmbiguous Bases\tMissing Start/Stop ' - 'Codon\tIn-frame Stop Codon\tAlleles < {0}bp\tSize Range ' - '(bp)\tLength Median (bp)\tLength Mode (bp)\tAlleles below ' - 'threshold ({1}bp)\tAlleles above threshold ({2}bp)\t' - 'Missing Allele IDs') + 'Alleles\tProportion of Validated Alleles\tDistinct ' + 'Protein Alleles\t' + 'Incomplete ORF\tAmbiguous Bases\tMissing Start/Stop ' + 'Codon\tIn-frame Stop Codon\tAlleles < {0}bp\tSize Range ' + '(bp)\tLength Median (bp)\tLength Mode (bp)\tAlleles below ' + 'threshold ({1}bp)\tAlleles above threshold ({2}bp)\t' + 'Missing Allele IDs') # Column headers for the Invalid Alleles table in the loci reports INVALID_ALLELES_COLUMNS = ['Allele ID', 'Exception Category', - 'Exception Description'] + 'Exception Description'] TRANSLATION_EXCEPTIONS = ['Extra in frame stop codon', - 'is not a start codon', - 'is not a stop codon', - 'sequence length is not a multiple of 3', - 'ambiguous or invalid characters'] + 'is not a start codon', + 'is not a stop codon', + 'sequence length is not a multiple of 3', + 'ambiguous or invalid characters'] DISTINCT_ALLELES_COLUMNS = ['Protein Allele ID', - 'Count', - 'List of Distinct Alleles'] + 'Count', + 'List of Distinct Alleles'] # Column headers for the Sample Stats table in the allele calling report SAMPLE_STATS_COLUMNS = ['Sample', 'Total Contigs', 'Total CDSs', - 'Proportion of Classified CDSs', 'Identified Loci', - 'Proportion of Identified Loci', - 'Valid Classifications', 'Invalid Classifications'] + 'Proportion of Classified CDSs', 'Identified Loci', + 'Proportion of Identified Loci', + 'Valid Classifications', 'Invalid Classifications'] # Column headers for the Loci Stats table in the allele calling report LOCI_STATS_COLUMNS = ['Locus', 'Total CDSs', 'Valid Classes', - 'Invalid Classes', 'Proportion Samples'] + 'Invalid Classes', 'Proportion Samples'] # Column headers for the Summary Stats table in the allele calling report SUMMARY_STATS_COLUMNS = ['Total Samples', 'Total Loci', 'Total CDSs', - 'Total CDSs Classified', 'EXC', 'INF', - 'PLOT3', 'PLOT5', 'LOTSC', 'NIPH', - 'NIPHEM', 'ALM', 'ASM', 'PAMA', 'LNF'] + 'Total CDSs Classified', 'EXC', 
'INF', + 'PLOT3', 'PLOT5', 'LOTSC', 'NIPH', + 'NIPHEM', 'ALM', 'ASM', 'PAMA', 'LNF'] # Exception messages # Input file is a FASTA file but chewBBACA expects a file with a list of # file paths FASTA_INPUT_EXCEPTION = ('Input file is a FASTA file. Please provide ' - 'the path to the parent directory that contains ' - 'the FASTA files or a file with the list of full ' - 'paths to the FASTA files (one per line).') + 'the path to the parent directory that contains ' + 'the FASTA files or a file with the list of full ' + 'paths to the FASTA files (one per line).') # Some input files have an invalid file extension INVALID_EXTENSION_EXCEPTION = ('The following input files do not have a ' - 'valid file extension:\n{0}\nPlease ensure ' - 'that the filenames end with one of the ' - f'following extensions: {FASTA_EXTENSIONS}.') + 'valid file extension:\n{0}\nPlease ensure ' + 'that the filenames end with one of the ' + f'following extensions: {FASTA_EXTENSIONS}.') # Some of the file paths provided do not exist MISSING_INPUTS_EXCEPTION = ('Could not find some of the files provided in ' - 'the input list. Please verify that you\'ve ' - 'provided valid paths to the following input ' - 'files.\n{0}') + 'the input list. Please verify that you\'ve ' + 'provided valid paths to the following input ' + 'files.\n{0}') # Files that do not have the expected format of a FASTA file NON_FASTA_EXCEPTION = ('The following input files are not in FASTA format:\n{0}') # Input directory does not contain FASTA files MISSING_FASTAS_EXCEPTION = ('Could not get input files. Please provide ' - 'a directory with FASTA files or a file with ' - 'the list of full paths to the FASTA files ' - 'and ensure that filenames end with one of ' - f'the following extensions: {FASTA_EXTENSIONS}.') + 'a directory with FASTA files or a file with ' + 'the list of full paths to the FASTA files ' + 'and ensure that filenames end with one of ' + f'the following extensions: {FASTA_EXTENSIONS}.') # Input path is neither a file nor a directory INVALID_INPUT_PATH = ('Input argument is not a valid directory or ' - 'file with a list of paths to FASTA files. Please ' - 'provide a valid input, either a folder with FASTA ' - 'files or a file with the list of full paths to FASTA ' - 'files (one per line and ending with one of the ' - f'following file extensions: {FASTA_EXTENSIONS}).') + 'file with a list of paths to FASTA files. Please ' + 'provide a valid input, either a folder with FASTA ' + 'files or a file with the list of full paths to FASTA ' + 'files (one per line and ending with one of the ' + f'following file extensions: {FASTA_EXTENSIONS}).') # Path to schema does not exist SCHEMA_PATH_MISSING = ('Path to input schema does not exist. Please provide ' - 'a valid path.') + 'a valid path.') # Path to schema does not include expected files SCHEMA_INVALID_PATH = ('Provided path does not include all the necessary ' - 'schema files. Please verify that you have passed ' - 'the correct path to the schema.') + 'schema files. Please verify that you have passed ' + 'the correct path to the schema.') # User provided legacy schema. Tell user to adapt with the PrepExternalSchema module ADAPT_LEGACY_SCHEMA = ('Schema does not include a config file or includes files that' @@ -487,31 +487,31 @@ # Output directory exists OUTPUT_DIRECTORY_EXISTS = ('Output directory already exists. 
Please ' - 'provide a path to a directory that will be ' - 'created to store the results.') + 'provide a path to a directory that will be ' + 'created to store the results.') # Input file is a FASTA file but chewBBACA expects a file with a list of # loci file paths FASTA_LOCI_LIST_EXCEPTION = ('Path provided to --gl is for a FASTA file. Please provide ' - 'a file with the list of locus identifiers or full ' - 'paths to the loci FASTA files (one per line).') + 'a file with the list of locus identifiers or full ' + 'paths to the loci FASTA files (one per line).') # Invalid paths to loci FASTA files MISSING_LOCI_EXCEPTION = ('Could not find some of the loci FASTA files provided in ' - 'the input list. Please verify that you\'ve ' - 'provided valid paths to the following input ' - 'files.\n{0}') + 'the input list. Please verify that you\'ve ' + 'provided valid paths to the following input ' + 'files.\n{0}') # Invalid format for loci files NON_FASTA_LOCI_EXCEPTION = ('The following loci files are not in FASTA format:\n{0}') # User does not have permissions to upload schemas to Chewie-NS LOADSCHEMA_NO_PERMISSIONS = ('Current user has no Administrator or Contributor ' - 'permissions.\nNot allowed to upload schemas.') + 'permissions.\nNot allowed to upload schemas.') # PTF is missing from schema's directory LOADSCHEMA_MISSING_PTF = ('Please ensure that the schema\'s directory includes the ' - 'Prodigal training file used to create the schema.') + 'Prodigal training file used to create the schema.') # Path for PTF does not exist INVALID_PTF_PATH = 'Invalid path for Prodigal training file.' @@ -532,27 +532,27 @@ INVALID_ST_TYPE = ('\nInvalid size threshold value used to create schema. Value must be None or a positive float in the [0.0, 1.0] interval.') INVALID_GENETIC_CODE = ('\nInvalid genetic code value.\nValue must correspond to ' - 'one of the accepted genetic codes\n\nAccepted genetic ' - 'codes:\n{0}') + 'one of the accepted genetic codes\n\nAccepted genetic ' + 'codes:\n{0}') INVALID_WS = ('\nWord size for the clustering step ' - 'must be equal or greater than {0} and ' - 'equal or smaller than {1}.') + 'must be equal or greater than {0} and ' + 'equal or smaller than {1}.') INVALID_WS_TYPE = ('\nSchema created with invalid clustering word size value.') INVALID_CS = ('\nClustering similarity threshold value ' - 'must be contained in the [0.0, 1.0] ' - 'interval.') + 'must be contained in the [0.0, 1.0] ' + 'interval.') INVALID_CS_TYPE = ('\nSchema created with invalid clustering threshold value.') INVALID_RF = ('\nRepresentative filter threshold value ' - 'must be contained in the [0.0, 1.0] ' - 'interval.') + 'must be contained in the [0.0, 1.0] ' + 'interval.') INVALID_RF_TYPE = ('\nSchema created with invalid representative filter value.') INVALID_ICF = ('\nIntra-cluster filter value ' - 'must be contained in the [0.0, 1.0] ' - 'interval.') + 'must be contained in the [0.0, 1.0] ' + 'interval.') INVALID_ICF_TYPE = ('\nSchema created with invalid intra-cluster filter value.') NS_CANNOT_CONNECT = ('Failed to establish a connection to the Chewie-NS instance at {0}.') @@ -560,50 +560,50 @@ PYTHON_VERSION = ('Python version found: {0}\nPlease use Python >= {1}') CPU_RESET_WARNING = ('Warning! You have provided a CPU core count value ' - 'that is equal to or exceeds the number of CPU ' - 'cores in your system! Resetting to: {0}') + 'that is equal to or exceeds the number of CPU ' + 'cores in your system! Resetting to: {0}') CPU_VALUE_WARNING = ('Warning! 
You have provided a CPU core count value ' - 'that is close to the maximum core count of your ' - 'machine ({0}/{1}). This may affect your system ' - 'responsiveness.') + 'that is close to the maximum core count of your ' + 'machine ({0}/{1}). This may affect your system ' + 'responsiveness.') BLAST_NO_PATH = ('Could not find BLAST executables.') BLAST_NO_VERSION = ('Could not determine BLAST version. Please make ' - 'sure that BLAST>={0}.{1} is installed.') + 'sure that BLAST>={0}.{1} is installed.') BLAST_UPDATE = ('Found BLAST {0}.{1}. Please update BLAST to version >={2}.{3}') MULTIPLE_PTFS = ('Found more than one Prodigal training ' - 'file in the schema directory.\nPlease maintain ' - 'only the training file used in the schema ' - 'creation process.') + 'file in the schema directory.\nPlease maintain ' + 'only the training file used in the schema ' + 'creation process.') MISSING_PTF = ('Could not find a Prodigal training file in the schema directory.') INVALID_PTF_PATH = ('Cannot find specified Prodigal training file.' - '\nPlease provide a valid training file.\nYou ' - 'can create a training file for a species of ' - 'interest with the following command:\n\n prodigal ' - '-i -t -p ' - 'single\n\nIt is strongly advised to provide a ' - 'high-quality and closed genome for the training ' - 'process.') + '\nPlease provide a valid training file.\nYou ' + 'can create a training file for a species of ' + 'interest with the following command:\n\n prodigal ' + '-i -t -p ' + 'single\n\nIt is strongly advised to provide a ' + 'high-quality and closed genome for the training ' + 'process.') DIFFERENT_PTF_PROMPT = ('Prodigal training file is not the one ' - 'used to create the schema. Using this training ' + 'used to create the schema. Using this training ' 'file might lead to results not consistent with ' 'previous runs and invalidate the schema for ' 'usage with Chewie-NS.\nContinue process?\n') MULTIPLE_PTF_PROMPT = ('Prodigal training file is not any of the {0} ' - 'used in previous runs.\nContinue?\n') + 'used in previous runs.\nContinue?\n') ARGS_DIFFER = ('Provided argument values differ from the values ' 'used for schema creation:\n') ARGS_DIFFER_PROMPT = ('\nContinuing might lead to results not ' - 'consistent with previous runs.\nProviding ' - 'parameter values that differ from the values ' - 'used for schema creation will also invalidate ' - 'the schema for uploading and synchronization ' - 'with Chewie-NS.\nContinue? (yes/no)\n') + 'consistent with previous runs.\nProviding ' + 'parameter values that differ from the values ' + 'used for schema creation will also invalidate ' + 'the schema for uploading and synchronization ' + 'with Chewie-NS.\nContinue? 
(yes/no)\n') MISSING_CONFIG = ('Could not find a valid config file.') diff --git a/CHEWBBACA/utils/distance_matrix.py b/CHEWBBACA/utils/distance_matrix.py index e32c6d25..478c7b43 100644 --- a/CHEWBBACA/utils/distance_matrix.py +++ b/CHEWBBACA/utils/distance_matrix.py @@ -25,408 +25,408 @@ import pandas as pd try: - from utils import ( - constants as ct, - file_operations as fo, - iterables_manipulation as im, - multiprocessing_operations as mo) + from utils import ( + constants as ct, + file_operations as fo, + iterables_manipulation as im, + multiprocessing_operations as mo) except ModuleNotFoundError: - from CHEWBBACA.utils import ( - constants as ct, - file_operations as fo, - iterables_manipulation as im, - multiprocessing_operations as mo) + from CHEWBBACA.utils import ( + constants as ct, + file_operations as fo, + iterables_manipulation as im, + multiprocessing_operations as mo) def tsv_to_nparray(input_file, array_dtype='int32'): - """Read matrix of allelic profiles and convert to Numpy array. - - File lines are read as a generator to exclude the sample identifier - and avoid loading huge files into memory. - - Parameters - ---------- - input_file : str - Path to the TSV file with the matrix with - allelic profiles. - array_dtype : str - Array data type. - - Returns - ------- - np_array : ndarray - Numpy array with the numeric values for - all allelic profiles. - """ - # import matrix without column and row identifiers - with open(input_file, 'r') as infile: - lines = ('\t'.join(line.split('\t')[1:]) - for line in infile) - # dtype=float32 should be faster than integer dtypes - # but runs faster with dtype=int32 in test setup - # dtype=int32 supports max integer of 2,147,483,647 - # should be safe even when arrays are multiplied - np_array = np.genfromtxt(fname=lines, delimiter='\t', - dtype=array_dtype, skip_header=1) - - # need to reshape array to 2D if it only has info for one sample - try: - # if it does not have columns it needs to be reshaped - np_array.shape[1] - except Exception: - np_array = np.array([np_array]) - - return np_array + """Read matrix of allelic profiles and convert to Numpy array. + + File lines are read as a generator to exclude the sample identifier + and avoid loading huge files into memory. + + Parameters + ---------- + input_file : str + Path to the TSV file with the matrix with + allelic profiles. + array_dtype : str + Array data type. + + Returns + ------- + np_array : ndarray + Numpy array with the numeric values for + all allelic profiles. + """ + # import matrix without column and row identifiers + with open(input_file, 'r') as infile: + lines = ('\t'.join(line.split('\t')[1:]) + for line in infile) + # dtype=float32 should be faster than integer dtypes + # but runs faster with dtype=int32 in test setup + # dtype=int32 supports max integer of 2,147,483,647 + # should be safe even when arrays are multiplied + np_array = np.genfromtxt(fname=lines, delimiter='\t', + dtype=array_dtype, skip_header=1) + + # need to reshape array to 2D if it only has info for one sample + try: + # if it does not have columns it needs to be reshaped + np_array.shape[1] + except Exception: + np_array = np.array([np_array]) + + return np_array def compute_distances(indexes, np_matrix, genome_ids, tmp_directory): - """Compute pairwise allelic differences and number of shared loci. - - Parameters - ---------- - indexes : list - List with the line index of the allelic profiles - that will be processed. 
- np_matrix : ndarray - Numpy array with dtype=int32 values for allelic profiles. - genome_ids : list - List with sample identifiers. - tmp_directory : str - Path to temporary directory where pickle files with - results will be stored. - - Returns - ------- - output_files : list - List with the paths to all pickle files that were created - to store results. - """ - # multiply one row per cycle to avoid memory overflow - # read only part of the matrix for huge files and process in chunks? - output_files = {} - for i in indexes: - current_genome = genome_ids[i] - # get one row to perform pairwise comparisons against whole matrix - current_row = np_matrix[i:i+1, :] - # do not multiply against rows that were multiplied against - # matrix's rows in previous iterations - # combinations instead of permutations - permutation_rows = np_matrix[i:, :] - - # multiply 1D-array per whole matrix - # all non-shared loci will be converted to 0 - # values different than 0 correspond to shared loci - multiplied = current_row * permutation_rows - # count number of shared loci, non-zero values - # pairwise_shared_loci = np.count_nonzero(multiplied, axis=-1) - - # subtraction will lead to values different than 0 for loci that have different alleles - # multiplying ensures that we only keep results for shared loci and not for - # loci that are not shared and that had value different than 0 from subtraction - pairwise_allelic_differences = np.count_nonzero(multiplied * (current_row - permutation_rows), axis=-1) - - output_file = os.path.join(tmp_directory, current_genome) - # fo.pickle_dumper([pairwise_shared_loci, pairwise_allelic_differences], output_file) - fo.pickle_dumper([pairwise_allelic_differences], output_file) - output_files[current_genome] = output_file - - return output_files + """Compute pairwise allelic differences and number of shared loci. + + Parameters + ---------- + indexes : list + List with the line index of the allelic profiles + that will be processed. + np_matrix : ndarray + Numpy array with dtype=int32 values for allelic profiles. + genome_ids : list + List with sample identifiers. + tmp_directory : str + Path to temporary directory where pickle files with + results will be stored. + + Returns + ------- + output_files : list + List with the paths to all pickle files that were created + to store results. + """ + # multiply one row per cycle to avoid memory overflow + # read only part of the matrix for huge files and process in chunks? 
+ output_files = {} + for i in indexes: + current_genome = genome_ids[i] + # get one row to perform pairwise comparisons against whole matrix + current_row = np_matrix[i:i+1, :] + # do not multiply against rows that were multiplied against + # matrix's rows in previous iterations + # combinations instead of permutations + permutation_rows = np_matrix[i:, :] + + # multiply 1D-array per whole matrix + # all non-shared loci will be converted to 0 + # values different than 0 correspond to shared loci + multiplied = current_row * permutation_rows + # count number of shared loci, non-zero values + # pairwise_shared_loci = np.count_nonzero(multiplied, axis=-1) + + # subtraction will lead to values different than 0 for loci that have different alleles + # multiplying ensures that we only keep results for shared loci and not for + # loci that are not shared and that had value different than 0 from subtraction + pairwise_allelic_differences = np.count_nonzero(multiplied * (current_row - permutation_rows), axis=-1) + + output_file = os.path.join(tmp_directory, current_genome) + # fo.pickle_dumper([pairwise_shared_loci, pairwise_allelic_differences], output_file) + fo.pickle_dumper([pairwise_allelic_differences], output_file) + output_files[current_genome] = output_file + + return output_files def get_sample_ids(input_file, delimiter='\t'): - r"""Extract the sample identifiers from a matrix with allelic profiles. + r"""Extract the sample identifiers from a matrix with allelic profiles. - Parameters - ---------- - input_file : str - Path to the input file that contains a matrix - with allelic profiles. - delimiter : str, optional - Field delimiter. The default is '\t'. + Parameters + ---------- + input_file : str + Path to the input file that contains a matrix + with allelic profiles. + delimiter : str, optional + Field delimiter. The default is '\t'. - Returns - ------- - sample_ids : list - List with the sample identifiers. - """ - with open(input_file, 'r') as infile: - reader = csv.reader(infile, delimiter=delimiter) - sample_ids = [line[0] for line in reader][1:] + Returns + ------- + sample_ids : list + List with the sample identifiers. + """ + with open(input_file, 'r') as infile: + reader = csv.reader(infile, delimiter=delimiter) + sample_ids = [line[0] for line in reader][1:] - return sample_ids + return sample_ids # def write_matrices(pickled_results, genome_ids, output_pairwise, # output_p, col_ids): def write_matrices(pickled_results, genome_ids, output_pairwise, col_ids): - """Write above diagonal matrices with allelic differences and shared loci. - - Parameters - ---------- - pickled_results : dict - Dictionary with sample identifiers as keys - and paths to binary files with pickled results - as values. - genome_ids : list - List with sample identifiers. - output_pairwise : str - Path to the output file to which the matrix - with pairwise allelic differences will be saved. - output_p : str - Path to the output file to which the matrix - with pairwise shared loci will be saved. - col_ids: list - List with sample identifiers to add as headers. - - Returns - ------- - None. 
- """ - # sl_lines = [col_ids] - ad_lines = [col_ids] - limit = 300 - for g in genome_ids: - current_file = pickled_results[g] - # load data - data = fo.pickle_loader(current_file) - - # shared_loci = list(data[0]) - # shared_loci = list(map(str, shared_loci)) - allele_diffs = list(data[0]) - allele_diffs = list(map(str, allele_diffs)) - - padding = [''] * (len(genome_ids)-len(allele_diffs)) - - # sl_line = [g] + padding + shared_loci - # sl_lines.append(sl_line) - ad_line = [g] + padding + allele_diffs - ad_lines.append(ad_line) - - # if len(sl_lines) >= limit or g == genome_ids[-1]: - if len(ad_lines) >= limit or g == genome_ids[-1]: - ad_lines = [im.join_list(line, '\t') for line in ad_lines] - fo.write_lines(ad_lines, output_pairwise, joiner='\n', write_mode='a') - ad_lines = [] - # write_lines(sl_lines, output_p, mode='a') - # sl_lines = [] - - return True + """Write above diagonal matrices with allelic differences and shared loci. + + Parameters + ---------- + pickled_results : dict + Dictionary with sample identifiers as keys + and paths to binary files with pickled results + as values. + genome_ids : list + List with sample identifiers. + output_pairwise : str + Path to the output file to which the matrix + with pairwise allelic differences will be saved. + output_p : str + Path to the output file to which the matrix + with pairwise shared loci will be saved. + col_ids: list + List with sample identifiers to add as headers. + + Returns + ------- + None. + """ + # sl_lines = [col_ids] + ad_lines = [col_ids] + limit = 300 + for g in genome_ids: + current_file = pickled_results[g] + # load data + data = fo.pickle_loader(current_file) + + # shared_loci = list(data[0]) + # shared_loci = list(map(str, shared_loci)) + allele_diffs = list(data[0]) + allele_diffs = list(map(str, allele_diffs)) + + padding = [''] * (len(genome_ids)-len(allele_diffs)) + + # sl_line = [g] + padding + shared_loci + # sl_lines.append(sl_line) + ad_line = [g] + padding + allele_diffs + ad_lines.append(ad_line) + + # if len(sl_lines) >= limit or g == genome_ids[-1]: + if len(ad_lines) >= limit or g == genome_ids[-1]: + ad_lines = [im.join_list(line, '\t') for line in ad_lines] + fo.write_lines(ad_lines, output_pairwise, joiner='\n', write_mode='a') + ad_lines = [] + # write_lines(sl_lines, output_p, mode='a') + # sl_lines = [] + + return True def transpose_matrix(input_file, output_directory): - """Transpose lines in a TSV file. - - Parameters - ---------- - input_file : str - Path to the input TSV file. - output_directory : str - Path to the directory to which intermediate files - with transposed lines will be written. - - Returns - ------- - output_transpose : str - Path to the file with the transposed matrix. - This file is created by concatenating all - files saved into `output_directory`. 
- """ - file_id = 1 - transpose_files = [] - input_basename = os.path.basename(input_file) - with open(input_file, 'r') as infile: - # get columns names - columns = [e.strip() for e in (infile.__next__()).split('\t')] - # divide into smaller sets to avoid loading huge files - num_col_sets = math.ceil(len(columns)/500) - col_sets = im.divide_list_into_n_chunks(columns, num_col_sets) - # use Pandas to read columns sets and save transpose - for c in col_sets: - # dtype=str or Pandas converts values into floats - df = pd.read_csv(input_file, usecols=c, delimiter='\t', dtype=str) - output_basename = input_basename.replace('.tsv', '_{0}.tsv'.format(file_id)) - output_file = os.path.join(output_directory, output_basename) - # transpose columns - df = df.T - # do not save header that contains row indexes - df.to_csv(output_file, sep='\t', header=False) - transpose_files.append(output_file) - file_id += 1 - - # concatenate all files with transposed lines - output_transpose = input_file.replace('.tsv', '_transpose.tsv') - fo.concatenate_files(transpose_files, output_transpose) - - return output_transpose + """Transpose lines in a TSV file. + + Parameters + ---------- + input_file : str + Path to the input TSV file. + output_directory : str + Path to the directory to which intermediate files + with transposed lines will be written. + + Returns + ------- + output_transpose : str + Path to the file with the transposed matrix. + This file is created by concatenating all + files saved into `output_directory`. + """ + file_id = 1 + transpose_files = [] + input_basename = os.path.basename(input_file) + with open(input_file, 'r') as infile: + # get columns names + columns = [e.strip() for e in (infile.__next__()).split('\t')] + # divide into smaller sets to avoid loading huge files + num_col_sets = math.ceil(len(columns)/500) + col_sets = im.divide_list_into_n_chunks(columns, num_col_sets) + # use Pandas to read columns sets and save transpose + for c in col_sets: + # dtype=str or Pandas converts values into floats + df = pd.read_csv(input_file, usecols=c, delimiter='\t', dtype=str) + output_basename = input_basename.replace('.tsv', '_{0}.tsv'.format(file_id)) + output_file = os.path.join(output_directory, output_basename) + # transpose columns + df = df.T + # do not save header that contains row indexes + df.to_csv(output_file, sep='\t', header=False) + transpose_files.append(output_file) + file_id += 1 + + # concatenate all files with transposed lines + output_transpose = input_file.replace('.tsv', '_transpose.tsv') + fo.concatenate_files(transpose_files, output_transpose) + + return output_transpose def merge_triangular_matrices(upper_matrix, lower_matrix, output_file, matrix_size): - """Merge two triangular matrices to create a symmetric matrix. - - Parameters - ---------- - upper_matrix : str - Path to the TSV file that contains the upper - triangular matrix. - lower_matrix : str - Path to the TSV file that contains the lower - triangular matrix. - output_file : str - Path to the output file to which the symmetric - matrix will be saved. - matrix_size : int - Total number of lines in the triangular matrix. - - Returns - ------- - None. 
- """ - with open(upper_matrix, 'r') as upper_handle, open(lower_matrix, 'r') as lower_handle: - upper_reader = csv.reader(upper_handle, delimiter='\t') - lower_reader = csv.reader(lower_handle, delimiter='\t') - - merged_lines = [] - for i in range(matrix_size): - upper_line = upper_reader.__next__() - lower_line = lower_reader.__next__() - merged_line = [e - if e != '' - else lower_line[i] - for i, e in enumerate(upper_line)] - merged_lines.append(merged_line) - - if len(merged_lines) >= 200 or i == (matrix_size-1): - merged_lines = [im.join_list(line, '\t') for line in merged_lines] - fo.write_lines(merged_lines, output_file, joiner='\n', write_mode='a') - merged_lines = [] + """Merge two triangular matrices to create a symmetric matrix. + + Parameters + ---------- + upper_matrix : str + Path to the TSV file that contains the upper + triangular matrix. + lower_matrix : str + Path to the TSV file that contains the lower + triangular matrix. + output_file : str + Path to the output file to which the symmetric + matrix will be saved. + matrix_size : int + Total number of lines in the triangular matrix. + + Returns + ------- + None. + """ + with open(upper_matrix, 'r') as upper_handle, open(lower_matrix, 'r') as lower_handle: + upper_reader = csv.reader(upper_handle, delimiter='\t') + lower_reader = csv.reader(lower_handle, delimiter='\t') + + merged_lines = [] + for i in range(matrix_size): + upper_line = upper_reader.__next__() + lower_line = lower_reader.__next__() + merged_line = [e + if e != '' + else lower_line[i] + for i, e in enumerate(upper_line)] + merged_lines.append(merged_line) + + if len(merged_lines) >= 200 or i == (matrix_size-1): + merged_lines = [im.join_list(line, '\t') for line in merged_lines] + fo.write_lines(merged_lines, output_file, joiner='\n', write_mode='a') + merged_lines = [] def symmetrify_matrix(input_matrix, matrix_size, tmp_directory): - """Symmetrify a triangular matrix. + """Symmetrify a triangular matrix. - Parameters - ---------- - input_matrix : str - Path to TSV file that contains the triangular matrix. - matrix_size : int - Total number of lines in input file. - tmp_directory : str - Path to the output temporary directory. + Parameters + ---------- + input_matrix : str + Path to TSV file that contains the triangular matrix. + matrix_size : int + Total number of lines in input file. + tmp_directory : str + Path to the output temporary directory. - Returns - ------- - symmetric_output : str - Path to the output file that contains the symmetric - matrix. - """ - output_transpose = transpose_matrix(input_matrix, tmp_directory) + Returns + ------- + symmetric_output : str + Path to the output file that contains the symmetric + matrix. 
+ """ + output_transpose = transpose_matrix(input_matrix, tmp_directory) - # merge upper and lower diagonal matrices into symmetric matrix - symmetric_output = input_matrix.replace('.tsv', '_symmetric.tsv') + # merge upper and lower diagonal matrices into symmetric matrix + symmetric_output = input_matrix.replace('.tsv', '_symmetric.tsv') - merge_triangular_matrices(input_matrix, output_transpose, - symmetric_output, matrix_size) + merge_triangular_matrices(input_matrix, output_transpose, + symmetric_output, matrix_size) - # delete files with triangular matrices - os.remove(input_matrix) - os.remove(output_transpose) + # delete files with triangular matrices + os.remove(input_matrix) + os.remove(output_transpose) - return symmetric_output + return symmetric_output def main(input_matrix, output_directory, cpu_cores, symmetric, masked): - """Compute a distance matrix based on allelic profiles. - - Parameters - ---------- - input_matrix : str - Path to a TSV file with allelic profiles determined by - the AlleleCall module. - output_directory : str - Path to the output directory. - cpu_cores : int - Number of CPU cores used to compute distances. - symmetric : bool - Determine a symmetric pairwise distance matrix, instead - of a triangular matrix. - masked : bool - False if the input matrix values are masked, True otherwise. - The process will mask the matrix values it this value is False. - - Returns - ------- - output_pairwise : str - Path to the TSV file that contains the distance matrix. - """ - # Create output directory if it does not exist - if os.path.isdir(output_directory) is False: - os.mkdir(output_directory) - - # determine input basename - input_basename = os.path.basename(input_matrix) - # remove extension that is after last '.' - input_basename = '.'.join(input_basename.split('.')[0:-1]) - - output_masked = os.path.join(output_directory, ct.MASKED_PROFILES_BASENAME) - if masked is False: - print('Masking profile matrix...', end='') - profiles_matrix = pd.read_csv(input_matrix, - header=0, index_col=0, - sep='\t', low_memory=False) - masked_profiles = profiles_matrix.apply(im.replace_chars) - masked_profiles.to_csv(output_masked, sep='\t') - print('masked matrix available at {0}'.format(output_masked)) - else: - output_masked = input_matrix - - # create temp directory to store pairwise distances per genome - tmp_directory = os.path.join(output_directory, 'temp', 'pairwise_distances') - if os.path.isdir(tmp_directory) is False: - os.mkdir(tmp_directory) - - # get sample identifiers - genome_ids = get_sample_ids(input_matrix, delimiter='\t') - total_genomes = len(genome_ids) - - np_matrix = tsv_to_nparray(output_masked) - - rows_indexes = [i for i in range(len(np_matrix))] - random.shuffle(rows_indexes) - # divide inputs into 20 lists for 5% progress resolution - parallel_inputs = im.divide_list_into_n_chunks(rows_indexes, 20) - - common_args = [[l, np_matrix, genome_ids, tmp_directory, compute_distances] - for l in parallel_inputs] - - # increasing cpu cores can greatly increase memory usage - print('Computing pairwise distances...') - results = mo.map_async_parallelizer(common_args, - mo.function_helper, - cpu_cores, - show_progress=True) - - merged = im.merge_dictionaries(results) - - print('\nCreating distance matrix...', end='') - # create files with headers - col_ids = ['FILE'] + genome_ids - output_pairwise = os.path.join(output_directory, ct.DISTANCE_MATRIX_BASENAME) - # output_p = os.path.join(output_directory, - # '{0}_shared_loci.tsv'.format(input_basename)) - - # Import 
arrays per genome and save to matrix file - # results = write_matrices(merged, genome_ids, output_pairwise, output_p, col_ids) - results = write_matrices(merged, genome_ids, output_pairwise, col_ids) - - if symmetric is True: - # add 1 to include header - output_pairwise = symmetrify_matrix(output_pairwise, - len(genome_ids)+1, - tmp_directory) - # output_p = symmetrify_matrix(output_p, - # len(genome_ids)+1, - # tmp_directory) - - print('done.') - - return [output_pairwise] + """Compute a distance matrix based on allelic profiles. + + Parameters + ---------- + input_matrix : str + Path to a TSV file with allelic profiles determined by + the AlleleCall module. + output_directory : str + Path to the output directory. + cpu_cores : int + Number of CPU cores used to compute distances. + symmetric : bool + Determine a symmetric pairwise distance matrix, instead + of a triangular matrix. + masked : bool + False if the input matrix values are masked, True otherwise. + The process will mask the matrix values it this value is False. + + Returns + ------- + output_pairwise : str + Path to the TSV file that contains the distance matrix. + """ + # Create output directory if it does not exist + if os.path.isdir(output_directory) is False: + os.mkdir(output_directory) + + # determine input basename + input_basename = os.path.basename(input_matrix) + # remove extension that is after last '.' + input_basename = '.'.join(input_basename.split('.')[0:-1]) + + output_masked = os.path.join(output_directory, ct.MASKED_PROFILES_BASENAME) + if masked is False: + print('Masking profile matrix...', end='') + profiles_matrix = pd.read_csv(input_matrix, + header=0, index_col=0, + sep='\t', low_memory=False) + masked_profiles = profiles_matrix.apply(im.replace_chars) + masked_profiles.to_csv(output_masked, sep='\t') + print('masked matrix available at {0}'.format(output_masked)) + else: + output_masked = input_matrix + + # create temp directory to store pairwise distances per genome + tmp_directory = os.path.join(output_directory, 'temp', 'pairwise_distances') + if os.path.isdir(tmp_directory) is False: + os.mkdir(tmp_directory) + + # get sample identifiers + genome_ids = get_sample_ids(input_matrix, delimiter='\t') + total_genomes = len(genome_ids) + + np_matrix = tsv_to_nparray(output_masked) + + rows_indexes = [i for i in range(len(np_matrix))] + random.shuffle(rows_indexes) + # divide inputs into 20 lists for 5% progress resolution + parallel_inputs = im.divide_list_into_n_chunks(rows_indexes, 20) + + common_args = [[l, np_matrix, genome_ids, tmp_directory, compute_distances] + for l in parallel_inputs] + + # increasing cpu cores can greatly increase memory usage + print('Computing pairwise distances...') + results = mo.map_async_parallelizer(common_args, + mo.function_helper, + cpu_cores, + show_progress=True) + + merged = im.merge_dictionaries(results) + + print('\nCreating distance matrix...', end='') + # create files with headers + col_ids = ['FILE'] + genome_ids + output_pairwise = os.path.join(output_directory, ct.DISTANCE_MATRIX_BASENAME) + # output_p = os.path.join(output_directory, + # '{0}_shared_loci.tsv'.format(input_basename)) + + # Import arrays per genome and save to matrix file + # results = write_matrices(merged, genome_ids, output_pairwise, output_p, col_ids) + results = write_matrices(merged, genome_ids, output_pairwise, col_ids) + + if symmetric is True: + # add 1 to include header + output_pairwise = symmetrify_matrix(output_pairwise, + len(genome_ids)+1, + tmp_directory) + # output_p = 
symmetrify_matrix(output_p, + # len(genome_ids)+1, + # tmp_directory) + + print('done.') + + return [output_pairwise] diff --git a/CHEWBBACA/utils/fasttree_wrapper.py b/CHEWBBACA/utils/fasttree_wrapper.py index 53fc12dc..ba30c50f 100644 --- a/CHEWBBACA/utils/fasttree_wrapper.py +++ b/CHEWBBACA/utils/fasttree_wrapper.py @@ -16,22 +16,22 @@ def call_fasttree(alignment_file, tree_file): - """Compute a phylogenetic tree based on MSA data. - - Parameters - ---------- - alignment_file : str - Path to a file with a MSA. - tree_file : str - Path to the output tree file in Newick format. - """ - proc = subprocess.Popen(['FastTree', '-fastest', '-nosupport', - '-noml', '-out', tree_file, alignment_file], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - - # Read the stdout from FastTree - stdout = proc.stdout.readlines() - stderr = proc.stderr.readlines() - - return [stdout, stderr] + """Compute a phylogenetic tree based on MSA data. + + Parameters + ---------- + alignment_file : str + Path to a file with a MSA. + tree_file : str + Path to the output tree file in Newick format. + """ + proc = subprocess.Popen(['FastTree', '-fastest', '-nosupport', + '-noml', '-out', tree_file, alignment_file], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + # Read the stdout from FastTree + stdout = proc.stdout.readlines() + stderr = proc.stderr.readlines() + + return [stdout, stderr] diff --git a/CHEWBBACA/utils/file_operations.py b/CHEWBBACA/utils/file_operations.py index 1cc418b2..f327f466 100644 --- a/CHEWBBACA/utils/file_operations.py +++ b/CHEWBBACA/utils/file_operations.py @@ -33,665 +33,665 @@ import pandas as pd try: - from utils import iterables_manipulation as im + from utils import iterables_manipulation as im except ModuleNotFoundError: - from CHEWBBACA.utils import iterables_manipulation as im + from CHEWBBACA.utils import iterables_manipulation as im def file_basename(file_path, file_extension=True): - """Extract file basename from a path. + """Extract file basename from a path. - Parameters - ---------- - file_path : str - Path to the file. - file_extension : bool - Specify if the basename should include the file - extension. + Parameters + ---------- + file_path : str + Path to the file. + file_extension : bool + Specify if the basename should include the file + extension. - Returns - ------- - basename : str - File basename extracted from input path. - """ - basename = os.path.basename(file_path) + Returns + ------- + basename : str + File basename extracted from input path. + """ + basename = os.path.basename(file_path) - if file_extension is False: - # Remove suffix after last '.' - basename = str((pathlib.Path(basename)).with_suffix('')) + if file_extension is False: + # Remove suffix after last '.' + basename = str((pathlib.Path(basename)).with_suffix('')) - return basename + return basename def split_joiner(string, indices, delimiter): - """ - """ - split_string = string.split(delimiter) - joined_string = delimiter.join([split_string[i] for i in indices]) + """ + """ + split_string = string.split(delimiter) + joined_string = delimiter.join([split_string[i] for i in indices]) - return joined_string + return joined_string def remove_files(files): - """Delete a list of files. + """Delete a list of files. - Parameters - ---------- - files : list - List with paths to the files to be deleted. - """ - for f in files: - if os.path.isfile(f) is True: - os.remove(f) + Parameters + ---------- + files : list + List with paths to the files to be deleted. 
+ """ + for f in files: + if os.path.isfile(f) is True: + os.remove(f) def hash_file(file, hash_object, buffer_size=65536): - """Compute hash based on the contents of a file. - - Parameters - ---------- - file : str - Path to a file. - hash_object : _hashlib.HASH - Hashlib object to update based on file - contents. - buffer_size : int - Buffer size (amount of data, in KB, to - read from file). - - Returns - ------- - hash_str : str - Hash computed from file contents. - """ - updated_hash = hash_object - - with open(file, 'rb') as f: - # read file in chunks - while True: - data = f.read(buffer_size) - if not data: - break - updated_hash.update(data) - - hash_str = updated_hash.hexdigest() - - return hash_str + """Compute hash based on the contents of a file. + + Parameters + ---------- + file : str + Path to a file. + hash_object : _hashlib.HASH + Hashlib object to update based on file + contents. + buffer_size : int + Buffer size (amount of data, in KB, to + read from file). + + Returns + ------- + hash_str : str + Hash computed from file contents. + """ + updated_hash = hash_object + + with open(file, 'rb') as f: + # read file in chunks + while True: + data = f.read(buffer_size) + if not data: + break + updated_hash.update(data) + + hash_str = updated_hash.hexdigest() + + return hash_str def filter_by_substring(files, suffixes, reverse=False): - """Filter files names based on a list of suffixes. + """Filter files names based on a list of suffixes. - Parameters - ---------- - files : list - A list with filenames or file paths. - suffixes : list - List with suffixes. - reverse : bool - False to select files that contain any of the suffixes. - True to select files that do not contain any of the - suffixes. + Parameters + ---------- + files : list + A list with filenames or file paths. + suffixes : list + List with suffixes. + reverse : bool + False to select files that contain any of the suffixes. + True to select files that do not contain any of the + suffixes. - Returns - ------- - filtered : list - List with files that passed filtering. - """ - filtered = [file for file in files - if any([suffix in file for suffix in suffixes])] + Returns + ------- + filtered : list + List with files that passed filtering. + """ + filtered = [file for file in files + if any([suffix in file for suffix in suffixes])] - if reverse is True: - filtered = list(set(files)-set(filtered)) + if reverse is True: + filtered = list(set(files)-set(filtered)) - return filtered + return filtered def filter_by_extension(files, extensions): - """Filter files based on file extension. - - Parameters - ---------- - files : list - A list with filenames or file paths. - extensions : list - List with file extensions. - - Returns - ------- - has_extension : list - List with files that end with one of the provided - file extensions. - no_extension : list - List with files that do not end with one of the provided - file extensions. - """ - has_extension = [file for file in files - if any([file.endswith(ext) for ext in extensions])] - no_extension = list(set(files)-set(has_extension)) - - return has_extension, no_extension + """Filter files based on file extension. + + Parameters + ---------- + files : list + A list with filenames or file paths. + extensions : list + List with file extensions. + + Returns + ------- + has_extension : list + List with files that end with one of the provided + file extensions. + no_extension : list + List with files that do not end with one of the provided + file extensions. 
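# Standalone illustration of the buffered hashing pattern used by hash_file
# above: the file is read in fixed-size chunks (in bytes) so large files do
# not have to be loaded into memory at once. The file path is a placeholder.
import hashlib


def sha256_of_file(path, buffer_size=65536):
	hasher = hashlib.sha256()
	with open(path, 'rb') as handle:
		while True:
			chunk = handle.read(buffer_size)
			if not chunk:
				break
			hasher.update(chunk)
	return hasher.hexdigest()


# print(sha256_of_file('schema_seed/locus1.fasta'))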
+ """ + has_extension = [file for file in files + if any([file.endswith(ext) for ext in extensions])] + no_extension = list(set(files)-set(has_extension)) + + return has_extension, no_extension def create_directory(directory_path): - """Create a diretory if it does not exist.""" - if not os.path.exists(directory_path): - os.makedirs(directory_path) - return True - else: - return False + """Create a diretory if it does not exist.""" + if not os.path.exists(directory_path): + os.makedirs(directory_path) + return True + else: + return False def join_paths(parent_path, child_paths): - """Create path by joining a parent directory and a list of child paths.""" - joined_paths = os.path.join(parent_path, *child_paths) + """Create path by joining a parent directory and a list of child paths.""" + joined_paths = os.path.join(parent_path, *child_paths) - return joined_paths + return joined_paths def listdir_fullpath(directory_path, substring_filter=False): - """Get the full path for all files in a directory. + """Get the full path for all files in a directory. - Parameters - ---------- - directory_path : str - Path to a directory. - substring_filter : str - Only list files that contain this substring. + Parameters + ---------- + directory_path : str + Path to a directory. + substring_filter : str + Only list files that contain this substring. - Returns - ------- - file_list : list - List containing the full path for every selected - file in the input directory. - """ - file_list = [os.path.join(directory_path, file) - for file in os.listdir(directory_path)] + Returns + ------- + file_list : list + List containing the full path for every selected + file in the input directory. + """ + file_list = [os.path.join(directory_path, file) + for file in os.listdir(directory_path)] - if substring_filter is not False: - file_list = filter_by_substring(file_list, [substring_filter]) + if substring_filter is not False: + file_list = filter_by_substring(file_list, [substring_filter]) - return file_list + return file_list def delete_directory(directory_path, max_retries=5): - """Delete a directory.""" - for i in range(max_retries): - # might fail to delete files due to permission issues - try: - shutil.rmtree(directory_path) - # path does not exist - # FileNotFoundError is a subclass of OSError - # the latter will catch the former if checked first - except FileNotFoundError: - break - # sleep before retry - except OSError: - time.sleep(i) - - # check if directory still exists - exists = os.path.isdir(directory_path) - if exists is True: - print('Could not remove {0}'.format(directory_path)) - - return exists + """Delete a directory.""" + for i in range(max_retries): + # might fail to delete files due to permission issues + try: + shutil.rmtree(directory_path) + # path does not exist + # FileNotFoundError is a subclass of OSError + # the latter will catch the former if checked first + except FileNotFoundError: + break + # sleep before retry + except OSError: + time.sleep(i) + + # check if directory still exists + exists = os.path.isdir(directory_path) + if exists is True: + print('Could not remove {0}'.format(directory_path)) + + return exists def read_file(input_file): - """Read a text file. + """Read a text file. - Parameters - ---------- - input_file : str - Path to the file to read. + Parameters + ---------- + input_file : str + Path to the file to read. - Returns - ------- - file_content : str - Single string with the file content. 
- """ - with open(input_file, 'r') as infile: - file_content = infile.read() + Returns + ------- + file_content : str + Single string with the file content. + """ + with open(input_file, 'r') as infile: + file_content = infile.read() - return file_content + return file_content def read_lines(input_file, strip=True, num_lines=None): - """Read lines in a file. + """Read lines in a file. - Parameters - ---------- - input_file : str - Path to the input file. - strip : bool - Specify if lines should be stripped of leading - and trailing white spaces and new line characters. + Parameters + ---------- + input_file : str + Path to the input file. + strip : bool + Specify if lines should be stripped of leading + and trailing white spaces and new line characters. - Returns - ------- - lines : list - List with the lines read from the input file. - """ - with open(input_file, 'r') as infile: - if num_lines is None: - lines = [line for line in infile.readlines()] - else: - lines = list(islice(infile, num_lines)) + Returns + ------- + lines : list + List with the lines read from the input file. + """ + with open(input_file, 'r') as infile: + if num_lines is None: + lines = [line for line in infile.readlines()] + else: + lines = list(islice(infile, num_lines)) - if strip is True: - lines = [line.strip() for line in lines] + if strip is True: + lines = [line.strip() for line in lines] - return lines + return lines def pickle_dumper(content, output_file): - """Use the Pickle module to serialize an object. + """Use the Pickle module to serialize an object. - Parameters - ---------- - content : type - Variable that refers to the object that will - be serialized and written to the output file. - output_file : str - Path to the output file. - """ - with open(output_file, 'wb') as poutfile: - pickle.dump(content, poutfile) + Parameters + ---------- + content : type + Variable that refers to the object that will + be serialized and written to the output file. + output_file : str + Path to the output file. + """ + with open(output_file, 'wb') as poutfile: + pickle.dump(content, poutfile) def pickle_loader(input_file): - """Use the Pickle module to de-serialize an object. + """Use the Pickle module to de-serialize an object. - Parameters - ---------- - input_file : str - Path to file with byte stream to be de-serialized. + Parameters + ---------- + input_file : str + Path to file with byte stream to be de-serialized. - Returns - ------- - content : type - Variable that refers to the de-serialized - object. - """ - with open(input_file, 'rb') as pinfile: - content = pickle.load(pinfile) + Returns + ------- + content : type + Variable that refers to the de-serialized + object. + """ + with open(input_file, 'rb') as pinfile: + content = pickle.load(pinfile) - return content + return content def file_zipper(input_file, zip_file): - """Zip (compresses) a file. + """Zip (compresses) a file. - Parameters - ---------- - input_file : str - Path to the file that will be compressed. - zip_file : str - Path to the ZIP file that will be created. + Parameters + ---------- + input_file : str + Path to the file that will be compressed. + zip_file : str + Path to the ZIP file that will be created. - Returns - ------- - zip_file : str - Path to the ZIP file that was created by - compressing the input file. 
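# Quick round-trip sketch for the pickle helpers above (pickle_dumper and
# pickle_loader): serialize an object to a binary file and read it back.
# The output path is a temporary file created only for this example.
import pickle
import tempfile

with tempfile.NamedTemporaryFile(delete=False) as tmp:
	pickle_path = tmp.name

coordinates = {'contig1': [(10, 250, 1)], 'contig2': [(5, 95, -1)]}
with open(pickle_path, 'wb') as poutfile:
	pickle.dump(coordinates, poutfile)

with open(pickle_path, 'rb') as pinfile:
	restored = pickle.load(pinfile)

assert restored == coordinates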
- """ - with zipfile.ZipFile(zip_file, 'w', compression=zipfile.ZIP_DEFLATED) as zf: - zf.write(input_file, os.path.basename(input_file)) + Returns + ------- + zip_file : str + Path to the ZIP file that was created by + compressing the input file. + """ + with zipfile.ZipFile(zip_file, 'w', compression=zipfile.ZIP_DEFLATED) as zf: + zf.write(input_file, os.path.basename(input_file)) - return zip_file + return zip_file def unzip_file(compressed_file, archive_type='.gz'): - """Uncompresse a file. + """Uncompresse a file. - Parameters - ---------- - compressed_file : str - Path to the compressed file. - archive_type : str - Archive format. + Parameters + ---------- + compressed_file : str + Path to the compressed file. + archive_type : str + Archive format. - Returns - ------- - uncompressed_file : str - Path to the uncompressed file. - """ - lines = [] - with gzip.open(compressed_file, 'rb') as f: - for line in f: - lines.append(line.decode()) + Returns + ------- + uncompressed_file : str + Path to the uncompressed file. + """ + lines = [] + with gzip.open(compressed_file, 'rb') as f: + for line in f: + lines.append(line.decode()) - # save uncompressed contents - uncompressed_file = compressed_file.rstrip('.gz') - write_lines(lines, uncompressed_file, joiner='') + # save uncompressed contents + uncompressed_file = compressed_file.rstrip('.gz') + write_lines(lines, uncompressed_file, joiner='') - return uncompressed_file + return uncompressed_file def download_file(file_url, outfile, max_tries=3): - """Download a file. - - Parameters - ---------- - file_url : str - URL to download file. - outfile : str - Path to the downloaded file. - max_tries : int - Maximum number of retries if the download - fails. - """ - tries = 0 - downloaded = False - while downloaded is False and tries <= max_tries: - try: - res = urllib.request.urlretrieve(file_url, outfile) - if os.path.isfile(outfile) is True: - downloaded = True - except Exception as e: - print(e) - time.sleep(1) - tries += 1 - - return downloaded + """Download a file. + + Parameters + ---------- + file_url : str + URL to download file. + outfile : str + Path to the downloaded file. + max_tries : int + Maximum number of retries if the download + fails. + """ + tries = 0 + downloaded = False + while downloaded is False and tries <= max_tries: + try: + res = urllib.request.urlretrieve(file_url, outfile) + if os.path.isfile(outfile) is True: + downloaded = True + except Exception as e: + print(e) + time.sleep(1) + tries += 1 + + return downloaded def concatenate_files(files, output_file, header=None): - """Concatenate files. - - Parameters - ---------- - files : list - List with the paths to the files to concatenate. - output_file : str - Path to the output file that will store the concatenation - of input files. - header : str or NoneType - Specify a header that should be written as the first line - in the output file. - - Returns - ------- - output_file : str - Path to the output file that was created with - the concatenation of input files. - """ - with open(output_file, 'w') as outfile: - if header is not None: - outfile.write(header) - for file in files: - with open(file, 'r') as infile: - shutil.copyfileobj(infile, outfile) - - return output_file + """Concatenate files. + + Parameters + ---------- + files : list + List with the paths to the files to concatenate. + output_file : str + Path to the output file that will store the concatenation + of input files. 
+ header : str or NoneType + Specify a header that should be written as the first line + in the output file. + + Returns + ------- + output_file : str + Path to the output file that was created with + the concatenation of input files. + """ + with open(output_file, 'w') as outfile: + if header is not None: + outfile.write(header) + for file in files: + with open(file, 'r') as infile: + shutil.copyfileobj(infile, outfile) + + return output_file def write_to_file(text, output_file, write_mode, end_char): - """Write a single string to a file. - - Parameters - ---------- - text : str - A single string to write to the output file. - output_file : str - Path to the output file. - write_mode : str - Specify write mode ('w' creates file if it does not - exist and truncates and over-writes existing file, - 'a' creates file if it does not exist and appends to - the end of file if it exists.). - end_char : str - Character added to the end of the file. - """ - with open(output_file, write_mode) as out: - out.write(text+end_char) + """Write a single string to a file. + + Parameters + ---------- + text : str + A single string to write to the output file. + output_file : str + Path to the output file. + write_mode : str + Specify write mode ('w' creates file if it does not + exist and truncates and over-writes existing file, + 'a' creates file if it does not exist and appends to + the end of file if it exists.). + end_char : str + Character added to the end of the file. + """ + with open(output_file, write_mode) as out: + out.write(text+end_char) def write_lines(lines, output_file, joiner='\n', write_mode='w'): - """Write a list of strings to a file. - - Parameters - ---------- - lines : list - List with the lines/strings to write to the output - file. - output_file : str - Path to the output file. - joiner : str - Character used to join lines. - write_mode : str - Specify write mode ('w' creates file if it does not - exist and truncates and over-writes existing file, - 'a' creates file if it does not exist and appends to - the end of file if it exists). - """ - joined_lines = im.join_list(lines, joiner) - - write_to_file(joined_lines, output_file, write_mode, '\n') + """Write a list of strings to a file. + + Parameters + ---------- + lines : list + List with the lines/strings to write to the output + file. + output_file : str + Path to the output file. + joiner : str + Character used to join lines. + write_mode : str + Specify write mode ('w' creates file if it does not + exist and truncates and over-writes existing file, + 'a' creates file if it does not exist and appends to + the end of file if it exists). + """ + joined_lines = im.join_list(lines, joiner) + + write_to_file(joined_lines, output_file, write_mode, '\n') def read_tabular(input_file, delimiter='\t'): - """Read a tabular (TSV) file. + """Read a tabular (TSV) file. - Parameters - ---------- - input_file : str - Path to a tabular file. - delimiter : str - Delimiter used to separate file fields. + Parameters + ---------- + input_file : str + Path to a tabular file. + delimiter : str + Delimiter used to separate file fields. - Returns - ------- - lines : list - A list with a sublist per line in the input file. - Each sublist has the fields that were separated by - the defined delimiter. - """ - with open(input_file, 'r') as infile: - reader = csv.reader(infile, delimiter=delimiter) - lines = [line for line in reader] + Returns + ------- + lines : list + A list with a sublist per line in the input file. 
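# Tiny end-to-end illustration of the writing/reading helpers above: join a
# list of rows with tabs, write them with a trailing newline (as write_lines
# and write_to_file do) and parse them back with csv.reader (as read_tabular
# does). The file name is a throwaway placeholder.
import csv

rows = [['FILE', 'locus1', 'locus2'],
	['genomeA', '1', '3'],
	['genomeB', '2', 'LNF']]
lines = ['\t'.join(row) for row in rows]

with open('toy_profiles.tsv', 'w') as out:
	out.write('\n'.join(lines) + '\n')

with open('toy_profiles.tsv', 'r') as infile:
	parsed = [line for line in csv.reader(infile, delimiter='\t')]

assert parsed == rows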
+ Each sublist has the fields that were separated by + the defined delimiter. + """ + with open(input_file, 'r') as infile: + reader = csv.reader(infile, delimiter=delimiter) + lines = [line for line in reader] - return lines + return lines def input_timeout(prompt, timeout=30): - """Add timeout feature when requesting user input. - - Parameters - ---------- - prompt : str - Message to print to stdout to request for user - input. - timeout : int - Maximum number of seconds that the process will - wait for input. - - Returns - ------- - String with user input. - - Raises - ------ - SystemExit - - If there is no user input before timeout. - """ - pool = ThreadPool(processes=1) - answer = pool.apply_async(input, args=[prompt]) - - try: - return answer.get(timeout=timeout) - except TimeoutError: - sys.exit('Timed out.') + """Add timeout feature when requesting user input. + + Parameters + ---------- + prompt : str + Message to print to stdout to request for user + input. + timeout : int + Maximum number of seconds that the process will + wait for input. + + Returns + ------- + String with user input. + + Raises + ------ + SystemExit + - If there is no user input before timeout. + """ + pool = ThreadPool(processes=1) + answer = pool.apply_async(input, args=[prompt]) + + try: + return answer.get(timeout=timeout) + except TimeoutError: + sys.exit('Timed out.') def is_file_empty(file_path): - """Check if file is empty by confirming if its size is 0 bytes.""" - # Check if file exist and it is empty - return os.path.exists(file_path) and os.stat(file_path).st_size == 0 + """Check if file is empty by confirming if its size is 0 bytes.""" + # Check if file exist and it is empty + return os.path.exists(file_path) and os.stat(file_path).st_size == 0 def is_file_empty_2(file_name): - """Check if file is empty by confirming if its size is 0 bytes.""" - # Check if file exist and it is empty - return os.path.isfile(file_name) and os.path.getsize(file_name) == 0 + """Check if file is empty by confirming if its size is 0 bytes.""" + # Check if file exist and it is empty + return os.path.isfile(file_name) and os.path.getsize(file_name) == 0 def is_file_empty_3(file_name): - """Check if file is empty by reading first character in it.""" - # open ile in read mode - with open(file_name, 'r') as read_obj: - # read first character - one_char = read_obj.read(1) - # if not fetched then file is empty - if not one_char: - return True - elif one_char == " ": - return True - return False + """Check if file is empty by reading first character in it.""" + # open ile in read mode + with open(file_name, 'r') as read_obj: + # read first character + one_char = read_obj.read(1) + # if not fetched then file is empty + if not one_char: + return True + elif one_char == " ": + return True + return False def create_short(schema_files, schema_dir): - """Create the 'short' directory for a chewBBACA schema. + """Create the 'short' directory for a chewBBACA schema. - Creates the 'short' directory inside a schema's main - directory and copies the FASTA files with the representative - alleles into the 'short' folder. + Creates the 'short' directory inside a schema's main + directory and copies the FASTA files with the representative + alleles into the 'short' folder. - Parameters - ---------- - schema_files : list - List with paths to FASTA files that contain the loci - representative alleles. - schema_dir : str - Path to the schema's directory. 
+ Parameters + ---------- + schema_files : list + List with paths to FASTA files that contain the loci + representative alleles. + schema_dir : str + Path to the schema's directory. - Returns - ------- - True on completion. - """ - short_path = join_paths(schema_dir, ['short']) - create_directory(short_path) + Returns + ------- + True on completion. + """ + short_path = join_paths(schema_dir, ['short']) + create_directory(short_path) - for file in schema_files: - locus_id = file_basename(file, False) - short_file = join_paths(short_path, [f'{locus_id}_short.fasta']) - shutil.copy(file, short_file) + for file in schema_files: + locus_id = file_basename(file, False) + short_file = join_paths(short_path, [f'{locus_id}_short.fasta']) + shutil.copy(file, short_file) - return True + return True def copy_file(source, destination): - """Copy a file to specified destination.""" - shutil.copy(source, destination) + """Copy a file to specified destination.""" + shutil.copy(source, destination) def move_file(source, destination): - """Move a file to specified destination.""" - shutil.move(source, destination) + """Move a file to specified destination.""" + shutil.move(source, destination) def matching_lines(input_file, pattern): - """Retrieve lines from a file that contain a pattern. + """Retrieve lines from a file that contain a pattern. - Parameters - ---------- - input_file : str - Path to input file. - pattern : str - Pattern to match. + Parameters + ---------- + input_file : str + Path to input file. + pattern : str + Pattern to match. - Returns - ------- - matched_lines : list - List with lines that contain the pattern. - """ - with open(input_file, 'r') as infile: - matched_lines = [line for line in infile if pattern in line] + Returns + ------- + matched_lines : list + List with lines that contain the pattern. + """ + with open(input_file, 'r') as infile: + matched_lines = [line for line in infile if pattern in line] - return matched_lines + return matched_lines def transpose_matrix(input_file, output_directory): - """Transpose a TSV file. - - Parameters - ---------- - input_file : str - Path to the input TSV file. - output_directory : str - Path to the directory to which intermediate files - and the complete transposed file will be written. - - Returns - ------- - transposed_file : str - Path to the file with the transposed matrix. - This file is created by concatenating all - intermediate files. - """ - intermediate_files = [] - with open(input_file, 'r') as infile: - # Get column identifiers - columns = [c.strip() for c in (infile.__next__()).split('\t')] - # Divide into smaller sets to avoid loading complete file - total_column_sets = math.ceil(len(columns)/100) - column_sets = im.divide_list_into_n_chunks(columns, total_column_sets) - # Use Pandas to read columns sets and save transpose - for i, c in enumerate(column_sets): - # dtype=str or Pandas converts values into floats - df = pd.read_csv(input_file, usecols=c, delimiter='\t', dtype=str) - output_file = join_paths(output_directory, ['chunk{0}.tsv'.format(i)]) - # Transpose columns - df = df.T - # Do not save header that contains row indexes - df.to_csv(output_file, sep='\t', header=False) - intermediate_files.append(output_file) - - # Concatenate all files with transposed lines - transposed_file = input_file.replace('.tsv', '_transpose.tsv') - concatenate_files(intermediate_files, transposed_file) - - # Delete intermediate files - remove_files(intermediate_files) - - return transposed_file + """Transpose a TSV file. 
+ + Parameters + ---------- + input_file : str + Path to the input TSV file. + output_directory : str + Path to the directory to which intermediate files + and the complete transposed file will be written. + + Returns + ------- + transposed_file : str + Path to the file with the transposed matrix. + This file is created by concatenating all + intermediate files. + """ + intermediate_files = [] + with open(input_file, 'r') as infile: + # Get column identifiers + columns = [c.strip() for c in (infile.__next__()).split('\t')] + # Divide into smaller sets to avoid loading complete file + total_column_sets = math.ceil(len(columns)/100) + column_sets = im.divide_list_into_n_chunks(columns, total_column_sets) + # Use Pandas to read columns sets and save transpose + for i, c in enumerate(column_sets): + # dtype=str or Pandas converts values into floats + df = pd.read_csv(input_file, usecols=c, delimiter='\t', dtype=str) + output_file = join_paths(output_directory, ['chunk{0}.tsv'.format(i)]) + # Transpose columns + df = df.T + # Do not save header that contains row indexes + df.to_csv(output_file, sep='\t', header=False) + intermediate_files.append(output_file) + + # Concatenate all files with transposed lines + transposed_file = input_file.replace('.tsv', '_transpose.tsv') + concatenate_files(intermediate_files, transposed_file) + + # Delete intermediate files + remove_files(intermediate_files) + + return transposed_file def count_repeated_matrix(matrix_file, ignore_values): - """Count number of repeated elements per line in a matrix. - - Parameters - ---------- - matrix_file : str - Path to the file that contains the matrix. - ignore_values : list - List of values to ignore when counting repeated elements. - - Returns - ------- - repeated_values : set - Set of values that appear more than once in at least a row - of the matrix. - repeated_counts : dict - Dictionary with row identifiers as keys and the total number - of times a value is repeated in that row as values. - """ - repeated_counts = {} - repeated_values = set() - with open(matrix_file, 'r') as infile: - csv_reader = csv.reader(infile, delimiter='\t') - header = csv_reader.__next__() - for i, l in enumerate(csv_reader): - genome_id = l[0] - # Count number of ocurrences for each hash/seqid - hashes = [h for h in l[1:] if h not in ignore_values] - hash_counts = Counter(hashes) - repeated_hashes = [count - for count in hash_counts.most_common() - if count[1] > 1] - for h in repeated_hashes: - repeated_values.add(h[0]) - # Store number of times a CDS is repeated -1 to adjust - # CDS counts per input - repeated_counts[genome_id] = sum((h[1]-1) for h in repeated_hashes) - - return [repeated_values, repeated_counts] + """Count number of repeated elements per line in a matrix. + + Parameters + ---------- + matrix_file : str + Path to the file that contains the matrix. + ignore_values : list + List of values to ignore when counting repeated elements. + + Returns + ------- + repeated_values : set + Set of values that appear more than once in at least a row + of the matrix. + repeated_counts : dict + Dictionary with row identifiers as keys and the total number + of times a value is repeated in that row as values. 
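# Standalone sketch of the repeat counting described in the docstring of
# count_repeated_matrix above: for one row of hashes, count which values occur
# more than once and how many surplus occurrences there are. Toy data only.
from collections import Counter

row_hashes = ['h1', 'h2', 'h1', 'h3', 'h1', 'LNF']
ignore = {'LNF'}

counts = Counter(h for h in row_hashes if h not in ignore)
repeated = [(value, total) for value, total in counts.most_common() if total > 1]
# subtract 1 per value so only the extra occurrences are counted
extra_occurrences = sum(total - 1 for value, total in repeated)

print(repeated)            # [('h1', 3)]
print(extra_occurrences)   # 2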
+ """ + repeated_counts = {} + repeated_values = set() + with open(matrix_file, 'r') as infile: + csv_reader = csv.reader(infile, delimiter='\t') + header = csv_reader.__next__() + for i, l in enumerate(csv_reader): + genome_id = l[0] + # Count number of ocurrences for each hash/seqid + hashes = [h for h in l[1:] if h not in ignore_values] + hash_counts = Counter(hashes) + repeated_hashes = [count + for count in hash_counts.most_common() + if count[1] > 1] + for h in repeated_hashes: + repeated_values.add(h[0]) + # Store number of times a CDS is repeated -1 to adjust + # CDS counts per input + repeated_counts[genome_id] = sum((h[1]-1) for h in repeated_hashes) + + return [repeated_values, repeated_counts] diff --git a/CHEWBBACA/utils/gene_prediction.py b/CHEWBBACA/utils/gene_prediction.py index f2a5eb95..aa352214 100644 --- a/CHEWBBACA/utils/gene_prediction.py +++ b/CHEWBBACA/utils/gene_prediction.py @@ -15,244 +15,244 @@ import pyrodigal try: - from utils import (constants as ct, - file_operations as fo, - fasta_operations as fao, - iterables_manipulation as im) + from utils import (constants as ct, + file_operations as fo, + fasta_operations as fao, + iterables_manipulation as im) except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - file_operations as fo, - fasta_operations as fao, - iterables_manipulation as im) + from CHEWBBACA.utils import (constants as ct, + file_operations as fo, + fasta_operations as fao, + iterables_manipulation as im) def create_gene_finder(training_data, closed, mask, meta): - """Create a Pyrodigal GeneFinder object. - - Parameters - ---------- - training_data : pyrodigal.TrainingInfo - A training info instance used to predict genes in single - mode. - closed : bool - True to prevent prediction of partial genes at edges of - sequences, False otherwise. - meta: bool - True to run Prodigal in `meta` mode (uses pre-trained - profiles). - - Returns - ------- - gene_finder : pyrodigal.GeneFinder - A GeneFinder object configured based on provided arguments. - """ - gene_finder = pyrodigal.GeneFinder(training_info=training_data, - closed=closed, - mask=mask, - meta=meta) - - return gene_finder + """Create a Pyrodigal GeneFinder object. + + Parameters + ---------- + training_data : pyrodigal.TrainingInfo + A training info instance used to predict genes in single + mode. + closed : bool + True to prevent prediction of partial genes at edges of + sequences, False otherwise. + meta: bool + True to run Prodigal in `meta` mode (uses pre-trained + profiles). + + Returns + ------- + gene_finder : pyrodigal.GeneFinder + A GeneFinder object configured based on provided arguments. + """ + gene_finder = pyrodigal.GeneFinder(training_info=training_data, + closed=closed, + mask=mask, + meta=meta) + + return gene_finder def train_gene_finder(gene_finder, sequences, translation_table): - """Train a Pyrodigal GeneFinder object based on a set of sequences. + """Train a Pyrodigal GeneFinder object based on a set of sequences. - Parameters - ---------- - gene_finder : pyrodigal.GeneFinder - A GeneFinder object. - sequences : list - Sequences used to train the GeneFinder (list - of bytes objects). - translation_table : int - Translation table to use. + Parameters + ---------- + gene_finder : pyrodigal.GeneFinder + A GeneFinder object. + sequences : list + Sequences used to train the GeneFinder (list + of bytes objects). + translation_table : int + Translation table to use. 
- Return - ------ - gene_finder : pyrodigal.GeneFinder - A GeneFinder object configured based on provided arguments. - """ - gene_finder.train(*sequences, translation_table=translation_table) + Return + ------ + gene_finder : pyrodigal.GeneFinder + A GeneFinder object configured based on provided arguments. + """ + gene_finder.train(*sequences, translation_table=translation_table) - return gene_finder + return gene_finder def read_training_file(training_file): - """Load training info for Pyrodigal from Prodigal training file. + """Load training info for Pyrodigal from Prodigal training file. - Parameters - ---------- - training_file : str - Path to Prodigal training file. + Parameters + ---------- + training_file : str + Path to Prodigal training file. - Returns - ------- - training_data : pyrodigal.TrainingInfo - The deserialized training info. - """ - with open(training_file, 'rb') as infile: - training_data = pyrodigal.TrainingInfo.load(infile) + Returns + ------- + training_data : pyrodigal.TrainingInfo + The deserialized training info. + """ + with open(training_file, 'rb') as infile: + training_data = pyrodigal.TrainingInfo.load(infile) - return training_data + return training_data def get_gene_info(contig_id, genome_id, protid, genes): - """Get genes information from a pyrodigal.Genes object. - - Parameters - ---------- - contig_id : str - The unique identifier of the sequence/contig. - genome_id : str - The unique identifier of the genome/file. - protid : int - The integer identifier to attriute to the first gene. - genes : pyrodigal.Genes - The list of genes predicted by Prodigal. - - Returns - ------- - gene_info : list - List with one sublist per gene predicted. Each sublist - includes the sequence SHA256 hash, the DNA sequence, the - genome identifier, the contig identifier, the start position - in the sequence, the end position, the integer identifier and - the strand the gene was identified in. - protid : int - The integer identifier to attribute to the first gene - in the next sequence/contig. - """ - gene_info = [] - for gene in genes: - sequence = gene.sequence() - sequence_hash = im.hash_sequence(sequence) - gene_info.append([sequence_hash, sequence, genome_id, contig_id, - str(gene.begin), str(gene.end), str(protid), - str(gene.strand)]) - protid += 1 - - return gene_info, protid + """Get genes information from a pyrodigal.Genes object. + + Parameters + ---------- + contig_id : str + The unique identifier of the sequence/contig. + genome_id : str + The unique identifier of the genome/file. + protid : int + The integer identifier to attriute to the first gene. + genes : pyrodigal.Genes + The list of genes predicted by Prodigal. + + Returns + ------- + gene_info : list + List with one sublist per gene predicted. Each sublist + includes the sequence SHA256 hash, the DNA sequence, the + genome identifier, the contig identifier, the start position + in the sequence, the end position, the integer identifier and + the strand the gene was identified in. + protid : int + The integer identifier to attribute to the first gene + in the next sequence/contig. + """ + gene_info = [] + for gene in genes: + sequence = gene.sequence() + sequence_hash = im.hash_sequence(sequence) + gene_info.append([sequence_hash, sequence, genome_id, contig_id, + str(gene.begin), str(gene.end), str(protid), + str(gene.strand)]) + protid += 1 + + return gene_info, protid def write_gene_fasta(gene_info, output_file): - """Write a FASTA file based on the results returned by `get_gene_info`. 
- - Parameters - ---------- - gene_info : list - List with the data for the genes returned by `get_gene_info`. - output_file : str - Path to the output FASTA file. - """ - fasta_sequences = [] - for gene in gene_info: - fasta_str = ct.FASTA_CDS_TEMPLATE.format(gene[2], gene[6], gene[1]) - fasta_sequences.append(fasta_str) - fo.write_lines(fasta_sequences, output_file) + """Write a FASTA file based on the results returned by `get_gene_info`. + + Parameters + ---------- + gene_info : list + List with the data for the genes returned by `get_gene_info`. + output_file : str + Path to the output FASTA file. + """ + fasta_sequences = [] + for gene in gene_info: + fasta_str = ct.FASTA_CDS_TEMPLATE.format(gene[2], gene[6], gene[1]) + fasta_sequences.append(fasta_str) + fo.write_lines(fasta_sequences, output_file) def write_coordinates_pickle(gene_info, contig_sizes, output_file): - """Write gene coordinates to a pickle file. - - Parameters - ---------- - gene_info : list - List with the data for the genes returned by `get_gene_info`. - contig_sizes : dict - Dictionary with contig/sequence identifiers as keys and - contig/sequence size as values. - output_file : str - Path to the output file. - """ - gene_coordinates = {} - for gene in gene_info: - gene_coordinates.setdefault(gene[0], []).append(gene[2:]) - fo.pickle_dumper([gene_coordinates, contig_sizes], output_file) + """Write gene coordinates to a pickle file. + + Parameters + ---------- + gene_info : list + List with the data for the genes returned by `get_gene_info`. + contig_sizes : dict + Dictionary with contig/sequence identifiers as keys and + contig/sequence size as values. + output_file : str + Path to the output file. + """ + gene_coordinates = {} + for gene in gene_info: + gene_coordinates.setdefault(gene[0], []).append(gene[2:]) + fo.pickle_dumper([gene_coordinates, contig_sizes], output_file) def predict_genome_genes(input_file, output_directory, gene_finder, - translation_table): - """Predict genes for sequences in a FASTA file. - - Parameters - ---------- - input_file : str - Path to the FASTA file. - output_directory : str - Path to the output_directory to store files with - the results. - gene_finder : pyrodigal.GeneFinder - A GeneFinder object. - translation_table : int - Translation table used to configure the GeneFinder - (None type if the GeneFinder does not need to be - configured). - - Returns - ------- - input_file : str - Path to the input FASTA file. - total_genome : int - Total number of genes predicted. - fasta_outfile : str - Path to the output FASTA file that contains the - predited gene sequences. - coordinates_outfile : str - Path to the output pickle file that contains the gene - coordinates and contig size data. 
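# Hedged sketch of the pyrodigal calls wrapped by the functions above. Meta
# mode is used here because it needs no training data, so the toy sequence
# below is enough to run the example; the wrappers instead train a GeneFinder
# on the input genome (single mode). The random sequence may yield no genes.
import pyrodigal

toy_contig = b"ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGA"

finder = pyrodigal.GeneFinder(meta=True)
genes = finder.find_genes(toy_contig)
for protid, gene in enumerate(genes, start=1):
	print(protid, gene.begin, gene.end, gene.strand)
	print(gene.sequence())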
- """ - # Get genome unique identifier - genome_basename = input_file[1] - records = fao.sequence_generator(input_file[0]) - records = {rec.id: bytes(rec.seq) for rec in records} - contig_sizes = {recid: len(sequence) - for recid, sequence in records.items()} - - # Train based on input sequences - # Only train if there is no GeneFinder object - if gene_finder is not None: - current_gene_finder = gene_finder - else: - current_gene_finder = create_gene_finder(None, True, True, False) - current_gene_finder = train_gene_finder(current_gene_finder, - records.values(), - translation_table) - - # Predict genes for all input contigs - contig_genes = {} - for recid, sequence in records.items(): - genes = current_gene_finder.find_genes(sequence) - contig_genes[recid] = genes - - # Extract data from Gene objects - protid = 1 - gene_info = [] + translation_table): + """Predict genes for sequences in a FASTA file. + + Parameters + ---------- + input_file : str + Path to the FASTA file. + output_directory : str + Path to the output_directory to store files with + the results. + gene_finder : pyrodigal.GeneFinder + A GeneFinder object. + translation_table : int + Translation table used to configure the GeneFinder + (None type if the GeneFinder does not need to be + configured). + + Returns + ------- + input_file : str + Path to the input FASTA file. + total_genome : int + Total number of genes predicted. + fasta_outfile : str + Path to the output FASTA file that contains the + predited gene sequences. + coordinates_outfile : str + Path to the output pickle file that contains the gene + coordinates and contig size data. + """ + # Get genome unique identifier + genome_basename = input_file[1] + records = fao.sequence_generator(input_file[0]) + records = {rec.id: bytes(rec.seq) for rec in records} + contig_sizes = {recid: len(sequence) + for recid, sequence in records.items()} + + # Train based on input sequences + # Only train if there is no GeneFinder object + if gene_finder is not None: + current_gene_finder = gene_finder + else: + current_gene_finder = create_gene_finder(None, True, True, False) + current_gene_finder = train_gene_finder(current_gene_finder, + records.values(), + translation_table) + + # Predict genes for all input contigs + contig_genes = {} + for recid, sequence in records.items(): + genes = current_gene_finder.find_genes(sequence) + contig_genes[recid] = genes + + # Extract data from Gene objects + protid = 1 + gene_info = [] # Store data about first and last CDS in each sequence to speedup PLOT classification - close_to_tip = {genome_basename: {}} - for recid, genes in contig_genes.items(): - data = get_gene_info(recid, genome_basename, protid, genes) - gene_info.extend(data[0]) - if len(data[0]) > 0: - first_cds = data[0][0] - close_to_tip[genome_basename].setdefault(first_cds[0], []).append((contig_sizes[first_cds[3]], int(first_cds[4]), int(first_cds[5]), first_cds[-1])) - if first_cds != data[0][-1]: - last_cds = data[0][-1] - close_to_tip[genome_basename].setdefault(last_cds[0], []).append((contig_sizes[last_cds[3]], int(last_cds[4]), int(last_cds[5]), last_cds[-1])) + close_to_tip = {genome_basename: {}} + for recid, genes in contig_genes.items(): + data = get_gene_info(recid, genome_basename, protid, genes) + gene_info.extend(data[0]) + if len(data[0]) > 0: + first_cds = data[0][0] + close_to_tip[genome_basename].setdefault(first_cds[0], []).append((contig_sizes[first_cds[3]], int(first_cds[4]), int(first_cds[5]), first_cds[-1])) + if first_cds != data[0][-1]: + last_cds = 
data[0][-1] + close_to_tip[genome_basename].setdefault(last_cds[0], []).append((contig_sizes[last_cds[3]], int(last_cds[4]), int(last_cds[5]), last_cds[-1])) # Reset protid based on the number of CDSs predicted for the sequence - protid = data[1] - - total_genome = len(gene_info) - fasta_outfile = None - coordinates_outfile = None - if total_genome > 0: - # Create FASTA file with DNA sequences - fasta_outfile = fo.join_paths(output_directory, - [f'{genome_basename}.fasta']) - write_gene_fasta(gene_info, fasta_outfile) - - # Save gene coordinates and contig sizes to pickle - coordinates_outfile = fo.join_paths(output_directory, - [f'{genome_basename}_coordinates']) - write_coordinates_pickle(gene_info, contig_sizes, coordinates_outfile) - - return [input_file, total_genome, fasta_outfile, coordinates_outfile, close_to_tip] + protid = data[1] + + total_genome = len(gene_info) + fasta_outfile = None + coordinates_outfile = None + if total_genome > 0: + # Create FASTA file with DNA sequences + fasta_outfile = fo.join_paths(output_directory, + [f'{genome_basename}.fasta']) + write_gene_fasta(gene_info, fasta_outfile) + + # Save gene coordinates and contig sizes to pickle + coordinates_outfile = fo.join_paths(output_directory, + [f'{genome_basename}_coordinates']) + write_coordinates_pickle(gene_info, contig_sizes, coordinates_outfile) + + return [input_file, total_genome, fasta_outfile, coordinates_outfile, close_to_tip] diff --git a/CHEWBBACA/utils/iterables_manipulation.py b/CHEWBBACA/utils/iterables_manipulation.py index 7cfc94d6..0b45a21b 100644 --- a/CHEWBBACA/utils/iterables_manipulation.py +++ b/CHEWBBACA/utils/iterables_manipulation.py @@ -16,1013 +16,1013 @@ import itertools try: - from utils import (constants as ct, - fasta_operations as fao) + from utils import (constants as ct, + fasta_operations as fao) except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - fasta_operations as fao) + from CHEWBBACA.utils import (constants as ct, + fasta_operations as fao) def match_regex(text, pattern): - """Extract substrings that match a regex pattern. + """Extract substrings that match a regex pattern. - Parameters - ---------- - text : str - Input text to search the pattern in. - pattern : str - Regular expression pattern. + Parameters + ---------- + text : str + Input text to search the pattern in. + pattern : str + Regular expression pattern. - Returns - ------- - match : str - Matched substring or NoneType if no matches - were found. - """ - match = re.search(pattern, text) - if match is not None: - match_span = match.span() - match = text[match_span[0]:match_span[1]] + Returns + ------- + match : str + Matched substring or NoneType if no matches + were found. + """ + match = re.search(pattern, text) + if match is not None: + match_span = match.span() + match = text[match_span[0]:match_span[1]] - return match + return match def join_list(lst, delimiter): - """Join all elements in a list into a single string. + """Join all elements in a list into a single string. - Parameters - ---------- - lst : list - List with elements to be joined. - delimiter : str - Character used to join list elements. + Parameters + ---------- + lst : list + List with elements to be joined. + delimiter : str + Character used to join list elements. - Returns - ------- - joined_list : str - A single string with all elements in the input - list joined by the character chosen as link. 
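# Toy run of the pattern used by match_regex above: re.search returns a match
# object whose span is used to slice the matched substring out of the text.
# The accession-style pattern and text are made up for this example.
import re

text = 'GCA_000007265.1_ASM726v1_genomic.fna'
pattern = r'GCA_\d+\.\d+'
match = re.search(pattern, text)
if match is not None:
	start, end = match.span()
	print(text[start:end])  # GCA_000007265.1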
- """ - joined_list = delimiter.join(lst) + Returns + ------- + joined_list : str + A single string with all elements in the input + list joined by the character chosen as link. + """ + joined_list = delimiter.join(lst) - return joined_list + return joined_list def flatten_list(list_to_flatten): - """Flatten one level of a nested list. + """Flatten one level of a nested list. - Parameters - ---------- - list_to_flatten : list - Nested list to flatten. + Parameters + ---------- + list_to_flatten : list + Nested list to flatten. - Returns - ------- - flattened_list : str - Input list flattened by one level. - """ - flattened_list = list(itertools.chain(*list_to_flatten)) + Returns + ------- + flattened_list : str + Input list flattened by one level. + """ + flattened_list = list(itertools.chain(*list_to_flatten)) - return flattened_list + return flattened_list def isListEmpty(input_list): - """Check if a nested list is empty.""" - if isinstance(input_list, list): - return all(map(isListEmpty, input_list)) if isinstance(input_list, list) else False + """Check if a nested list is empty.""" + if isinstance(input_list, list): + return all(map(isListEmpty, input_list)) if isinstance(input_list, list) else False def divide_list_into_n_chunks(list_to_divide, n): - """Divides a list into a defined number of sublists. + """Divides a list into a defined number of sublists. - Parameters - ---------- - list_to_divide : list - List to divide into sublists. - n : int - Number of sublists to create. + Parameters + ---------- + list_to_divide : list + List to divide into sublists. + n : int + Number of sublists to create. - Returns - ------- - sublists : list - List with the sublists created by dividing - the input list. - """ - sublists = [] - d, r = divmod(len(list_to_divide), n) - for i in range(n): - si = (d+1)*(i if i < r else r) + d*(0 if i < r else i - r) - sublists.append(list_to_divide[si:si+(d+1 if i < r else d)]) + Returns + ------- + sublists : list + List with the sublists created by dividing + the input list. + """ + sublists = [] + d, r = divmod(len(list_to_divide), n) + for i in range(n): + si = (d+1)*(i if i < r else r) + d*(0 if i < r else i - r) + sublists.append(list_to_divide[si:si+(d+1 if i < r else d)]) - # exclude lists that are empty due to small number of elements - sublists = [line for line in sublists if len(line) > 0] + # exclude lists that are empty due to small number of elements + sublists = [line for line in sublists if len(line) > 0] - return sublists + return sublists def merge_dictionaries(dictionaries_list, overwrite=False): - """Merge several dictionaries. - - Parameters - ---------- - dictionaries_list : list - A list with the dictionaries to merge. - overwrite : bool - True to overwrite values when there is a key - collision, False otherwise. - - Returns - ------- - merged_dicts : dict - A dictionary resulting from merging - all input dictionaries. - """ - merged_dicts = dictionaries_list[0] - if overwrite is True: - for d in dictionaries_list[1:]: - merged_dicts = {**merged_dicts, **d} - elif overwrite is False: - for d in dictionaries_list[1:]: - for k, v in d.items(): - if k in merged_dicts: - merged_dicts[k] = list(set.union(set(v), set(merged_dicts[k]))) - else: - merged_dicts[k] = v - - return merged_dicts + """Merge several dictionaries. + + Parameters + ---------- + dictionaries_list : list + A list with the dictionaries to merge. + overwrite : bool + True to overwrite values when there is a key + collision, False otherwise. 
+ + Returns + ------- + merged_dicts : dict + A dictionary resulting from merging + all input dictionaries. + """ + merged_dicts = dictionaries_list[0] + if overwrite is True: + for d in dictionaries_list[1:]: + merged_dicts = {**merged_dicts, **d} + elif overwrite is False: + for d in dictionaries_list[1:]: + for k, v in d.items(): + if k in merged_dicts: + merged_dicts[k] = list(set.union(set(v), set(merged_dicts[k]))) + else: + merged_dicts[k] = v + + return merged_dicts def invert_dictionary(dictionary): - """Invert a dictionary (key:value to value:key). + """Invert a dictionary (key:value to value:key). - Parameters - ---------- - dictionary : dict - Dictionary to be inverted. + Parameters + ---------- + dictionary : dict + Dictionary to be inverted. - Returns - ------- - inverted_dict : dict - Inverted dictionary. - """ - inverted_dict = {value: key for key, value in dictionary.items()} + Returns + ------- + inverted_dict : dict + Inverted dictionary. + """ + inverted_dict = {value: key for key, value in dictionary.items()} - return inverted_dict + return inverted_dict def split_iterable(iterable, size): - """Split a dictionary. - - Parameters - ---------- - iterable : dict - Dictionary to split. - size : int - Size of dictionaries created from the input - dictionary. - - Returns - ------- - chunks : list - List with dictionaries of defined size - resulting from splitting the input dictionary. - """ - chunks = [] - # create iterable from input object - # need to convert or slicing will always return the first elements - it = iter(iterable) - for i in range(0, len(iterable), size): - chunks.append({k: iterable[k] for k in itertools.islice(it, size)}) - - return chunks + """Split a dictionary. + + Parameters + ---------- + iterable : dict + Dictionary to split. + size : int + Size of dictionaries created from the input + dictionary. + + Returns + ------- + chunks : list + List with dictionaries of defined size + resulting from splitting the input dictionary. + """ + chunks = [] + # create iterable from input object + # need to convert or slicing will always return the first elements + it = iter(iterable) + for i in range(0, len(iterable), size): + chunks.append({k: iterable[k] for k in itertools.islice(it, size)}) + + return chunks def select_keys(dictionary, size): - """Select dictionary keys based on the number of values. + """Select dictionary keys based on the number of values. - Parameters - ---------- - dictionary : dict - Dictionary with key:value pairs to select. - size : int - Number of values used to select dictionary entries. + Parameters + ---------- + dictionary : dict + Dictionary with key:value pairs to select. + size : int + Number of values used to select dictionary entries. - Returns - ------- - selected : list - List with the dictionary keys that have a number - of matching values equal or greater/lesser than - `size`. - """ - selected = [k for k, v in dictionary.items() if len(v) == size] + Returns + ------- + selected : list + List with the dictionary keys that have a number + of matching values equal or greater/lesser than + `size`. + """ + selected = [k for k, v in dictionary.items() if len(v) == size] - return selected + return selected def prune_dictionary(dictionary, keys): - """Remove a set of keys from a dictionary. + """Remove a set of keys from a dictionary. - Parameters - ---------- - dictionary : dict - Input dictionary. - keys : list - List of keys to remove. + Parameters + ---------- + dictionary : dict + Input dictionary. 
+ keys : list + List of keys to remove. - Returns - ------- - pruned_dict : dict - Dictionary without the keys in the list - of keys to remove. - """ - pruned_dict = {k: v for k, v in dictionary.items() if k not in keys} + Returns + ------- + pruned_dict : dict + Dictionary without the keys in the list + of keys to remove. + """ + pruned_dict = {k: v for k, v in dictionary.items() if k not in keys} - return pruned_dict + return pruned_dict def extract_subsequence(sequence, start, stop): - """Extract a substring from a string. + """Extract a substring from a string. - Parameters - ---------- - sequence : str - Input string. - start : int - Substring start position in input string. - stop : int - Substring stop position in input string. + Parameters + ---------- + sequence : str + Input string. + start : int + Substring start position in input string. + stop : int + Substring stop position in input string. - Returns - ------- - subsequence : str - Substring extracted from input string. - """ - subsequence = sequence[start:stop] + Returns + ------- + subsequence : str + Substring extracted from input string. + """ + subsequence = sequence[start:stop] - return subsequence + return subsequence def extract_single_cds(sequence, start, stop, strand): - """Extract a coding sequence from a larger sequence. + """Extract a coding sequence from a larger sequence. - Parameters - ---------- - sequence : str - Contig that contains the coding sequence. - start : int - Coding sequence start position in contig. - stop : int - Coding sequence stop position in contig. - strand : int - Coding sequence orientation. + Parameters + ---------- + sequence : str + Contig that contains the coding sequence. + start : int + Coding sequence start position in contig. + stop : int + Coding sequence stop position in contig. + strand : int + Coding sequence orientation. - Returns - ------- - coding_sequence : str - Coding sequence extracted from input contig. - """ - coding_sequence = extract_subsequence(sequence, start, stop) + Returns + ------- + coding_sequence : str + Coding sequence extracted from input contig. + """ + coding_sequence = extract_subsequence(sequence, start, stop) - if strand == 0: - coding_sequence = reverse_complement(coding_sequence, ct.DNA_BASES) + if strand == 0: + coding_sequence = reverse_complement(coding_sequence, ct.DNA_BASES) - return coding_sequence + return coding_sequence def escape_special_characters(input_string): - """Escape strings to use in regex. + """Escape strings to use in regex. - Parameters - ---------- - input_string : str - String containing characters to escape. + Parameters + ---------- + input_string : str + String containing characters to escape. - Returns - ------- - escaped_string : str - Escaped string. - """ - escaped_string = re.escape(input_string) + Returns + ------- + escaped_string : str + Escaped string. + """ + escaped_string = re.escape(input_string) - return escaped_string + return escaped_string def replace_multiple_characters(input_string, replacements): - """Replace multiple characters in a string. + """Replace multiple characters in a string. - Parameters - ---------- - input_string : str - String with characters to be replaced. - replacements : list - List that contains sublists with two elements, - the element to be replaced and the element it should - be replaced by. + Parameters + ---------- + input_string : str + String with characters to be replaced. 
+ replacements : list + List that contains sublists with two elements, + the element to be replaced and the element it should + be replaced by. - Returns - ------- - replaced : str - Input string with replaced characters. - """ - for r in replacements: - if r[0] in input_string: - input_string = input_string.replace(*r) + Returns + ------- + replaced : str + Input string with replaced characters. + """ + for r in replacements: + if r[0] in input_string: + input_string = input_string.replace(*r) - return input_string + return input_string def reverse_complement(input_string, alphabet): - """Determine the reverse complement of a string. + """Determine the reverse complement of a string. - Parameters - ---------- - input_string : str - Input string. - alphabet : str - String that contains the alphabet used in - the input string (the reverse must be equal - to the desired complement). + Parameters + ---------- + input_string : str + Input string. + alphabet : str + String that contains the alphabet used in + the input string (the reverse must be equal + to the desired complement). - Returns - ------- - reverse_complement_string : str - The reverse complement of the input string. - """ - translation_table = str.maketrans(alphabet, alphabet[::-1]) + Returns + ------- + reverse_complement_string : str + The reverse complement of the input string. + """ + translation_table = str.maketrans(alphabet, alphabet[::-1]) - upper_string = input_string.upper() - complement_string = upper_string.translate(translation_table) - reverse_complement_string = reverse_str(complement_string) + upper_string = input_string.upper() + complement_string = upper_string.translate(translation_table) + reverse_complement_string = reverse_str(complement_string) - return reverse_complement_string + return reverse_complement_string def reverse_str(input_string): - """Reverse character order in a string. + """Reverse character order in a string. - Parameters - ---------- - input_string : str - String to be reversed. + Parameters + ---------- + input_string : str + String to be reversed. - Returns - ------- - revstr : str - Reverse of input string. - """ - revstr = input_string[::-1] + Returns + ------- + revstr : str + Reverse of input string. + """ + revstr = input_string[::-1] - return revstr + return revstr def check_str_alphabet(input_string, alphabet): - """Determine if a string only contains characters from specified alphabet. + """Determine if a string only contains characters from specified alphabet. - Parameters - ---------- - input_string : str - Input string. - alphabet : str - String that includes all characters in the - alphabet. + Parameters + ---------- + input_string : str + Input string. + alphabet : str + String that includes all characters in the + alphabet. - Returns - ------- - True if sequence only has characters from specified - alphabet, False otherwise. - """ - alphabet_chars = set(alphabet) - string_chars = set(input_string) + Returns + ------- + True if sequence only has characters from specified + alphabet, False otherwise. + """ + alphabet_chars = set(alphabet) + string_chars = set(input_string) - diff = string_chars - alphabet_chars + diff = string_chars - alphabet_chars - return len(diff) == 0 + return len(diff) == 0 def check_str_multiple(input_string, number): - """Determine if length of input string is a multiple of a specified number. + """Determine if length of input string is a multiple of a specified number. - Parameters - ---------- - input_string : str - Input string. 
- number : int - Length value should be a multiple of this number. + Parameters + ---------- + input_string : str + Input string. + number : int + Length value should be a multiple of this number. - Returns - ------- - True if the length of the sequence is a multiple of the - specified number, False otherwise. - """ - return (len(input_string) % number) == 0 + Returns + ------- + True if the length of the sequence is a multiple of the + specified number, False otherwise. + """ + return (len(input_string) % number) == 0 def hash_sequence(input_string, hash_type='sha256'): - """Compute hash of an input string. + """Compute hash of an input string. - Parameters - ---------- - input_string : str - Input string to hash. - hash_type : str - Hash type/function that will be used to compute the - hash (any of the hash functions available in the - hashlib module). + Parameters + ---------- + input_string : str + Input string to hash. + hash_type : str + Hash type/function that will be used to compute the + hash (any of the hash functions available in the + hashlib module). - Returns - ------- - hashed_string : str - String representation of the HASH object - in hexadecimal digits. - """ - # get hash function object from hashlib - hashing_function = getattr(hashlib, hash_type) + Returns + ------- + hashed_string : str + String representation of the HASH object + in hexadecimal digits. + """ + # get hash function object from hashlib + hashing_function = getattr(hashlib, hash_type) - # default encoding is UTF-8 - hashed_string = hashing_function(input_string.encode()).hexdigest() + # default encoding is UTF-8 + hashed_string = hashing_function(input_string.encode()).hexdigest() - return hashed_string + return hashed_string def string_kmerizer(input_string, k_value, offset=1, position=False): - """Decompose a string into k-mers. - - Parameters - ---------- - input_string : str - String to divide into k-mers. - k_value : int - Value for the size of k-mers. - offset : int - Value to indicate offset of consecutive k-mers. - position : bool - If the start position of the k-mers in the string - should be stored. - - Returns - ------- - kmers : list - List that contains the k-mers determined for the - input string. The list will contain strings if - it is not specified that positions should be - stored and tuples of k-mer and start position - if the position is stored. - """ - if position is False: - kmers = [input_string[i:i+k_value] - for i in range(0, len(input_string)-k_value+1, offset)] - elif position is True: - kmers = [(input_string[i:i+k_value], i) - for i in range(0, len(input_string)-k_value+1, offset)] - - return kmers + """Decompose a string into k-mers. + + Parameters + ---------- + input_string : str + String to divide into k-mers. + k_value : int + Value for the size of k-mers. + offset : int + Value to indicate offset of consecutive k-mers. + position : bool + If the start position of the k-mers in the string + should be stored. + + Returns + ------- + kmers : list + List that contains the k-mers determined for the + input string. The list will contain strings if + it is not specified that positions should be + stored and tuples of k-mer and start position + if the position is stored. 
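# Illustrative usage sketch (editor's note, not part of the whitespace patch):
# how string_kmerizer is expected to decompose a short sequence with the
# default offset of 1.
example_kmers = string_kmerizer('ACGTACGT', 3)
# ['ACG', 'CGT', 'GTA', 'TAC', 'ACG', 'CGT']
example_positional = string_kmerizer('ACGTACGT', 3, position=True)
# [('ACG', 0), ('CGT', 1), ('GTA', 2), ('TAC', 3), ('ACG', 4), ('CGT', 5)]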
+ """ + if position is False: + kmers = [input_string[i:i+k_value] + for i in range(0, len(input_string)-k_value+1, offset)] + elif position is True: + kmers = [(input_string[i:i+k_value], i) + for i in range(0, len(input_string)-k_value+1, offset)] + + return kmers def determine_minimizers(input_string, adjacent_kmers, k_value, offset=1, - position=False): - """Determine minimizers for a input string. - - Determines minimizers for a string based on lexicographical - order. Skips windows that cannot have a minimizer based on - the minimizer computed in the previous iteration. - - Parameters - ---------- - input_string : str - String representing the sequence. - adjacent_kmers : int - Window size value. Number of adjacent k-mers per group. - k_value : int - Value of k for the k-mer size. - offset : int - Value to indicate offset of consecutive k-mers. - position : bool - If the start position of the k-mers in the sequence - should be stored. - - Returns - ------- - minimizers : list - A list with the set of minimizers determined - for the input string. - """ - # break string into k-mers - kmers = string_kmerizer(input_string, k_value, offset, position) - - i = 0 - previous = None - sell = False - minimizers = [] - # determine total number of windows - last_window = (len(kmers)-adjacent_kmers) - while i <= last_window: - # get kmers in current window - window = kmers[i:i+adjacent_kmers] - # pick smallest kmer as minimizer - minimizer = [min(window)] - # get position in window of smallest minimizer - minimizer_idx = window.index(minimizer[0]) - # sliding window that does not included last minimizer - if previous is None: - # simply store smallest minimizer - minimizers.extend(minimizer) - # sliding window includes last minimizer because we - # skipped some sliding windows - else: - # check if minimizer is the same as the one picked - # in the last window - # Do not store minimizer if it is the same - if minimizer[0] != previous: - # get kmers smaller than last minimizer - skipped = window[1:minimizer_idx] - # determine if any of the smaller kmers is - # the minimizer of a skipped window - minimal = previous - for m in skipped: - if m < minimal: - minimizer.append(m) - minimal = m - minimizers.extend(minimizer) - - # slide by 1 if minimizer has index 0 in window - if minimizer_idx == 0: - i += 1 - previous = None - # skip sliding windows based on minimizer position - else: - i += minimizer_idx - # if adding minimizer index surpasses last window value we - # might miss one last minimizer because it will fail the condition - # find a better way to control this condition! - if i > last_window and sell is False: - i = last_window - sell = True - previous = minimizer[0] - - return minimizers + position=False): + """Determine minimizers for a input string. + + Determines minimizers for a string based on lexicographical + order. Skips windows that cannot have a minimizer based on + the minimizer computed in the previous iteration. + + Parameters + ---------- + input_string : str + String representing the sequence. + adjacent_kmers : int + Window size value. Number of adjacent k-mers per group. + k_value : int + Value of k for the k-mer size. + offset : int + Value to indicate offset of consecutive k-mers. + position : bool + If the start position of the k-mers in the sequence + should be stored. + + Returns + ------- + minimizers : list + A list with the set of minimizers determined + for the input string. 
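# Worked sketch (editor's note, not part of the whitespace patch): with k=3
# and windows of 2 adjacent k-mers, the lexicographically smallest k-mer of
# each window is kept, skipping windows already covered by the previous minimizer.
example_minimizers = determine_minimizers('GGCATT', 2, 3)
# ['GCA', 'CAT', 'ATT']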
+ """ + # break string into k-mers + kmers = string_kmerizer(input_string, k_value, offset, position) + + i = 0 + previous = None + sell = False + minimizers = [] + # determine total number of windows + last_window = (len(kmers)-adjacent_kmers) + while i <= last_window: + # get kmers in current window + window = kmers[i:i+adjacent_kmers] + # pick smallest kmer as minimizer + minimizer = [min(window)] + # get position in window of smallest minimizer + minimizer_idx = window.index(minimizer[0]) + # sliding window that does not included last minimizer + if previous is None: + # simply store smallest minimizer + minimizers.extend(minimizer) + # sliding window includes last minimizer because we + # skipped some sliding windows + else: + # check if minimizer is the same as the one picked + # in the last window + # Do not store minimizer if it is the same + if minimizer[0] != previous: + # get kmers smaller than last minimizer + skipped = window[1:minimizer_idx] + # determine if any of the smaller kmers is + # the minimizer of a skipped window + minimal = previous + for m in skipped: + if m < minimal: + minimizer.append(m) + minimal = m + minimizers.extend(minimizer) + + # slide by 1 if minimizer has index 0 in window + if minimizer_idx == 0: + i += 1 + previous = None + # skip sliding windows based on minimizer position + else: + i += minimizer_idx + # if adding minimizer index surpasses last window value we + # might miss one last minimizer because it will fail the condition + # find a better way to control this condition! + if i > last_window and sell is False: + i = last_window + sell = True + previous = minimizer[0] + + return minimizers def decode_str(str_list, encoding): - """Decode bytes objects. + """Decode bytes objects. - Decodes bytes objects in the input list and strips decoded - strings from whitespaces and newlines. + Decodes bytes objects in the input list and strips decoded + strings from whitespaces and newlines. - Parameters - ---------- - str_list - List with string or bytes objects to decode - and strip of whitespaces and newlines. - encoding : str - Encoding codec to use. + Parameters + ---------- + str_list + List with string or bytes objects to decode + and strip of whitespaces and newlines. + encoding : str + Encoding codec to use. - Returns - ------- - decoded : list - List with strings without whitespaces or - newlines. - """ - decoded = [m.decode(encoding).strip() - if type(m) == bytes - else m.strip() - for m in str_list] + Returns + ------- + decoded : list + List with strings without whitespaces or + newlines. + """ + decoded = [m.decode(encoding).strip() + if type(m) == bytes + else m.strip() + for m in str_list] - return decoded + return decoded # sorted key parameter is None by default def sort_iterable(data, sort_key=None, reverse=False): - """Sort an iterable. + """Sort an iterable. - Parameters - ---------- - data : iter - Iterable to sort. - sort_key - If provided, data will be sorted based - on this function. - reverse : bool - If sorting order should be inverted. + Parameters + ---------- + data : iter + Iterable to sort. + sort_key + If provided, data will be sorted based + on this function. + reverse : bool + If sorting order should be inverted. - Returns - ------- - sorted_data - Iterable with sorted elements. - """ - # sorted key parameter is None by default - sorted_data = sorted(data, key=sort_key, reverse=reverse) + Returns + ------- + sorted_data + Iterable with sorted elements. 
+ """ + # sorted key parameter is None by default + sorted_data = sorted(data, key=sort_key, reverse=reverse) - return sorted_data + return sorted_data def filter_list(lst, remove): - """Remove elements from a list. + """Remove elements from a list. - Parameters - ---------- - lst : list - Input list. - remove : list - List of elements to remove from input list. + Parameters + ---------- + lst : list + Input list. + remove : list + List of elements to remove from input list. - Returns - ------- - filtered_list : list - List without the removed elements. - """ - filtered_list = list(set(lst) - set(remove)) + Returns + ------- + filtered_list : list + List without the removed elements. + """ + filtered_list = list(set(lst) - set(remove)) - return filtered_list + return filtered_list def find_missing(integer_list): - """Find the set of integers missing from a list to make it consecutive. + """Find the set of integers missing from a list to make it consecutive. - Parameters - ---------- - integer_list : list - List containing integers. + Parameters + ---------- + integer_list : list + List containing integers. - Returns - ------- - missing_integers : list - Sorted list of missing integers. - """ - first = min(integer_list) - last = max(integer_list) - missing_integers = sorted(set(range(first, last+1)).difference(integer_list)) + Returns + ------- + missing_integers : list + Sorted list of missing integers. + """ + first = min(integer_list) + last = max(integer_list) + missing_integers = sorted(set(range(first, last+1)).difference(integer_list)) - return missing_integers + return missing_integers # Add new parameter that accepts method to sample kmers def kmer_index(fasta_file, word_size, fasta=True): - """Create a k-mer index from a set of sequences in a FASTA file. - - Parameters - ---------- - fasta_file : str - Path to a Fasta file. - word_size : int - Value k for the k-mer size. - fasta : bool - True if input is a FASTA file, False if it is a dictionary - with sequence identifiers as keys and sequences as values. - - Returns - ------- - kmers_mapping : dict - Dictionary with k-mers as keys and the - list of sequence identifiers of the - sequences that contain the kmers as - values. - seqs_kmers : dict - Dictionary with sequence identifiers - as keys and the set of distinct k-mers - for each sequence as values. - """ - if fasta is True: - sequences = fao.sequence_generator(fasta_file) - else: - sequences = fasta_file - - kmers_mapping = {} - for record in sequences: - try: - seqid = record - sequence = sequences[record] - except Exception as e: - seqid = record.id - sequence = str(record.seq) - - minimizers = determine_minimizers(sequence, word_size, - word_size, position=False) - kmers = set(minimizers) - - # Create dictionary with kmers as keys and list - # of sequences with given kmers as values - for kmer in kmers: - kmers_mapping.setdefault(kmer, []).append(seqid) - - return kmers_mapping + """Create a k-mer index from a set of sequences in a FASTA file. + + Parameters + ---------- + fasta_file : str + Path to a Fasta file. + word_size : int + Value k for the k-mer size. + fasta : bool + True if input is a FASTA file, False if it is a dictionary + with sequence identifiers as keys and sequences as values. + + Returns + ------- + kmers_mapping : dict + Dictionary with k-mers as keys and the + list of sequence identifiers of the + sequences that contain the kmers as + values. 
+ seqs_kmers : dict + Dictionary with sequence identifiers + as keys and the set of distinct k-mers + for each sequence as values. + """ + if fasta is True: + sequences = fao.sequence_generator(fasta_file) + else: + sequences = fasta_file + + kmers_mapping = {} + for record in sequences: + try: + seqid = record + sequence = sequences[record] + except Exception as e: + seqid = record.id + sequence = str(record.seq) + + minimizers = determine_minimizers(sequence, word_size, + word_size, position=False) + kmers = set(minimizers) + + # Create dictionary with kmers as keys and list + # of sequences with given kmers as values + for kmer in kmers: + kmers_mapping.setdefault(kmer, []).append(seqid) + + return kmers_mapping def contained_terms(iterable, terms): - """Find elements in an iterable that contain any term from a set of terms. - - Parameters - ---------- - iterable - An iterable such as a list with strings - or a dictionary. - terms : list - Terms to search for. - - Returns - ------- - matches : list - List with the elements of the iterable - that contain any of the searched terms. - """ - matches = [] - for e in iterable: - if any([term in e for term in terms]) is True: - matches.append(e) - - return matches + """Find elements in an iterable that contain any term from a set of terms. + + Parameters + ---------- + iterable + An iterable such as a list with strings + or a dictionary. + terms : list + Terms to search for. + + Returns + ------- + matches : list + List with the elements of the iterable + that contain any of the searched terms. + """ + matches = [] + for e in iterable: + if any([term in e for term in terms]) is True: + matches.append(e) + + return matches def integer_mapping(values, inverse=False): - """Create mapping between list elements and consecutive integers. - - Parameters - ---------- - values : iter - Input iterable. - inverse : bool - Invert mapping order, integers:values instead - of values:integers. - - Returns - ------- - mapping : dict - Dictionary with the mapping between the - elements in the input iterable and the - sequential integers. - """ - mapping = {v: i+1 for i, v in enumerate(values)} - if inverse is True: - mapping = invert_dictionary(mapping) - - return mapping + """Create mapping between list elements and consecutive integers. + + Parameters + ---------- + values : iter + Input iterable. + inverse : bool + Invert mapping order, integers:values instead + of values:integers. + + Returns + ------- + mapping : dict + Dictionary with the mapping between the + elements in the input iterable and the + sequential integers. + """ + mapping = {v: i+1 for i, v in enumerate(values)} + if inverse is True: + mapping = invert_dictionary(mapping) + + return mapping def multiprocessing_inputs(inputs, common_args, function): - """Create lists of inputs to process in parallel. - - Creates lists of inputs for the `map_async_parallelizer` - function from the `multiprocessing_operations` module. - - Parameters - ---------- - inputs : list - List with the distinct inputs. - common_args : list - List with the common arguments/inputs. - function : func - Function that will be parallelized by the - `map_async_parallelizer` function. - - Returns - ------- - input_groups : list - List with one sublist per distinct input. Each - sublist also contains the common arguments and - the function that will receive the arguments. 
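# Hedged sketch (editor's note, not part of the whitespace patch): packing
# per-input argument lists for map_async_parallelizer. The genome file names
# and the worker function below are hypothetical placeholders.
worker = lambda fasta, outdir, table: (fasta, outdir, table)
distinct_inputs = [['genome1.fasta'], ['genome2.fasta']]
common_args = ['output_dir', 11]
input_groups = multiprocessing_inputs(distinct_inputs, common_args, worker)
# [['genome1.fasta', 'output_dir', 11, worker],
#  ['genome2.fasta', 'output_dir', 11, worker]]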
- """ - input_groups = [] - # Create a list for each distinct input - for g in inputs: - new_input = g + common_args + [function] - input_groups.append(new_input) - - return input_groups + """Create lists of inputs to process in parallel. + + Creates lists of inputs for the `map_async_parallelizer` + function from the `multiprocessing_operations` module. + + Parameters + ---------- + inputs : list + List with the distinct inputs. + common_args : list + List with the common arguments/inputs. + function : func + Function that will be parallelized by the + `map_async_parallelizer` function. + + Returns + ------- + input_groups : list + List with one sublist per distinct input. Each + sublist also contains the common arguments and + the function that will receive the arguments. + """ + input_groups = [] + # Create a list for each distinct input + for g in inputs: + new_input = g + common_args + [function] + input_groups.append(new_input) + + return input_groups def aggregate_iterables(iterables): - """Aggregate elements with same index from several iterables. + """Aggregate elements with same index from several iterables. - Parameters - ---------- - iterables : list - List of iterables to aggreagate. + Parameters + ---------- + iterables : list + List of iterables to aggreagate. - Returns - ------- - aggregated_inputs : list - List with a sublist per group of elements - with same index that were aggregated. - """ - aggregated_inputs = [list(i) for i in zip(*iterables)] + Returns + ------- + aggregated_inputs : list + List with a sublist per group of elements + with same index that were aggregated. + """ + aggregated_inputs = [list(i) for i in zip(*iterables)] - return aggregated_inputs + return aggregated_inputs def mapping_function(values, function, args): - """Create mapping between input elements and function result. - - Parameters - ---------- - values : iter - Input iterable. - function : func - Funtion to apply to the elements in the - input iterable. - args : list - List of arguments to pass to the function. - - Returns - ------- - mapping : dict - Dictionary with the mapping between the - elements in the input iterable and the - result of applying the function to each - value. - """ - mapping = {v: function(v, *args) for v in values} - - return mapping + """Create mapping between input elements and function result. + + Parameters + ---------- + values : iter + Input iterable. + function : func + Funtion to apply to the elements in the + input iterable. + args : list + List of arguments to pass to the function. + + Returns + ------- + mapping : dict + Dictionary with the mapping between the + elements in the input iterable and the + result of applying the function to each + value. + """ + mapping = {v: function(v, *args) for v in values} + + return mapping def polyline_encoding(number_list, precision=0): - """Use the polyline algorithm to encode/compress a list of numbers. - - Parameters - ---------- - number_list : list - List with numbers to encode. - precision : int - Number of decimal places preserved by the encoding. - - Returns - ------- - compressed_values : str - A single string composed of ASCII characters - that represents the compressed list of numbers - to the desired level of precision. - - Notes - ----- - This implementation is based on the `numcompress` package - (https://github.com/amit1rrr/numcompress). 
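# Illustrative sketch (editor's note, not part of the whitespace patch):
# encoding a short list of integers with zero decimal places of precision
# should yield a compact ASCII string (something like '?EG?I' for the input below).
encoded = polyline_encoding([3, 7, 7, 12], precision=0)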
- """ - compressed_values = '' - # store the precision value - # to ensure proper character display, encoded values are summed with 63 - # (the ASCII character '?') - compressed_values += chr(precision+63) - previous_num = 0 - for number in number_list: - # encode the difference between numbers - difference = number - previous_num - # multiply and round to get integer that preserves decimal places - difference = int(round(difference*(10**precision))) - # left bitwise shift 1 position (0b11111 --> 0b111110) - # and invert the encoding if the original number was negative - # the bit added by the shift will be inverted and that allows - # to know that the number is negative - difference = ~(difference << 1) if difference < 0 else difference << 1 - - # process 5-bit chunks from right to left until we get a value that - # is smaller than 0b100000/0x20/32 - while difference >= 0x20: - # use 0x1f (0b11111) as bitmask to get the smallest 5-bit chunk - # and 0x20 is used as continuation bit to help decode - compressed_values += (chr((0x20 | (difference & 0x1f)) + 63)) - # right bitwise shift to exclude the 5-bit chunk that was encoded - difference >>= 5 - - # encode the last 5-bit chunk - compressed_values += (chr(difference + 63)) - # store number to subtract from next - previous_num = number - - return compressed_values + """Use the polyline algorithm to encode/compress a list of numbers. + + Parameters + ---------- + number_list : list + List with numbers to encode. + precision : int + Number of decimal places preserved by the encoding. + + Returns + ------- + compressed_values : str + A single string composed of ASCII characters + that represents the compressed list of numbers + to the desired level of precision. + + Notes + ----- + This implementation is based on the `numcompress` package + (https://github.com/amit1rrr/numcompress). + """ + compressed_values = '' + # store the precision value + # to ensure proper character display, encoded values are summed with 63 + # (the ASCII character '?') + compressed_values += chr(precision+63) + previous_num = 0 + for number in number_list: + # encode the difference between numbers + difference = number - previous_num + # multiply and round to get integer that preserves decimal places + difference = int(round(difference*(10**precision))) + # left bitwise shift 1 position (0b11111 --> 0b111110) + # and invert the encoding if the original number was negative + # the bit added by the shift will be inverted and that allows + # to know that the number is negative + difference = ~(difference << 1) if difference < 0 else difference << 1 + + # process 5-bit chunks from right to left until we get a value that + # is smaller than 0b100000/0x20/32 + while difference >= 0x20: + # use 0x1f (0b11111) as bitmask to get the smallest 5-bit chunk + # and 0x20 is used as continuation bit to help decode + compressed_values += (chr((0x20 | (difference & 0x1f)) + 63)) + # right bitwise shift to exclude the 5-bit chunk that was encoded + difference >>= 5 + + # encode the last 5-bit chunk + compressed_values += (chr(difference + 63)) + # store number to subtract from next + previous_num = number + + return compressed_values def decompress_number(text, index): - """Decode a single number from a string created with polyline encoding. - - Parameters - ---------- - text : str - String representing a compressed list of numbers. - index : int - Index of the first character to start decoding a number. 
- - Returns - ------- - Index to start decoding the next number and the number decoded - in the current function call. - """ - number = 0 - bitwise_shift = 0 - - while True: - # subtract 63 and remove bit from OR with 0x20 if 5-bit chunk - # is not the last to decode a number that was in the original list - n = (ord(text[index]) - 63) - index += 1 - # only continue if there is a continuation bit (0b100000) - # e.g.: 0b11111 only has 5 bits, meaning it is the last chunk - # that is necessary to decode the number - if n >= 0x20: - # subtract 0x20 (0b100000) to get the 5-bit chunk - n -= 0x20 - # contruct the binary number with biwise shift to add each - # 5-bit chunk to original position - number = number | (n << bitwise_shift) - # increment bitwise shift value for next 5-bit chunk - bitwise_shift += 5 - else: - break - - # add the last chunk, without continuation bit, to the leftmost position - number = number | (n << bitwise_shift) - - # invert bits to get negative number if sign bit is 1 - # remove sign bit and keep only bits for decoded value - return index, (~number >> 1) if (number & 1) != 0 else (number >> 1) + """Decode a single number from a string created with polyline encoding. + + Parameters + ---------- + text : str + String representing a compressed list of numbers. + index : int + Index of the first character to start decoding a number. + + Returns + ------- + Index to start decoding the next number and the number decoded + in the current function call. + """ + number = 0 + bitwise_shift = 0 + + while True: + # subtract 63 and remove bit from OR with 0x20 if 5-bit chunk + # is not the last to decode a number that was in the original list + n = (ord(text[index]) - 63) + index += 1 + # only continue if there is a continuation bit (0b100000) + # e.g.: 0b11111 only has 5 bits, meaning it is the last chunk + # that is necessary to decode the number + if n >= 0x20: + # subtract 0x20 (0b100000) to get the 5-bit chunk + n -= 0x20 + # contruct the binary number with biwise shift to add each + # 5-bit chunk to original position + number = number | (n << bitwise_shift) + # increment bitwise shift value for next 5-bit chunk + bitwise_shift += 5 + else: + break + + # add the last chunk, without continuation bit, to the leftmost position + number = number | (n << bitwise_shift) + + # invert bits to get negative number if sign bit is 1 + # remove sign bit and keep only bits for decoded value + return index, (~number >> 1) if (number & 1) != 0 else (number >> 1) def polyline_decoding(text): - """Decode a list of integers compressed with polyline encoding. - - Parameters - ---------- - text : str - String representing a compressed list of numbers. - - Returns - ------- - number_list : list - List with the decoded numbers. - """ - number_list = [] - index = last_num = 0 - # decode precision value - precision = ord(text[index]) - 63 - index += 1 - while index < len(text): - # decode a number and get index to start decoding next number - index, difference = decompress_number(text, index) - # add decoded difference to get next number in original list - last_num += difference - number_list.append(last_num) - - number_list = [round(item * (10 ** (-precision)), precision) for item in number_list] - - return number_list + """Decode a list of integers compressed with polyline encoding. + + Parameters + ---------- + text : str + String representing a compressed list of numbers. + + Returns + ------- + number_list : list + List with the decoded numbers. 
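# Round-trip sketch (editor's note, not part of the whitespace patch):
# decoding an encoded list should recover the original values at the
# precision stored in the first character of the string.
decoded = polyline_decoding(polyline_encoding([3, 7, 7, 12], precision=0))
# [3, 7, 7, 12]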
+ """ + number_list = [] + index = last_num = 0 + # decode precision value + precision = ord(text[index]) - 63 + index += 1 + while index < len(text): + # decode a number and get index to start decoding next number + index, difference = decompress_number(text, index) + # add decoded difference to get next number in original list + last_num += difference + number_list.append(last_num) + + number_list = [round(item * (10 ** (-precision)), precision) for item in number_list] + + return number_list def replace_list_values(input_list, replace_dict): - """Replace values in list based on provided substitutions. + """Replace values in list based on provided substitutions. - Parameters - ---------- - input_list : list - List with values to substitute. - replace_dict : dict - Mapping between the values to substitute - and the values to substitute by. + Parameters + ---------- + input_list : list + List with values to substitute. + replace_dict : dict + Mapping between the values to substitute + and the values to substitute by. - Returns - ------- - replaced_list : list - List with substituted values (values - that are not in `replace_dict` are kept - unchanged). - """ - replaced_list = [replace_dict.get(e, e) for e in input_list] + Returns + ------- + replaced_list : list + List with substituted values (values + that are not in `replace_dict` are kept + unchanged). + """ + replaced_list = [replace_dict.get(e, e) for e in input_list] - return replaced_list + return replaced_list def replace_chars(column, missing_replace='0'): - """Replace all non-numeric characters in a column with allele identifiers. - - Parameters - ---------- - column : pandas.core.series.Series - Pandas dataframe column. - - Returns - ------- - replace_missing : pandas.core.series.Series - Input column with cells that only contain - numeric characters. - """ - # remove 'INF-' from inferred alleles - replace_inf = column.replace(to_replace='INF-', - value='', regex=True) - # replace '*' in novel alleles from schemas in Chewie-NS - # before replacing missing data cases to avoid replacing '*' with '0' - replace_inf = replace_inf.replace(to_replace='\*', - value='', regex=True) - # replace missing data with - replace_missing = replace_inf.replace(to_replace='\D+.*', - value=missing_replace, regex=True) - - return replace_missing + """Replace all non-numeric characters in a column with allele identifiers. + + Parameters + ---------- + column : pandas.core.series.Series + Pandas dataframe column. + + Returns + ------- + replace_missing : pandas.core.series.Series + Input column with cells that only contain + numeric characters. 
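# Hedged example (editor's note, not part of the whitespace patch): cleaning
# a profile column so that only allele numbers remain; 'LNF' stands in for a
# typical missing-data tag.
import pandas as pd
column = pd.Series(['1', 'INF-2', '*3', 'LNF'])
replace_chars(column).tolist()
# ['1', '2', '3', '0']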
+ """ + # remove 'INF-' from inferred alleles + replace_inf = column.replace(to_replace='INF-', + value='', regex=True) + # replace '*' in novel alleles from schemas in Chewie-NS + # before replacing missing data cases to avoid replacing '*' with '0' + replace_inf = replace_inf.replace(to_replace='\*', + value='', regex=True) + # replace missing data with + replace_missing = replace_inf.replace(to_replace='\D+.*', + value=missing_replace, regex=True) + + return replace_missing def inclusive_range(start, stop, step): - """ - """ - i = start - while i < stop: - yield i - i += step - yield stop + """ + """ + i = start + while i < stop: + yield i + i += step + yield stop diff --git a/CHEWBBACA/utils/join_profiles.py b/CHEWBBACA/utils/join_profiles.py index 23bd2214..0cb37782 100755 --- a/CHEWBBACA/utils/join_profiles.py +++ b/CHEWBBACA/utils/join_profiles.py @@ -20,96 +20,96 @@ import pandas as pd try: - from utils import file_operations as fo + from utils import file_operations as fo except ModuleNotFoundError: - from CHEWBBACA.utils import file_operations as fo + from CHEWBBACA.utils import file_operations as fo def concatenate_profiles(files, loci_list, output_file): - """Concatenate allele calling results for a common set of loci. - - Parameters - ---------- - files : list - List with the paths to the TSV files with - allele calling results. - loci_list : list - List with the identifiers of common loci. - output_file : str - Path to the output file. - - Returns - ------- - total_profiles : int - Number of profiles written to the output file. - """ - total_profiles = 0 - for f in files: - # create dataframe with columns for loci in list - profiles = pd.read_csv(f, sep='\t', - usecols=loci_list, dtype=str) - # reorder columns to prevent concatenating results - # with same loci but with different loci/column order - profiles = profiles[loci_list] - total_profiles += len(profiles) - # save dataframe to file - profiles.to_csv(output_file, mode='a', - header=not os.path.exists(output_file), - sep='\t', index=False) - - return total_profiles + """Concatenate allele calling results for a common set of loci. + + Parameters + ---------- + files : list + List with the paths to the TSV files with + allele calling results. + loci_list : list + List with the identifiers of common loci. + output_file : str + Path to the output file. + + Returns + ------- + total_profiles : int + Number of profiles written to the output file. + """ + total_profiles = 0 + for f in files: + # create dataframe with columns for loci in list + profiles = pd.read_csv(f, sep='\t', + usecols=loci_list, dtype=str) + # reorder columns to prevent concatenating results + # with same loci but with different loci/column order + profiles = profiles[loci_list] + total_profiles += len(profiles) + # save dataframe to file + profiles.to_csv(output_file, mode='a', + header=not os.path.exists(output_file), + sep='\t', index=False) + + return total_profiles def main(profiles, output_file, common): - """Join files with allelic profiles. - - Parameters - ---------- - profiles : list - List with paths to TSV files with allelic profiles. - output_file : str - Path to the output file. - common : bool - If the process should join profile data only for shared loci - when the profiles do not share the same loci sets. - """ - if len(profiles) == 1: - sys.exit('Provided a single file. 
Nothing to do.') - - headers = [] - for file in profiles: - header = fo.read_lines(file, strip=True, num_lines=1) - headers.append(header[0].split('\t')) - - if common is False: - # check if headers are equal - if all([set(headers[0]) == set(h) for h in headers[1:]]) is True: - print('Profiles have {0} loci.'.format(len(headers[0])-1)) - total_profiles = concatenate_profiles(profiles, - headers[0], - output_file) - else: - sys.exit('Files have different sets of loci. Please provide ' - 'files with the results for the same set of loci or ' - 'provide the "--common" parameter to create a file ' - 'with the results for the set of common loci.') - else: - # determine set of common loci - common_loci = headers[0] - for h in headers[1:]: - # determine common loci ordered based on headers in first file - latest_common = [locus for locus in common_loci if locus in h] - # update set of common loci so far - common_loci = latest_common - - if len(latest_common) <= 1: - sys.exit('Profiles do not have loci in common.') - - print('Profiles have {0} loci in common.' - ''.format(len(latest_common)-1)) - total_profiles = concatenate_profiles(profiles, - latest_common, - output_file) - - print('Joined {0} files with a total of {1} ' - 'profiles.'.format(len(profiles), total_profiles)) + """Join files with allelic profiles. + + Parameters + ---------- + profiles : list + List with paths to TSV files with allelic profiles. + output_file : str + Path to the output file. + common : bool + If the process should join profile data only for shared loci + when the profiles do not share the same loci sets. + """ + if len(profiles) == 1: + sys.exit('Provided a single file. Nothing to do.') + + headers = [] + for file in profiles: + header = fo.read_lines(file, strip=True, num_lines=1) + headers.append(header[0].split('\t')) + + if common is False: + # check if headers are equal + if all([set(headers[0]) == set(h) for h in headers[1:]]) is True: + print('Profiles have {0} loci.'.format(len(headers[0])-1)) + total_profiles = concatenate_profiles(profiles, + headers[0], + output_file) + else: + sys.exit('Files have different sets of loci. Please provide ' + 'files with the results for the same set of loci or ' + 'provide the "--common" parameter to create a file ' + 'with the results for the set of common loci.') + else: + # determine set of common loci + common_loci = headers[0] + for h in headers[1:]: + # determine common loci ordered based on headers in first file + latest_common = [locus for locus in common_loci if locus in h] + # update set of common loci so far + common_loci = latest_common + + if len(latest_common) <= 1: + sys.exit('Profiles do not have loci in common.') + + print('Profiles have {0} loci in common.' + ''.format(len(latest_common)-1)) + total_profiles = concatenate_profiles(profiles, + latest_common, + output_file) + + print('Joined {0} files with a total of {1} ' + 'profiles.'.format(len(profiles), total_profiles)) diff --git a/CHEWBBACA/utils/mafft_wrapper.py b/CHEWBBACA/utils/mafft_wrapper.py index 9c754b54..e5323d9b 100644 --- a/CHEWBBACA/utils/mafft_wrapper.py +++ b/CHEWBBACA/utils/mafft_wrapper.py @@ -16,43 +16,43 @@ import subprocess try: - from utils import constants as ct + from utils import constants as ct except ModuleNotFoundError: - from CHEWBBACA.utils import constants as ct + from CHEWBBACA.utils import constants as ct def call_mafft(input_file, output_file): - """Call MAFFT to compute a MSA. 
- - Parameters - ---------- - input_file : str - Path to a FASTA file with the sequences to align. - output_file : str - Path to the output file created by MAFFT. - - Returns - ------- - output_file : str - Path to the output file. - outfile_exists : bool - True if the output file was created, False otherwise. - stdout : bytes - MAFFT stdout. - stderr : bytes - MAFFT stderr. - """ - mafft_cmd = [ct.MAFFT_ALIAS, '--thread', '1', '--treeout', '--retree', '1', - '--maxiterate', '0', input_file, '>', output_file] - mafft_cmd = ' '.join(mafft_cmd) - - # Must include subprocess.PIPE to get stdout and stderr - mafft_cmd = subprocess.Popen(mafft_cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - stdout, stderr = mafft_cmd.communicate() - - outfile_exists = os.path.exists(output_file) - - return [output_file, outfile_exists, stdout, stderr] + """Call MAFFT to compute a MSA. + + Parameters + ---------- + input_file : str + Path to a FASTA file with the sequences to align. + output_file : str + Path to the output file created by MAFFT. + + Returns + ------- + output_file : str + Path to the output file. + outfile_exists : bool + True if the output file was created, False otherwise. + stdout : bytes + MAFFT stdout. + stderr : bytes + MAFFT stderr. + """ + mafft_cmd = [ct.MAFFT_ALIAS, '--thread', '1', '--treeout', '--retree', '1', + '--maxiterate', '0', input_file, '>', output_file] + mafft_cmd = ' '.join(mafft_cmd) + + # Must include subprocess.PIPE to get stdout and stderr + mafft_cmd = subprocess.Popen(mafft_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + stdout, stderr = mafft_cmd.communicate() + + outfile_exists = os.path.exists(output_file) + + return [output_file, outfile_exists, stdout, stderr] diff --git a/CHEWBBACA/utils/multiprocessing_operations.py b/CHEWBBACA/utils/multiprocessing_operations.py index c6c0ae34..8f552a9f 100644 --- a/CHEWBBACA/utils/multiprocessing_operations.py +++ b/CHEWBBACA/utils/multiprocessing_operations.py @@ -17,229 +17,229 @@ import multiprocessing.pool try: - from utils import iterables_manipulation as im + from utils import iterables_manipulation as im except ModuleNotFoundError: - from CHEWBBACA.utils import iterables_manipulation as im + from CHEWBBACA.utils import iterables_manipulation as im def function_helper(input_args): - """Run function with provided inputs and capture exceptions. - - Parameters - ---------- - input_args : list - List with function inputs and function object to call - in the last index. - - Returns - ------- - results : list - List with the results returned by the function. - If an exception is raised it returns a list with - the name of the function and the exception traceback. - """ - try: - results = input_args[-1](*input_args[0:-1]) - except Exception as e: - func_name = (input_args[-1]).__name__ - traceback_lines = traceback.format_exc() - traceback_text = ''.join(traceback_lines) - print('\nError on {0}:\n{1}\n'.format(func_name, traceback_text), flush=True) - results = [func_name, traceback_text] - - return results + """Run function with provided inputs and capture exceptions. + + Parameters + ---------- + input_args : list + List with function inputs and function object to call + in the last index. + + Returns + ------- + results : list + List with the results returned by the function. + If an exception is raised it returns a list with + the name of the function and the exception traceback. 
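# Minimal sketch (editor's note, not part of the whitespace patch): the
# callable goes in the last position of the argument list; its return value
# is passed through, or the function name and traceback are returned on error.
result = function_helper([2, 3, pow])
# 8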
+ """ + try: + results = input_args[-1](*input_args[0:-1]) + except Exception as e: + func_name = (input_args[-1]).__name__ + traceback_lines = traceback.format_exc() + traceback_text = ''.join(traceback_lines) + print('\nError on {0}:\n{1}\n'.format(func_name, traceback_text), flush=True) + results = [func_name, traceback_text] + + return results def map_async_parallelizer(inputs, function, cpu, callback='extend', - chunksize=1, show_progress=False, pool_type='pool'): - """Run function in parallel. - - Parameters - ---------- - inputs : list - List with inputs to process. - function : func - Function to be parallelized. - cpu : int - Number of processes to create (based on the - number of CPU cores). - callback : str - Results can be appended, "append", to the - list that stores results or the list of results - can be extended, "extend". - chunksize : int - Size of input chunks that will be passed to - each process. The function will create groups - of inputs with this number of elements. - show_progress : bool - True to show a progress bar with the percentage - of inputs that have been processed, False - otherwise. - pool_type : str - The multiprocessing.pool object that will be used, - Pool or ThreadPool. - - Returns - ------- - results : list - List with the results returned for each function - call. - """ - if pool_type == 'pool': - multiprocessing_function = multiprocessing.pool.Pool - # Gene prediction uses ThreadPool because Pyrodigal might hang with Pool - elif pool_type == 'threadpool': - multiprocessing_function = multiprocessing.pool.ThreadPool - - results = [] - # Use context manager to join and close pool automatically - with multiprocessing_function(cpu) as pool: - if callback == 'extend': - rawr = pool.map_async(function, inputs, - callback=results.extend, chunksize=chunksize) - elif callback == 'append': - rawr = pool.map_async(function, inputs, - callback=results.append, chunksize=chunksize) - - if show_progress is True: - progress = None - while progress != 100: - progress = progress_bar(rawr._number_left, len(inputs), progress) - - rawr.wait() - - return results + chunksize=1, show_progress=False, pool_type='pool'): + """Run function in parallel. + + Parameters + ---------- + inputs : list + List with inputs to process. + function : func + Function to be parallelized. + cpu : int + Number of processes to create (based on the + number of CPU cores). + callback : str + Results can be appended, "append", to the + list that stores results or the list of results + can be extended, "extend". + chunksize : int + Size of input chunks that will be passed to + each process. The function will create groups + of inputs with this number of elements. + show_progress : bool + True to show a progress bar with the percentage + of inputs that have been processed, False + otherwise. + pool_type : str + The multiprocessing.pool object that will be used, + Pool or ThreadPool. + + Returns + ------- + results : list + List with the results returned for each function + call. 
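# Minimal sketch (editor's note, not part of the whitespace patch):
# parallelizing a simple worker over packed input groups with the
# function_helper defined above.
from operator import add
input_groups = [[1, 2, add], [3, 4, add]]
results = map_async_parallelizer(input_groups, function_helper, cpu=2)
# [3, 7]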
+ """ + if pool_type == 'pool': + multiprocessing_function = multiprocessing.pool.Pool + # Gene prediction uses ThreadPool because Pyrodigal might hang with Pool + elif pool_type == 'threadpool': + multiprocessing_function = multiprocessing.pool.ThreadPool + + results = [] + # Use context manager to join and close pool automatically + with multiprocessing_function(cpu) as pool: + if callback == 'extend': + rawr = pool.map_async(function, inputs, + callback=results.extend, chunksize=chunksize) + elif callback == 'append': + rawr = pool.map_async(function, inputs, + callback=results.append, chunksize=chunksize) + + if show_progress is True: + progress = None + while progress != 100: + progress = progress_bar(rawr._number_left, len(inputs), progress) + + rawr.wait() + + return results def progress_bar(remaining, total, previous, tickval=5, ticknum=20): - """Create and print a progress bar to the stdout. - - Parameters - ---------- - remaining : int - Number of remaining tasks to complete. - total : int - Total number of inputs that have to be processed. - previous : int - Percentage of tasks that had been completed in the - previous function call. - tickval : int - Progress completion percentage value for each - tick. - ticknum : int - Total number of ticks in progress bar. - - Returns - ------- - completed : bool - Boolean indicating if all inputs have been processed. - """ - # determine percentage of processed inputs - progress = int(100-(remaining/total)*100) - # only print if percentage has changed - if progress != previous: - progress_tick = progress//tickval - progress_bar = '[{0}{1}] {2}%'.format('='*progress_tick, - ' '*(ticknum-progress_tick), - progress) - print('\r', progress_bar, end='') - - time.sleep(0.1) - - return progress + """Create and print a progress bar to the stdout. + + Parameters + ---------- + remaining : int + Number of remaining tasks to complete. + total : int + Total number of inputs that have to be processed. + previous : int + Percentage of tasks that had been completed in the + previous function call. + tickval : int + Progress completion percentage value for each + tick. + ticknum : int + Total number of ticks in progress bar. + + Returns + ------- + completed : bool + Boolean indicating if all inputs have been processed. + """ + # determine percentage of processed inputs + progress = int(100-(remaining/total)*100) + # only print if percentage has changed + if progress != previous: + progress_tick = progress//tickval + progress_bar = '[{0}{1}] {2}%'.format('='*progress_tick, + ' '*(ticknum-progress_tick), + progress) + print('\r', progress_bar, end='') + + time.sleep(0.1) + + return progress def distribute_loci(inputs, cores, method): - """Create balanced lists of loci to efficiently parallelize function calls. - - Creates balanced lists of loci to distribute per number of - available cores. Loci lists can be created based on the number - of sequences per locus (seqcount), the mean length of the - sequences (length) in each locus or the product of both values - (seqcount+length). - - Parameters - ---------- - inputs : list - List with one sublist per locus. Each sublist has - a locus identifier, the total number of sequences - and sequence mean legth for that locus. - cores : int - The number of loci groups that should be created. - Based on the number of CPU cores that will be - used to process the inputs. 
- method : str - "seqcount" to create loci lists based on the total - number of sequences, "length" to split based - on mean length of sequences and "seqcount+length" to - split based on both criteria. - - Returns - ------- - splitted_ids : list - List with sublists that contain loci identifiers. - Sublists are balanced based on the chosen method. - """ - # initialize list with sublists to store inputs - splitted_ids = [[] for cpu in range(cores)] - # initialize list with chosen criterion values - # for each sublist of inputs - splitted_values = [0 for cpu in range(cores)] - i = 0 - for locus in inputs: - if method == 'seqcount': - splitted_values[i] += locus[1] - elif method == 'length': - splitted_values[i] += locus[4] - elif method == 'seqcount+length': - splitted_values[i] += locus[1] * locus[4] - splitted_ids[i].append(locus[0]) - # at the end of each iteration, choose the sublist - # with lowest criterion value - i = splitted_values.index(min(splitted_values)) - - return splitted_ids + """Create balanced lists of loci to efficiently parallelize function calls. + + Creates balanced lists of loci to distribute per number of + available cores. Loci lists can be created based on the number + of sequences per locus (seqcount), the mean length of the + sequences (length) in each locus or the product of both values + (seqcount+length). + + Parameters + ---------- + inputs : list + List with one sublist per locus. Each sublist has + a locus identifier, the total number of sequences + and sequence mean legth for that locus. + cores : int + The number of loci groups that should be created. + Based on the number of CPU cores that will be + used to process the inputs. + method : str + "seqcount" to create loci lists based on the total + number of sequences, "length" to split based + on mean length of sequences and "seqcount+length" to + split based on both criteria. + + Returns + ------- + splitted_ids : list + List with sublists that contain loci identifiers. + Sublists are balanced based on the chosen method. + """ + # initialize list with sublists to store inputs + splitted_ids = [[] for cpu in range(cores)] + # initialize list with chosen criterion values + # for each sublist of inputs + splitted_values = [0 for cpu in range(cores)] + i = 0 + for locus in inputs: + if method == 'seqcount': + splitted_values[i] += locus[1] + elif method == 'length': + splitted_values[i] += locus[4] + elif method == 'seqcount+length': + splitted_values[i] += locus[1] * locus[4] + splitted_ids[i].append(locus[0]) + # at the end of each iteration, choose the sublist + # with lowest criterion value + i = splitted_values.index(min(splitted_values)) + + return splitted_ids def parallelize_function(function, inputs, common_args=None, - cpu_cores=1, show_progress=False): - """Create list of inputs and parallelize function calls. - - Parameters - ---------- - function : func - Function to be parallelized. - inputs : list - List of inputs to divide into sublists. - common_args : list - List of common arguments to add to each sublist. - The common args are values that will be passed to the - function. - cpu_cores : int - Number of CPU cores used to parallelize the function. - show_progress : bool - True to show a progress bar for the percentage of - inputs that have been processed, False otherwise. - - Returns - ------- - results : list - List with the results returned by the function calls. 
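# Worked example of the greedy balancing above (hypothetical loci records): each record
# is assumed to hold the locus identifier at index 0, the sequence count at index 1 and
# the mean sequence length at index 4, matching the indices used in the function.
loci_records = [['locus1', 50, None, None, 300],
				['locus2', 10, None, None, 900],
				['locus3', 40, None, None, 250]]
# With method='seqcount' and cores=2, locus1 fills the first group (50 sequences) and
# locus2 plus locus3 fill the second (10+40), keeping the totals balanced.
distribute_loci(loci_records, 2, 'seqcount')   # [['locus1'], ['locus2', 'locus3']]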
- """ - # create chunks to distribute per cores - input_lists = im.divide_list_into_n_chunks(inputs, len(inputs)) - - if common_args is None: - common_args = [] - - # add common arguments to all sublists - input_lists = im.multiprocessing_inputs(input_lists, - common_args, - function) - - results = map_async_parallelizer(input_lists, - function_helper, - cpu_cores, - show_progress=show_progress) - - return results + cpu_cores=1, show_progress=False): + """Create list of inputs and parallelize function calls. + + Parameters + ---------- + function : func + Function to be parallelized. + inputs : list + List of inputs to divide into sublists. + common_args : list + List of common arguments to add to each sublist. + The common args are values that will be passed to the + function. + cpu_cores : int + Number of CPU cores used to parallelize the function. + show_progress : bool + True to show a progress bar for the percentage of + inputs that have been processed, False otherwise. + + Returns + ------- + results : list + List with the results returned by the function calls. + """ + # create chunks to distribute per cores + input_lists = im.divide_list_into_n_chunks(inputs, len(inputs)) + + if common_args is None: + common_args = [] + + # add common arguments to all sublists + input_lists = im.multiprocessing_inputs(input_lists, + common_args, + function) + + results = map_async_parallelizer(input_lists, + function_helper, + cpu_cores, + show_progress=show_progress) + + return results diff --git a/CHEWBBACA/utils/process_datetime.py b/CHEWBBACA/utils/process_datetime.py index fcb45304..70042ae6 100644 --- a/CHEWBBACA/utils/process_datetime.py +++ b/CHEWBBACA/utils/process_datetime.py @@ -18,139 +18,139 @@ def get_datetime(): - """Return datetime object for current date and hour.""" - current_datetime = dt.datetime.now() + """Return datetime object for current date and hour.""" + current_datetime = dt.datetime.now() - return current_datetime + return current_datetime def datetime_str(datetime_obj, date_format='%Y-%m-%dT%H:%M:%S'): - """Convert datetime module object to formatted string. + """Convert datetime module object to formatted string. - Parameters - ---------- - datetime_obj : datetime.datetime - Datetime module object. - date_format : str - Format for string representation of the date - object. + Parameters + ---------- + datetime_obj : datetime.datetime + Datetime module object. + date_format : str + Format for string representation of the date + object. - Returns - ------- - dt_str : str - String representation of the date according - to specified format. - """ - dt_str = dt.datetime.strftime(datetime_obj, date_format) + Returns + ------- + dt_str : str + String representation of the date according + to specified format. + """ + dt_str = dt.datetime.strftime(datetime_obj, date_format) - return dt_str + return dt_str def datetime_obj(datetime_str, date_format='%Y-%m-%dT%H:%M:%S'): - """Convert formatted string to datetime module object. + """Convert formatted string to datetime module object. - Parameters - ---------- - datetime_str : str - String to convert to datetime object. - date_format : str - Format of the string representation of the date - object. + Parameters + ---------- + datetime_str : str + String to convert to datetime object. + date_format : str + Format of the string representation of the date + object. - Returns - ------- - dt_obj : datetime.datetime - Datetime module object. 
- """ - dt_obj = dt.datetime.strptime(datetime_str, date_format) + Returns + ------- + dt_obj : datetime.datetime + Datetime module object. + """ + dt_obj = dt.datetime.strptime(datetime_str, date_format) - return dt_obj + return dt_obj def datetime_diff(sdate, edate): - """Return the difference in minutes and seconds between two dates. - - Parameters - ---------- - sdate : datetime.datetime - Datetime module object corresponding to - the oldest date. - edate : datetime.datetime - Datetime module object corresponding to - the most recent date. - - Returns - ------- - A list with the following elements: - minutes : float - Time difference between the two dates - in minutes. - seconds : float - The remaining time difference in seconds. - """ - delta = edate - sdate - minutes, seconds = divmod(delta.total_seconds(), 60) - - return [minutes, seconds] + """Return the difference in minutes and seconds between two dates. + + Parameters + ---------- + sdate : datetime.datetime + Datetime module object corresponding to + the oldest date. + edate : datetime.datetime + Datetime module object corresponding to + the most recent date. + + Returns + ------- + A list with the following elements: + minutes : float + Time difference between the two dates + in minutes. + seconds : float + The remaining time difference in seconds. + """ + delta = edate - sdate + minutes, seconds = divmod(delta.total_seconds(), 60) + + return [minutes, seconds] def validate_date(date, date_format='%Y-%m-%dT%H:%M:%S.%f'): - """Check if date is in specified format. - - Parameters - ---------- - date : str - String representing a date. - date_format : str - Date format. - - Returns - ------- - valid : str - The string representing the date if it - is in the specified format. - """ - valid = False - try: - date = dt.datetime.strptime(date, date_format) - valid = date - except ValueError: - date = dt.datetime.strptime(date+'.0', date_format) - valid = date - - return valid + """Check if date is in specified format. + + Parameters + ---------- + date : str + String representing a date. + date_format : str + Date format. + + Returns + ------- + valid : str + The string representing the date if it + is in the specified format. 
+ """ + valid = False + try: + date = dt.datetime.strptime(date, date_format) + valid = date + except ValueError: + date = dt.datetime.strptime(date+'.0', date_format) + valid = date + + return valid def process_header(process): - """Print a header with the name of the process.""" - header = f'chewBBACA - {process}' - hf = '='*(len(header)+4) - print(f'{hf}\n {header}\n{hf}') + """Print a header with the name of the process.""" + header = f'chewBBACA - {process}' + hf = '='*(len(header)+4) + print(f'{hf}\n {header}\n{hf}') # Decorator to time main processes def process_timer(func): - # Use functools to preserve info about wrapped function - @functools.wraps(func) - def wrapper(*args, **kwargs): - # Get process name and print header - process_header(sys.argv[1]) - - # Do not measure time if it is only needed to print the help message - if any([option in ['-h', '--help'] for option in sys.argv]) is False: - start = get_datetime() - start_str = datetime_str(start) - print(f'Started at: {start_str}\n') - - # Run function - func(*args, **kwargs) - - # Does not print elapsed time if the help message is printed - end = get_datetime() - end_str = datetime_str(end) - print(f'\nFinished at: {end_str}') - - minutes, seconds = datetime_diff(start, end) - print(f'Took {minutes: .0f}m{seconds: .0f}s.') - - return wrapper + # Use functools to preserve info about wrapped function + @functools.wraps(func) + def wrapper(*args, **kwargs): + # Get process name and print header + process_header(sys.argv[1]) + + # Do not measure time if it is only needed to print the help message + if any([option in ['-h', '--help'] for option in sys.argv]) is False: + start = get_datetime() + start_str = datetime_str(start) + print(f'Started at: {start_str}\n') + + # Run function + func(*args, **kwargs) + + # Does not print elapsed time if the help message is printed + end = get_datetime() + end_str = datetime_str(end) + print(f'\nFinished at: {end_str}') + + minutes, seconds = datetime_diff(start, end) + print(f'Took {minutes: .0f}m{seconds: .0f}s.') + + return wrapper diff --git a/CHEWBBACA/utils/profile_hasher.py b/CHEWBBACA/utils/profile_hasher.py index c2048fc1..da2ebceb 100644 --- a/CHEWBBACA/utils/profile_hasher.py +++ b/CHEWBBACA/utils/profile_hasher.py @@ -16,193 +16,193 @@ import pandas as pd try: - from utils import (file_operations as fo, - fasta_operations as fao, - iterables_manipulation as im, - multiprocessing_operations as mo) + from utils import (file_operations as fo, + fasta_operations as fao, + iterables_manipulation as im, + multiprocessing_operations as mo) except ModuleNotFoundError: - from CHEWBBACA.utils import (file_operations as fo, - fasta_operations as fao, - iterables_manipulation as im, - multiprocessing_operations as mo) + from CHEWBBACA.utils import (file_operations as fo, + fasta_operations as fao, + iterables_manipulation as im, + multiprocessing_operations as mo) def hash_column(column, locus_file, hashing_function): - """Substitute allele identifiers by allele sequence hashes. - - Parameters - ---------- - column : pandas.core.series.Series - Column with allele identifiers to substitute by the - hash values computed from each allele sequence. - locus_file : str - Path to the FASTA file that contains the locus alleles. - hashing_function : func - Hashing function used to hash each allele. - - Returns - ------- - hashed_column : pandas.core.series.Series - Column where each allele identifier was substituted by - the hash computed from the allele sequence. 
- """ - # read Fasta files with locus alleles - locus_alleles = {(rec.id).split('_')[-1].replace('*', ''): str(rec.seq) - for rec in fao.sequence_generator(locus_file[0])} - if len(locus_file) > 1: - novel_records = {(rec.id).split('_')[-1].replace('*', ''): str(rec.seq) - for rec in fao.sequence_generator(locus_file[1])} - locus_alleles = im.merge_dictionaries([locus_alleles, novel_records], True) - - hashed_alleles = {} - for seqid, seq in locus_alleles.items(): - # hash function does not accept string object, encode to get bytes object - hashed_seq = hashing_function(seq.encode()) - # bitwise operation to convert crc32 and adler32 hashes to unsigned - # integer and ensure the computed value is the same for Python 2 & 3 - if isinstance(hashed_seq, int): - hashed_seq &= 0xffffffff - else: - hashed_seq = hashed_seq.hexdigest() - hashed_alleles[seqid] = hashed_seq - - # faster than replace or map with update to avoid adding NaN - hashed_column = column.apply(lambda x: hashed_alleles.get(x, x)) - - return hashed_column + """Substitute allele identifiers by allele sequence hashes. + + Parameters + ---------- + column : pandas.core.series.Series + Column with allele identifiers to substitute by the + hash values computed from each allele sequence. + locus_file : str + Path to the FASTA file that contains the locus alleles. + hashing_function : func + Hashing function used to hash each allele. + + Returns + ------- + hashed_column : pandas.core.series.Series + Column where each allele identifier was substituted by + the hash computed from the allele sequence. + """ + # read Fasta files with locus alleles + locus_alleles = {(rec.id).split('_')[-1].replace('*', ''): str(rec.seq) + for rec in fao.sequence_generator(locus_file[0])} + if len(locus_file) > 1: + novel_records = {(rec.id).split('_')[-1].replace('*', ''): str(rec.seq) + for rec in fao.sequence_generator(locus_file[1])} + locus_alleles = im.merge_dictionaries([locus_alleles, novel_records], True) + + hashed_alleles = {} + for seqid, seq in locus_alleles.items(): + # hash function does not accept string object, encode to get bytes object + hashed_seq = hashing_function(seq.encode()) + # bitwise operation to convert crc32 and adler32 hashes to unsigned + # integer and ensure the computed value is the same for Python 2 & 3 + if isinstance(hashed_seq, int): + hashed_seq &= 0xffffffff + else: + hashed_seq = hashed_seq.hexdigest() + hashed_alleles[seqid] = hashed_seq + + # faster than replace or map with update to avoid adding NaN + hashed_column = column.apply(lambda x: hashed_alleles.get(x, x)) + + return hashed_column def hash_profiles(profiles_table, loci_ids, loci_files, hashing_function, - nrows, skiprows, output_directory): - """Hash a set of allelic profiles read from a TSV file. - - Parameters - ---------- - profiles_table : str - Path to the TSV file that contains the allelic profiles. - loci_ids : list - List with the loci identifiers. - loci_files : list - List with the paths to the FASTA files that contain the - loci alleles. - hashing_function : func - Hashing function used to hash each allele. - nrows : int - Number of rows/allelic profiles to read from the input - file. - skiprows : range - Range of rows to skip. - output_directory : str - Path to the output directory. - - Returns - ------- - output_file : str - Path to the output file with the hashed profiles. 
- """ - current_rows = pd.read_csv(profiles_table, delimiter='\t', dtype=str, - skiprows=skiprows, nrows=nrows, index_col=0) - - # Remove all 'INF-' prefixes, missing data and '*' from identifiers - current_rows = current_rows.apply(im.replace_chars, args=('-')) - - hashed_profiles = [] - for locus in loci_ids: - locus_column = current_rows[locus] - hashed_column = hash_column(locus_column, loci_files[locus], - hashing_function) - hashed_profiles.append(hashed_column) - - hashed_df = pd.concat(hashed_profiles, axis=1) - start = skiprows.stop - stop = skiprows.stop+len(hashed_df)-1 - input_basename = fo.file_basename(profiles_table, False) - output_file = fo.join_paths(output_directory, - ['{0}_{1}-{2}_hashed.tsv'.format(input_basename, start, stop)]) - hashed_df.to_csv(output_file, sep='\t', index=True, header=False) - - return output_file + nrows, skiprows, output_directory): + """Hash a set of allelic profiles read from a TSV file. + + Parameters + ---------- + profiles_table : str + Path to the TSV file that contains the allelic profiles. + loci_ids : list + List with the loci identifiers. + loci_files : list + List with the paths to the FASTA files that contain the + loci alleles. + hashing_function : func + Hashing function used to hash each allele. + nrows : int + Number of rows/allelic profiles to read from the input + file. + skiprows : range + Range of rows to skip. + output_directory : str + Path to the output directory. + + Returns + ------- + output_file : str + Path to the output file with the hashed profiles. + """ + current_rows = pd.read_csv(profiles_table, delimiter='\t', dtype=str, + skiprows=skiprows, nrows=nrows, index_col=0) + + # Remove all 'INF-' prefixes, missing data and '*' from identifiers + current_rows = current_rows.apply(im.replace_chars, args=('-')) + + hashed_profiles = [] + for locus in loci_ids: + locus_column = current_rows[locus] + hashed_column = hash_column(locus_column, loci_files[locus], + hashing_function) + hashed_profiles.append(hashed_column) + + hashed_df = pd.concat(hashed_profiles, axis=1) + start = skiprows.stop + stop = skiprows.stop+len(hashed_df)-1 + input_basename = fo.file_basename(profiles_table, False) + output_file = fo.join_paths(output_directory, + ['{0}_{1}-{2}_hashed.tsv'.format(input_basename, start, stop)]) + hashed_df.to_csv(output_file, sep='\t', index=True, header=False) + + return output_file def main(profiles_table, schema_directory, output_directory, hash_type, - cpu_cores, nrows, updated_files, no_inferred): - """Hash allele identifiers in a matrix of allelic profiles. - - Parameters - ---------- - profiles_table : str - Path to a TSV file with allelic profiles determined by the - AlleleCall module. - schema_directory : str - Path to the directory of the schema used to determine the - allelic profiles. - output_directory : str - Path to the output directory. - hash_type : str - Hashing algorithm to use. - cpu_cores : int - Number of CPU cores used by the process. - nrows : int - Divide input file into subsets to process more efficiently. - updated_files : dict - Dictionary with paths to schema FASTA files as keys and paths - to FASTA files updated by allele calling as values. Only used - if `no_inferred` is True. - no_inferred : bool - If the allele calling process did not add inferred alleles to - the schema. 
- """ - # Get hash function - hashing_function = getattr(hashlib, hash_type, None) - if hashing_function is None: - hashing_function = getattr(zlib, hash_type, None) - - if hashing_function is None: - print('{0} hash function is not available in ' - 'hashlib or zlib modules.'.format(hash_type)) - return False - - # Get loci identifiers - with open(profiles_table, 'r') as infile: - header = infile.readline() - loci_ids = header.split()[1:] - - loci_files = {} - for locus in loci_ids: - locus_file = fo.join_paths(schema_directory, [locus]) - # Add .fasta extension if file headers did not include it - if locus_file.endswith('.fasta') is False: - locus_file += '.fasta' - loci_files[locus] = [locus_file] - if locus_file in updated_files and no_inferred is True: - loci_files[locus].append(updated_files[locus_file][0]) - - # Get input/sample identifiers - sample_ids = pd.read_csv(profiles_table, delimiter='\t', - dtype=str, usecols=['FILE']) - - # Write file with header - header_basename = fo.file_basename(profiles_table).replace('.tsv', '_header.tsv') - header_file = fo.join_paths(output_directory, [header_basename]) - fo.write_to_file(header, header_file, 'w', '') - - # Create multiprocessing inputs - multi_inputs = [] - # Divide and process by row chunks - for i in range(0, len(sample_ids), nrows): - multi_inputs.append([profiles_table, loci_ids, loci_files, - hashing_function, nrows, range(1, i+1), - output_directory, hash_profiles]) - - hashed_files = mo.map_async_parallelizer(multi_inputs, mo.function_helper, - cpu_cores) - - # Concatenate all files - output_basename = fo.file_basename(profiles_table).replace('.tsv', '_hashed.tsv') - output_file = fo.join_paths(output_directory, [output_basename]) - fo.concatenate_files([header_file]+hashed_files, output_file) - - # Delete intermediate dataframes - fo.remove_files([header_file]+hashed_files) - - return output_file + cpu_cores, nrows, updated_files, no_inferred): + """Hash allele identifiers in a matrix of allelic profiles. + + Parameters + ---------- + profiles_table : str + Path to a TSV file with allelic profiles determined by the + AlleleCall module. + schema_directory : str + Path to the directory of the schema used to determine the + allelic profiles. + output_directory : str + Path to the output directory. + hash_type : str + Hashing algorithm to use. + cpu_cores : int + Number of CPU cores used by the process. + nrows : int + Divide input file into subsets to process more efficiently. + updated_files : dict + Dictionary with paths to schema FASTA files as keys and paths + to FASTA files updated by allele calling as values. Only used + if `no_inferred` is True. + no_inferred : bool + If the allele calling process did not add inferred alleles to + the schema. 
+ """ + # Get hash function + hashing_function = getattr(hashlib, hash_type, None) + if hashing_function is None: + hashing_function = getattr(zlib, hash_type, None) + + if hashing_function is None: + print('{0} hash function is not available in ' + 'hashlib or zlib modules.'.format(hash_type)) + return False + + # Get loci identifiers + with open(profiles_table, 'r') as infile: + header = infile.readline() + loci_ids = header.split()[1:] + + loci_files = {} + for locus in loci_ids: + locus_file = fo.join_paths(schema_directory, [locus]) + # Add .fasta extension if file headers did not include it + if locus_file.endswith('.fasta') is False: + locus_file += '.fasta' + loci_files[locus] = [locus_file] + if locus_file in updated_files and no_inferred is True: + loci_files[locus].append(updated_files[locus_file][0]) + + # Get input/sample identifiers + sample_ids = pd.read_csv(profiles_table, delimiter='\t', + dtype=str, usecols=['FILE']) + + # Write file with header + header_basename = fo.file_basename(profiles_table).replace('.tsv', '_header.tsv') + header_file = fo.join_paths(output_directory, [header_basename]) + fo.write_to_file(header, header_file, 'w', '') + + # Create multiprocessing inputs + multi_inputs = [] + # Divide and process by row chunks + for i in range(0, len(sample_ids), nrows): + multi_inputs.append([profiles_table, loci_ids, loci_files, + hashing_function, nrows, range(1, i+1), + output_directory, hash_profiles]) + + hashed_files = mo.map_async_parallelizer(multi_inputs, mo.function_helper, + cpu_cores) + + # Concatenate all files + output_basename = fo.file_basename(profiles_table).replace('.tsv', '_hashed.tsv') + output_file = fo.join_paths(output_directory, [output_basename]) + fo.concatenate_files([header_file]+hashed_files, output_file) + + # Delete intermediate dataframes + fo.remove_files([header_file]+hashed_files) + + return output_file diff --git a/CHEWBBACA/utils/profiles_sqlitedb.py b/CHEWBBACA/utils/profiles_sqlitedb.py index 4c99e060..cb4f603b 100755 --- a/CHEWBBACA/utils/profiles_sqlitedb.py +++ b/CHEWBBACA/utils/profiles_sqlitedb.py @@ -26,690 +26,690 @@ import sqlite3 try: - from utils import process_datetime as pd + from utils import process_datetime as pd except ModuleNotFoundError: - from CHEWBBACA.utils import process_datetime as pd + from CHEWBBACA.utils import process_datetime as pd def create_database_file(db_file): - """Create a SQLite database file. - - If the database file already exists, - it will establish and close connection. - - Parameters - ---------- - df_file : str - Path to the SQLite database file. - - Returns - ------- - error : None or sqlite3.OperationalError - None if the SQLite database file was - successfully created, OperationalError - if it could not create/establish connection - """ - conn = None - error = None - try: - # Creates db file if it does not exist - conn = sqlite3.connect(db_file) - except Exception as e: - error = e - finally: - if conn: - conn.close() - - return error + """Create a SQLite database file. + + If the database file already exists, + it will establish and close connection. + + Parameters + ---------- + df_file : str + Path to the SQLite database file. 
+ + Returns + ------- + error : None or sqlite3.OperationalError + None if the SQLite database file was + successfully created, OperationalError + if it could not create/establish connection + """ + conn = None + error = None + try: + # Creates db file if it does not exist + conn = sqlite3.connect(db_file) + except Exception as e: + error = e + finally: + if conn: + conn.close() + + return error def create_connection(db_file): - """Create a database connection to a SQLite database. + """Create a database connection to a SQLite database. - Parameters - ---------- - db_file: str - Path to the SQLite database file. + Parameters + ---------- + db_file: str + Path to the SQLite database file. - Returns - ------- - conn : sqlite3.Connection or sqlite3.OperationalError - SQLite Connection object if connection was - successfull or error if it was not possible - to connect to the database. - """ - try: - conn = sqlite3.connect(db_file) - except Exception as e: - conn = e + Returns + ------- + conn : sqlite3.Connection or sqlite3.OperationalError + SQLite Connection object if connection was + successfull or error if it was not possible + to connect to the database. + """ + try: + conn = sqlite3.connect(db_file) + except Exception as e: + conn = e - return conn + return conn def execute_statement(conn, statement): - """Execute a SQL statement. - - Parameters - ---------- - conn : sqlite3.Connection - SQLite Connection object. - statement : str - SQL statement to execute. - - Returns - ------- - error : None or sqlite3.OperationalError - None if the SQLite database file was - successfully created, OperationalError - if it could not create/establish connection - """ - error = None - try: - c = conn.cursor() - c.execute(statement) - return c - except Exception as e: - error = e - return error + """Execute a SQL statement. + + Parameters + ---------- + conn : sqlite3.Connection + SQLite Connection object. + statement : str + SQL statement to execute. + + Returns + ------- + error : None or sqlite3.OperationalError + None if the SQLite database file was + successfully created, OperationalError + if it could not create/establish connection + """ + error = None + try: + c = conn.cursor() + c.execute(statement) + return c + except Exception as e: + error = e + return error def select_all_rows(db_file, table): - """Retrieve all rows in a table. + """Retrieve all rows in a table. - Parameters - ---------- - db_file : str - Path to the SQLite database file. - table : str - Name of the table. + Parameters + ---------- + db_file : str + Path to the SQLite database file. + table : str + Name of the table. - Returns - ------- - rows : list of tup - List of all rows in the table. Each row - is represented by a tuple with the values - for all columns. - """ - conn = create_connection(db_file) - cur = conn.cursor() - cur.execute('SELECT * FROM {0}'.format(table)) + Returns + ------- + rows : list of tup + List of all rows in the table. Each row + is represented by a tuple with the values + for all columns. + """ + conn = create_connection(db_file) + cur = conn.cursor() + cur.execute('SELECT * FROM {0}'.format(table)) - result = cur.fetchall() + result = cur.fetchall() - rows = [r for r in result] + rows = [r for r in result] - conn.close() + conn.close() - return rows + return rows def create_insert_statement(table, columns, placeholders): - """Create a base SQL insert statement. - - Parameters - ---------- - table : str - Name of the table. 
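# Minimal usage sketch for the connection helpers above (in-memory database):
conn = create_connection(':memory:')
cursor = execute_statement(conn, 'CREATE TABLE IF NOT EXISTS loci (id INTEGER PRIMARY KEY, name TEXT);')
conn.commit()
conn.close()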
- columns : list - List with the names of the columns - that values will be inserted into. - - Returns - ------- - statement : str - SQL insert statement that can be - used to insert values into `columns` - of a `table`. - """ - statement = ('INSERT OR IGNORE INTO {0}({1}) ' - 'VALUES({2});'.format(table, ','.join(columns), - ','.join(placeholders))) - - return statement + """Create a base SQL insert statement. + + Parameters + ---------- + table : str + Name of the table. + columns : list + List with the names of the columns + that values will be inserted into. + + Returns + ------- + statement : str + SQL insert statement that can be + used to insert values into `columns` + of a `table`. + """ + statement = ('INSERT OR IGNORE INTO {0}({1}) ' + 'VALUES({2});'.format(table, ','.join(columns), + ','.join(placeholders))) + + return statement def insert_loci(db_file, matrix_file): - """Insert loci into the loci table. + """Insert loci into the loci table. - Parameters - ---------- - db_file : str - Path to the SQLite database file. - matrix_file : str - Path to the TSV file with a matrix - of allelic profiles. + Parameters + ---------- + db_file : str + Path to the SQLite database file. + matrix_file : str + Path to the TSV file with a matrix + of allelic profiles. - Returns - ------- - The number of loci that were insert - into the table. - """ - matrix_lines = read_matrix(matrix_file) - loci_list = [locus.rstrip('.fasta') for locus in matrix_lines[0][1:]] + Returns + ------- + The number of loci that were insert + into the table. + """ + matrix_lines = read_matrix(matrix_file) + loci_list = [locus.rstrip('.fasta') for locus in matrix_lines[0][1:]] - conn = create_connection(db_file) - locus_sql = create_insert_statement('loci', ['name'], - ['?']) + conn = create_connection(db_file) + locus_sql = create_insert_statement('loci', ['name'], + ['?']) - loci = [(locus,) for locus in loci_list] + loci = [(locus,) for locus in loci_list] - cur = conn.cursor() - cur.executemany(locus_sql, loci) + cur = conn.cursor() + cur.executemany(locus_sql, loci) - conn.commit() - conn.close() + conn.commit() + conn.close() - return len(loci_list) + return len(loci_list) def insert_multiple(db_file, base_statement, data): - """Execute several insert statements. - - Parameters - ---------- - df_file : str - Path to the SQLite database file. - base_statement : str - Base SQL insert statement to execute. - data : list of tup - A list with tuples that contain the - column values to insert for each row. - - Returns - ------- - error : None or sqlite3.OperationalError - None if the SQL statement was successfully - inserted, SQLite OperationalError otherwise. - """ - error = None - try: - conn = create_connection(db_file) - cur = conn.cursor() - cur.executemany(base_statement, data) - conn.commit() - conn.close() - except Exception as e: - error = e - - return error + """Execute several insert statements. + + Parameters + ---------- + df_file : str + Path to the SQLite database file. + base_statement : str + Base SQL insert statement to execute. + data : list of tup + A list with tuples that contain the + column values to insert for each row. + + Returns + ------- + error : None or sqlite3.OperationalError + None if the SQL statement was successfully + inserted, SQLite OperationalError otherwise. 
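# Worked example of the statement builder above: for the loci table it produces the
# parameterised statement that insert_loci and insert_multiple execute with executemany.
create_insert_statement('loci', ['name'], ['?'])
# 'INSERT OR IGNORE INTO loci(name) VALUES(?);'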
+ """ + error = None + try: + conn = create_connection(db_file) + cur = conn.cursor() + cur.executemany(base_statement, data) + conn.commit() + conn.close() + except Exception as e: + error = e + + return error def create_database(db_file): - """Create a SQLite database to store allelic profiles. - - Parameters - ---------- - db_file : str - Path to the SQLite database file. - - Returns - ------- - True if the SQLite database file and tables were - successfully created, SQLite OperationalError otherwise. - """ - message = create_database_file(db_file) - - # samples table - # date format YYYY-MM-DDTHH:MM:SS - sql_samples_table = ('CREATE TABLE IF NOT EXISTS samples (' - 'id INTEGER PRIMARY KEY,' - 'name TEXT NOT NULL,' - 'date TEXT NOT NULL,' - 'profile_id TEXT NOT NULL,' - 'FOREIGN KEY (profile_id) REFERENCES profiles (profile_id)' - ');') - - # loci table - sql_loci_table = ('CREATE TABLE IF NOT EXISTS loci (' - 'id INTEGER PRIMARY KEY,' - 'name TEXT,' - 'annotation TEXT,' - 'custom_annotation TEXT' - ');') - - # profiles table - sql_profiles_table = ('CREATE TABLE IF NOT EXISTS profiles (' - 'profile_id TEXT PRIMARY KEY,' - 'type TEXT,' - 'date TEXT,' - 'profile_json JSON,' - 'subschema_id TEXT,' - 'FOREIGN KEY (subschema_id) REFERENCES subschemas (subschema_id)' - ');') - - # subschemas table - sql_subschemas_table = ('CREATE TABLE IF NOT EXISTS subschemas (' - 'subschema_id TEXT PRIMARY KEY,' - 'loci JSON' # JSON with schema subset - ');') - - # create tables - conn = create_connection(db_file) - - if isinstance(conn, str) is True: - return conn - else: - row = execute_statement(conn, sql_samples_table) - row = execute_statement(conn, sql_loci_table) - row = execute_statement(conn, sql_profiles_table) - row = execute_statement(conn, sql_subschemas_table) - conn.commit() - conn.close() - - return True + """Create a SQLite database to store allelic profiles. + + Parameters + ---------- + db_file : str + Path to the SQLite database file. + + Returns + ------- + True if the SQLite database file and tables were + successfully created, SQLite OperationalError otherwise. + """ + message = create_database_file(db_file) + + # samples table + # date format YYYY-MM-DDTHH:MM:SS + sql_samples_table = ('CREATE TABLE IF NOT EXISTS samples (' + 'id INTEGER PRIMARY KEY,' + 'name TEXT NOT NULL,' + 'date TEXT NOT NULL,' + 'profile_id TEXT NOT NULL,' + 'FOREIGN KEY (profile_id) REFERENCES profiles (profile_id)' + ');') + + # loci table + sql_loci_table = ('CREATE TABLE IF NOT EXISTS loci (' + 'id INTEGER PRIMARY KEY,' + 'name TEXT,' + 'annotation TEXT,' + 'custom_annotation TEXT' + ');') + + # profiles table + sql_profiles_table = ('CREATE TABLE IF NOT EXISTS profiles (' + 'profile_id TEXT PRIMARY KEY,' + 'type TEXT,' + 'date TEXT,' + 'profile_json JSON,' + 'subschema_id TEXT,' + 'FOREIGN KEY (subschema_id) REFERENCES subschemas (subschema_id)' + ');') + + # subschemas table + sql_subschemas_table = ('CREATE TABLE IF NOT EXISTS subschemas (' + 'subschema_id TEXT PRIMARY KEY,' + 'loci JSON' # JSON with schema subset + ');') + + # create tables + conn = create_connection(db_file) + + if isinstance(conn, str) is True: + return conn + else: + row = execute_statement(conn, sql_samples_table) + row = execute_statement(conn, sql_loci_table) + row = execute_statement(conn, sql_profiles_table) + row = execute_statement(conn, sql_subschemas_table) + conn.commit() + conn.close() + + return True def read_matrix(matrix_file): - """Read a TSV file that contains a matrix with allelic profiles. 
+ """Read a TSV file that contains a matrix with allelic profiles. - Parameters - ---------- - matrix_file : str - Path to the TSV file with the matrix. + Parameters + ---------- + matrix_file : str + Path to the TSV file with the matrix. - Returns - ------- - matrix_lines : list of list - A list with all the lines in the TSV file. - """ - with open(matrix_file, 'r') as m: - matrix_lines = list(csv.reader(m, delimiter='\t')) + Returns + ------- + matrix_lines : list of list + A list with all the lines in the TSV file. + """ + with open(matrix_file, 'r') as m: + matrix_lines = list(csv.reader(m, delimiter='\t')) - return matrix_lines + return matrix_lines def get_loci_ids(matrix_lines): - """Extract loci identifiers from a list with allelic profiles. + """Extract loci identifiers from a list with allelic profiles. - Parameters - ---------- - matrix_lines : list of list - A list with all the lines read from a TSV - file that contains a matrix of allelic profiles. + Parameters + ---------- + matrix_lines : list of list + A list with all the lines read from a TSV + file that contains a matrix of allelic profiles. - Returns - ------- - loci_ids : list - List with the identifiers of all loci represented - in the allelic profiles. - """ - loci_ids = [locus.rstrip('.fasta') for locus in matrix_lines[0][1:]] + Returns + ------- + loci_ids : list + List with the identifiers of all loci represented + in the allelic profiles. + """ + loci_ids = [locus.rstrip('.fasta') for locus in matrix_lines[0][1:]] - return loci_ids + return loci_ids def get_sample_ids(matrix_lines): - """Extract sample identifiers from a list with allelic profiles. + """Extract sample identifiers from a list with allelic profiles. - Parameters - ---------- - matrix_lines : list of list - A list with all the lines read from a TSV - file that contains a matrix of allelic profiles. + Parameters + ---------- + matrix_lines : list of list + A list with all the lines read from a TSV + file that contains a matrix of allelic profiles. - Returns - ------- - sample_ids : list - List with the sample identifiers of all allelic - profiles. - """ - sample_ids = [l[0].rstrip('.fasta') for l in matrix_lines[1:]] + Returns + ------- + sample_ids : list + List with the sample identifiers of all allelic + profiles. + """ + sample_ids = [l[0].rstrip('.fasta') for l in matrix_lines[1:]] - return sample_ids + return sample_ids def get_profiles(matrix_lines): - """Extract profiles from a list with allelic profiles. + """Extract profiles from a list with allelic profiles. - Parameters - ---------- - matrix_lines : list of list - A list with all the lines read from a TSV - file that contains a matrix of allelic profiles. + Parameters + ---------- + matrix_lines : list of list + A list with all the lines read from a TSV + file that contains a matrix of allelic profiles. - Returns - ------- - profiles : list of dict - List with one dictionary per allelic profile. - """ - profiles = [] - loci_ids = matrix_lines[0][1:] - for l in matrix_lines[1:]: - lp = l[1:] - profile = {loci_ids[i].rstrip('.fasta'): lp[i] for i in range(len(lp))} - profiles.append(profile) + Returns + ------- + profiles : list of dict + List with one dictionary per allelic profile. + """ + profiles = [] + loci_ids = matrix_lines[0][1:] + for l in matrix_lines[1:]: + lp = l[1:] + profile = {loci_ids[i].rstrip('.fasta'): lp[i] for i in range(len(lp))} + profiles.append(profile) - return profiles + return profiles def remove_inf(profile): - """Remove the 'INF-' prefix from inferred alleles. 
+ """Remove the 'INF-' prefix from inferred alleles. - Parameters - ---------- - profile : list - List with the allele identifiers in an allelic - profile. + Parameters + ---------- + profile : list + List with the allele identifiers in an allelic + profile. - Returns - ------- - clean_profile : list - List with allele identifiers stripped of the - 'INF-' prefix. - """ - clean_profile = [a.lstrip('INF-') if 'INF-' in a else a for a in profile] + Returns + ------- + clean_profile : list + List with allele identifiers stripped of the + 'INF-' prefix. + """ + clean_profile = [a.lstrip('INF-') if 'INF-' in a else a for a in profile] - return clean_profile + return clean_profile def jsonify_profile(profile, loci): - """ - """ - json_profile = '' - for k, v in profile.items(): - # add first entry to JSON only if locus value is not LNF - locus = k - locus_id = loci[locus] - allele_id = v - if len(json_profile) == 0: - if allele_id != 'LNF': - json_profile += '{{"{0}":"{1}"'.format(locus_id, allele_id) - else: - if allele_id != 'LNF': - json_profile += ', "{0}":"{1}"'.format(locus_id, allele_id) - - json_profile += '}' - - return json_profile + """ + """ + json_profile = '' + for k, v in profile.items(): + # add first entry to JSON only if locus value is not LNF + locus = k + locus_id = loci[locus] + allele_id = v + if len(json_profile) == 0: + if allele_id != 'LNF': + json_profile += '{{"{0}":"{1}"'.format(locus_id, allele_id) + else: + if allele_id != 'LNF': + json_profile += ', "{0}":"{1}"'.format(locus_id, allele_id) + + json_profile += '}' + + return json_profile def store_allelecall_results(output_directory, schema_directory): - """ - """ - # add profiles to SQLite database - # parent results folder might have several results folders - results_folders = [os.path.join(output_directory, file) - for file in os.listdir(output_directory) if 'results' in file] - # create datetime objects and sort to get latest - insert_dates = [(file, pd.datetime_obj(file.split('_')[-1], '%Y%m%dT%H%M%S')) - for file in results_folders] - sorted_insert_dates = sorted(insert_dates, key=lambda x: x[1], reverse=True) - results_matrix = os.path.join(sorted_insert_dates[0][0], 'results_alleles.tsv') - insert_date = sorted_insert_dates[0][0].split('_')[-1] - - # verify that database directory exists - database_directory = os.path.join(schema_directory, 'profiles_database') - # create if it does not exist - if os.path.isdir(database_directory) is False: - os.mkdir(database_directory) - - # also need to check for database file - database_file = os.path.join(database_directory, 'profiles.db') - if os.path.isfile(database_file) is False: - print('\nCreating SQLite database to store profiles...', end='') - try: - create_database(database_file) - # insert loci list into loci table - total_loci = insert_loci(database_file, results_matrix) - print('done.') - print('Inserted {0} loci into database.'.format(total_loci)) - except Exception as e: - print(e) - print('WARNING: Could not create database file. 
Will not store profiles.') - - # insert whole matrix - updated = False - if os.path.isfile(database_file) is not False: - print('\nSending allelic profiles to SQLite database...', end='') - try: - total_profiles = insert_allelecall_matrix(results_matrix, database_file, insert_date) - print('done.') - print('Inserted {0} profiles ({1} total, {2} total unique).'.format(total_profiles[0], total_profiles[1], total_profiles[2])) - updated = True - except Exception: - print('WARNING: Could not store profiles in local database.') - - return updated + """ + """ + # add profiles to SQLite database + # parent results folder might have several results folders + results_folders = [os.path.join(output_directory, file) + for file in os.listdir(output_directory) if 'results' in file] + # create datetime objects and sort to get latest + insert_dates = [(file, pd.datetime_obj(file.split('_')[-1], '%Y%m%dT%H%M%S')) + for file in results_folders] + sorted_insert_dates = sorted(insert_dates, key=lambda x: x[1], reverse=True) + results_matrix = os.path.join(sorted_insert_dates[0][0], 'results_alleles.tsv') + insert_date = sorted_insert_dates[0][0].split('_')[-1] + + # verify that database directory exists + database_directory = os.path.join(schema_directory, 'profiles_database') + # create if it does not exist + if os.path.isdir(database_directory) is False: + os.mkdir(database_directory) + + # also need to check for database file + database_file = os.path.join(database_directory, 'profiles.db') + if os.path.isfile(database_file) is False: + print('\nCreating SQLite database to store profiles...', end='') + try: + create_database(database_file) + # insert loci list into loci table + total_loci = insert_loci(database_file, results_matrix) + print('done.') + print('Inserted {0} loci into database.'.format(total_loci)) + except Exception as e: + print(e) + print('WARNING: Could not create database file. Will not store profiles.') + + # insert whole matrix + updated = False + if os.path.isfile(database_file) is not False: + print('\nSending allelic profiles to SQLite database...', end='') + try: + total_profiles = insert_allelecall_matrix(results_matrix, database_file, insert_date) + print('done.') + print('Inserted {0} profiles ({1} total, {2} total unique).'.format(total_profiles[0], total_profiles[1], total_profiles[2])) + updated = True + except Exception: + print('WARNING: Could not store profiles in local database.') + + return updated def insert_allelecall_matrix(matrix_file, db_file, insert_date): - """Insert profile data from a TSV file into a SQLite db. - - Parameters - ---------- - matrix_file : str - Path to the TSV file with the matrix. - db_file : str - Path to the SQLite database file. - insert_date : str - Date in the name of the folder created - by the AlleleCall process to save results - (in the format Y-m-dTH:M:S). - - Returns - ------- - A list with the following elements: - - - Number of inserted profiles. - - Total number of profiles. - - Number of unique profiles. 
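# Example of how the latest results folder is selected above (hypothetical folder names):
folders = ['results_20240101T101010', 'results_20240711T191432']
dates = [(f, pd.datetime_obj(f.split('_')[-1], '%Y%m%dT%H%M%S')) for f in folders]
sorted(dates, key=lambda x: x[1], reverse=True)[0][0]   # 'results_20240711T191432'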
- """ - loci_list_db = select_all_rows(db_file, 'loci') - loci_map = {t[1]: t[0] for t in loci_list_db} - - # read matrix - matrix_lines = read_matrix(matrix_file) - matrix_lines = [remove_inf(l) for l in matrix_lines] - - # get sample identifiers - sample_ids = get_sample_ids(matrix_lines) - - # get profiles - profiles = get_profiles(matrix_lines) - - # insert profiles - # create JSON format of each profile - profiles_hashes = [] - subschemas_hashes = [] - subschemas_loci = [] - inserted_profiles = 0 - profile_data = [] - for p in profiles: - loci = [str(loci_map[locus]) for locus in p.keys()] - loci_join = ','.join(loci) - loci_hash = hashlib.sha256(loci_join.encode('utf-8')).hexdigest() - if loci_hash not in subschemas_hashes: - subschemas_loci.append(loci_join) - subschemas_hashes.append(loci_hash) - - json_profile = jsonify_profile(p, loci_map) - - profile_hash = hashlib.sha256(json_profile.encode('utf-8')).hexdigest() - profiles_hashes.append(profile_hash) - - profile_data.append((profile_hash, insert_date, - json_profile, loci_hash)) - - # insert all profiles - conn = create_connection(db_file) - profile_statement = create_insert_statement('profiles', - ['profile_id', 'date', - 'profile_json', 'subschema_id'], - ['?', '?', 'json(?)', '?']) - previous_rcount = execute_statement(conn, "SELECT COUNT(*) " - "FROM profiles;").fetchone()[0] - profiles_res = insert_multiple(db_file, profile_statement, profile_data) - - posterior_rcount = execute_statement(conn, "SELECT COUNT(*) " - "FROM profiles;").fetchone()[0] - inserted_profiles = posterior_rcount - previous_rcount - conn.close() - - # insert subschemas - subschema_statement = create_insert_statement('subschemas', - ['subschema_id', 'loci'], - ['?', '?']) - subschema_data = [(subschemas_hashes[i], subschemas_loci[i]) - for i in range(len(subschemas_hashes))] - - # insert all subschemas - subschema_res = insert_multiple(db_file, subschema_statement, - subschema_data) - - # insert samples - sample_statement = create_insert_statement('samples', - ['name', 'date', 'profile_id'], - ['?', '?', '?']) - samples_data = [(sample_ids[i], insert_date, profiles_hashes[i]) - for i in range(len(sample_ids))] - - # insert all samples - it checks PK uniqueness condition - samples_res = insert_multiple(db_file, sample_statement, samples_data) - - return [inserted_profiles, len(profiles), len(set(profiles_hashes)), - profiles_res, subschema_res, samples_res] + """Insert profile data from a TSV file into a SQLite db. + + Parameters + ---------- + matrix_file : str + Path to the TSV file with the matrix. + db_file : str + Path to the SQLite database file. + insert_date : str + Date in the name of the folder created + by the AlleleCall process to save results + (in the format Y-m-dTH:M:S). + + Returns + ------- + A list with the following elements: + + - Number of inserted profiles. + - Total number of profiles. + - Number of unique profiles. 
+ """ + loci_list_db = select_all_rows(db_file, 'loci') + loci_map = {t[1]: t[0] for t in loci_list_db} + + # read matrix + matrix_lines = read_matrix(matrix_file) + matrix_lines = [remove_inf(l) for l in matrix_lines] + + # get sample identifiers + sample_ids = get_sample_ids(matrix_lines) + + # get profiles + profiles = get_profiles(matrix_lines) + + # insert profiles + # create JSON format of each profile + profiles_hashes = [] + subschemas_hashes = [] + subschemas_loci = [] + inserted_profiles = 0 + profile_data = [] + for p in profiles: + loci = [str(loci_map[locus]) for locus in p.keys()] + loci_join = ','.join(loci) + loci_hash = hashlib.sha256(loci_join.encode('utf-8')).hexdigest() + if loci_hash not in subschemas_hashes: + subschemas_loci.append(loci_join) + subschemas_hashes.append(loci_hash) + + json_profile = jsonify_profile(p, loci_map) + + profile_hash = hashlib.sha256(json_profile.encode('utf-8')).hexdigest() + profiles_hashes.append(profile_hash) + + profile_data.append((profile_hash, insert_date, + json_profile, loci_hash)) + + # insert all profiles + conn = create_connection(db_file) + profile_statement = create_insert_statement('profiles', + ['profile_id', 'date', + 'profile_json', 'subschema_id'], + ['?', '?', 'json(?)', '?']) + previous_rcount = execute_statement(conn, "SELECT COUNT(*) " + "FROM profiles;").fetchone()[0] + profiles_res = insert_multiple(db_file, profile_statement, profile_data) + + posterior_rcount = execute_statement(conn, "SELECT COUNT(*) " + "FROM profiles;").fetchone()[0] + inserted_profiles = posterior_rcount - previous_rcount + conn.close() + + # insert subschemas + subschema_statement = create_insert_statement('subschemas', + ['subschema_id', 'loci'], + ['?', '?']) + subschema_data = [(subschemas_hashes[i], subschemas_loci[i]) + for i in range(len(subschemas_hashes))] + + # insert all subschemas + subschema_res = insert_multiple(db_file, subschema_statement, + subschema_data) + + # insert samples + sample_statement = create_insert_statement('samples', + ['name', 'date', 'profile_id'], + ['?', '?', '?']) + samples_data = [(sample_ids[i], insert_date, profiles_hashes[i]) + for i in range(len(sample_ids))] + + # insert all samples - it checks PK uniqueness condition + samples_res = insert_multiple(db_file, sample_statement, samples_data) + + return [inserted_profiles, len(profiles), len(set(profiles_hashes)), + profiles_res, subschema_res, samples_res] def select_outdated(loci, reassigned, cursor): - """Retrive the allelic profiles with outdated allele identifiers. - - Parameters - ---------- - loci : dict - A dictionary with the numeric identifiers - of loci as keys and the integer identifier - of each locus in the local SQLite database. - reassigned : dict - A dictionary with loci identifiers as keys. - Each key has a dictionary as value, with the - current alleles identifiers as keys and the - updated identifiers as values. - cursor : sqlite3.Cursor - SQLite cursor object. - - Returns - ------- - profiles : dict of list - A dictionary with profiles hashes as keys and - lists as values. Each list contains the profile - as str type and a variable number of tuples, - one tuple per allele identifier that must be - updated (tuples contain the locus identifier, - the outdated allele identifier and the updated - allele identifier). 
- """ - profiles = {} - for locus, alleles in reassigned.items(): - locus_id = loci[locus.split('-')[-1].rstrip('.fasta').lstrip('0')] - for a1, a2 in alleles.items(): - # good idea to implement way to run many queries at once - query = ("SELECT profiles.profile_id, profiles.profile_json " - "FROM profiles " - "WHERE json_extract(profiles.profile_json, '$.{0}') = '{1}';".format(locus_id, a1)) - cursor.execute(query) - rows = [r for r in cursor.fetchall()] - for r in rows: - profile_hash = r[0] - profile = r[1] - if profile_hash in profiles: - profiles[profile_hash].append((locus_id, a1, a2)) - else: - profiles[profile_hash] = [profile, (locus_id, a1, a2)] - - return profiles + """Retrive the allelic profiles with outdated allele identifiers. + + Parameters + ---------- + loci : dict + A dictionary with the numeric identifiers + of loci as keys and the integer identifier + of each locus in the local SQLite database. + reassigned : dict + A dictionary with loci identifiers as keys. + Each key has a dictionary as value, with the + current alleles identifiers as keys and the + updated identifiers as values. + cursor : sqlite3.Cursor + SQLite cursor object. + + Returns + ------- + profiles : dict of list + A dictionary with profiles hashes as keys and + lists as values. Each list contains the profile + as str type and a variable number of tuples, + one tuple per allele identifier that must be + updated (tuples contain the locus identifier, + the outdated allele identifier and the updated + allele identifier). + """ + profiles = {} + for locus, alleles in reassigned.items(): + locus_id = loci[locus.split('-')[-1].rstrip('.fasta').lstrip('0')] + for a1, a2 in alleles.items(): + # good idea to implement way to run many queries at once + query = ("SELECT profiles.profile_id, profiles.profile_json " + "FROM profiles " + "WHERE json_extract(profiles.profile_json, '$.{0}') = '{1}';".format(locus_id, a1)) + cursor.execute(query) + rows = [r for r in cursor.fetchall()] + for r in rows: + profile_hash = r[0] + profile = r[1] + if profile_hash in profiles: + profiles[profile_hash].append((locus_id, a1, a2)) + else: + profiles[profile_hash] = [profile, (locus_id, a1, a2)] + + return profiles def alter_profiles(profiles, cursor): - """Update allele identifiers in allelic profiles. - - Parameters - ---------- - profiles : dict - A dictionary with profiles hashes as keys and - lists as values. Each list contains the profile - as str type and a variable number of tuples, - one tuple per allele identifier that must be - updated (tuples contain the locus identifier, - the outdated allele identifier and the updated - allele identifier). - cursor : sqlite3.Cursor - SQLite cursor object. - - Returns - ------- - results : dict of str - A dictionary with profiles hashes as keys and - updated profiles as values. - """ - results = {} - for k, v in profiles.items(): - profile = v[0] - to_replace = ["'$.{0}', '{1}'".format(r[0], r[2]) for r in v[1:]] - # json_replace cannot receive a great number of arguments - # change 50 identifiers at a time - for i in range(0, len(to_replace), 50): - to_replace_txt = ','.join(to_replace[i:i+50]) - query = ("SELECT json_replace('{0}', {1}) " - "FROM profiles;".format(profile, to_replace_txt)) - cursor.execute(query) - res = cursor.fetchall() - profile = res[0][0] - results[k] = profile - - return results + """Update allele identifiers in allelic profiles. + + Parameters + ---------- + profiles : dict + A dictionary with profiles hashes as keys and + lists as values. 
Each list contains the profile + as str type and a variable number of tuples, + one tuple per allele identifier that must be + updated (tuples contain the locus identifier, + the outdated allele identifier and the updated + allele identifier). + cursor : sqlite3.Cursor + SQLite cursor object. + + Returns + ------- + results : dict of str + A dictionary with profiles hashes as keys and + updated profiles as values. + """ + results = {} + for k, v in profiles.items(): + profile = v[0] + to_replace = ["'$.{0}', '{1}'".format(r[0], r[2]) for r in v[1:]] + # json_replace cannot receive a great number of arguments + # change 50 identifiers at a time + for i in range(0, len(to_replace), 50): + to_replace_txt = ','.join(to_replace[i:i+50]) + query = ("SELECT json_replace('{0}', {1}) " + "FROM profiles;".format(profile, to_replace_txt)) + cursor.execute(query) + res = cursor.fetchall() + profile = res[0][0] + results[k] = profile + + return results def update_profiles(schema_directory, reassigned): - """Update allele identifiers in allelic profiles. - - Parameters - ---------- - schema_directory : str - Path to the directory with the schema files. - reassigned : dict of dict - A dictionary with loci identifiers as keys. - Each key has a dictionary as value, with the - current alleles identifiers as keys and the - updated identifiers as values. - - Returns - ------- - The number of profiles that were altered. - """ - - db_file = os.path.join(schema_directory, 'profiles_database', - 'profiles.db') - - if os.path.isfile(db_file) is False: - return None - - # create connection to db - conn = create_connection(db_file) - cursor = conn.cursor() - # get list of loci - loci_list = select_all_rows(db_file, 'loci') - ################ - ################ - ################ - ################ - # this does not seem right! loci identifiers are being truncated? - # maybe because it was designed to work with identifiers from Chewie-NS? -_- - # profiles should only be updated for schemas downloaded from the NS anyway... - loci = {l[1].split('-')[-1].lstrip('0'): l[0] for l in loci_list} - # get profiles with identifiers that have to be changed - profiles = select_outdated(loci, reassigned, cursor) - - # change identifiers in profiles - updated_profiles = alter_profiles(profiles, cursor) - - conn.close() - - # change values in profiles table - query = "update profiles set profile_json = ? where profile_id = ?;" - params = [[p, h] for h, p in updated_profiles.items()] - insert_multiple(db_file, query, params) - - return len(profiles) + """Update allele identifiers in allelic profiles. + + Parameters + ---------- + schema_directory : str + Path to the directory with the schema files. + reassigned : dict of dict + A dictionary with loci identifiers as keys. + Each key has a dictionary as value, with the + current alleles identifiers as keys and the + updated identifiers as values. + + Returns + ------- + The number of profiles that were altered. + """ + + db_file = os.path.join(schema_directory, 'profiles_database', + 'profiles.db') + + if os.path.isfile(db_file) is False: + return None + + # create connection to db + conn = create_connection(db_file) + cursor = conn.cursor() + # get list of loci + loci_list = select_all_rows(db_file, 'loci') + ################ + ################ + ################ + ################ + # this does not seem right! loci identifiers are being truncated? + # maybe because it was designed to work with identifiers from Chewie-NS? 
-_- + # profiles should only be updated for schemas downloaded from the NS anyway... + loci = {l[1].split('-')[-1].lstrip('0'): l[0] for l in loci_list} + # get profiles with identifiers that have to be changed + profiles = select_outdated(loci, reassigned, cursor) + + # change identifiers in profiles + updated_profiles = alter_profiles(profiles, cursor) + + conn.close() + + # change values in profiles table + query = "update profiles set profile_json = ? where profile_id = ?;" + params = [[p, h] for h, p in updated_profiles.items()] + insert_multiple(db_file, query, params) + + return len(profiles) #schema_directory = '/home/rfm/Desktop/rfm/Lab_Software/senterica_schema/senterica_INNUENDO_cgMLST' diff --git a/CHEWBBACA/utils/sequence_clustering.py b/CHEWBBACA/utils/sequence_clustering.py index 6934046e..5cf90823 100644 --- a/CHEWBBACA/utils/sequence_clustering.py +++ b/CHEWBBACA/utils/sequence_clustering.py @@ -16,510 +16,510 @@ from collections import Counter try: - from utils import (file_operations as fo, - iterables_manipulation as im, - blast_wrapper as bw, - fasta_operations as fao) + from utils import (file_operations as fo, + iterables_manipulation as im, + blast_wrapper as bw, + fasta_operations as fao) except ModuleNotFoundError: - from CHEWBBACA.utils import (file_operations as fo, - iterables_manipulation as im, - blast_wrapper as bw, - fasta_operations as fao) + from CHEWBBACA.utils import (file_operations as fo, + iterables_manipulation as im, + blast_wrapper as bw, + fasta_operations as fao) def select_representatives(kmers, reps_groups, clustering_sim): - """Determine the clusters a sequence can be added to. - - Determines the set of clusters that a sequence can be - added to based on the decimal proportion of shared - distinct k-mers. - - Parameters - ---------- - kmers : list or set - Set of k-mers determined by decomposing a single - sequence. - reps_groups : dict - Dictionary with k-mers as keys and sequence - identifiers of sequences that contain each - k-mer as values. - clustering_sim : float - Sequences are added to clusters if they - share a minimum decimal proportion of - distinct k-mers with a cluster representative. - - Returns - ------- - selected_reps : list - List with a tuple per cluster/representative - that the sequence can be added to. Each tuple - has the identifier of the cluster representative - and the decimal proportion of shared distinct - kmers. - """ - current_reps = [reps_groups[k] for k in kmers if k in reps_groups] - current_reps = im.flatten_list(current_reps) - - # count number of kmer hits per representative - counts = Counter(current_reps) - selected_reps = [(k, v/len(kmers)) - for k, v in counts.items() - if v/len(kmers) >= clustering_sim] - - # sort by identifier and then by similarity to always get same order - selected_reps = sorted(selected_reps, key=lambda x: x[0]) - selected_reps = sorted(selected_reps, key=lambda x: x[1], reverse=True) - - return selected_reps + """Determine the clusters a sequence can be added to. + + Determines the set of clusters that a sequence can be + added to based on the decimal proportion of shared + distinct k-mers. + + Parameters + ---------- + kmers : list or set + Set of k-mers determined by decomposing a single + sequence. + reps_groups : dict + Dictionary with k-mers as keys and sequence + identifiers of sequences that contain each + k-mer as values. 
+ clustering_sim : float + Sequences are added to clusters if they + share a minimum decimal proportion of + distinct k-mers with a cluster representative. + + Returns + ------- + selected_reps : list + List with a tuple per cluster/representative + that the sequence can be added to. Each tuple + has the identifier of the cluster representative + and the decimal proportion of shared distinct + kmers. + """ + current_reps = [reps_groups[k] for k in kmers if k in reps_groups] + current_reps = im.flatten_list(current_reps) + + # count number of kmer hits per representative + counts = Counter(current_reps) + selected_reps = [(k, v/len(kmers)) + for k, v in counts.items() + if v/len(kmers) >= clustering_sim] + + # sort by identifier and then by similarity to always get same order + selected_reps = sorted(selected_reps, key=lambda x: x[0]) + selected_reps = sorted(selected_reps, key=lambda x: x[1], reverse=True) + + return selected_reps def intra_cluster_sim(clusters, sequences, word_size, intra_filter): - """Identify clustered sequences that are similar. - - Determines the percentage of shared k-mers/minimizers - between sequences in the same cluster and excludes - sequences that are similar to other sequences in the - cluster. - - Parameters - ---------- - clusters : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - a list with tuples as values. Each tuple has - the identifier of a sequence that was added to - the cluster, the percentage of shared - kmers/minimizers and the length of the clustered - sequence. - sequences : dict - Dictionary with sequence identifiers as keys - and sequences as values. - word_size : int - Value k for the kmer size. - intra_filter : float - Similarity threshold value. If two sequences in - the same cluster have a similarity value equal - or greater to this value, the shorter sequence - will be excluded from the cluster. - - Returns - ------- - excluded_seqids : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - a list with the sequence identifiers of sequences - that were excluded from the cluster as values. - excluded_sims : dict - A dictionary with the identifiers of sequences - that are cluster representatives as keys - and a list with tuples as values. Each tuple - contains the sequence identifier and the - similarity value for a match that led to - an exclusion. 
- """ - excluded_seqids = {} - excluded_sims = {} - for representative, clustered in clusters.items(): - # get identifiers of sequences in the cluster - clustered_ids = [c[0] for c in clustered] - clustered_seqs = {seqid: sequences[seqid] for seqid in clustered_ids} - - # create kmer index - kmers_mapping = im.kmer_index(clustered_seqs, word_size, fasta=False) - - excluded = [] - similarities = [] - # for each sequence in the cluster - for seqid, sequence in clustered_seqs.items(): - if seqid not in excluded: - query_kmers = im.determine_minimizers(sequence, word_size, - word_size, position=False) - - # select sequences with same kmers - sims = select_representatives(query_kmers, - kmers_mapping, - intra_filter) - - if len(sims) > 1: - # exclude current sequence - candidates = [s for s in sims if s[0] != seqid] - for c in candidates: - # if query sequence is longer, keep it and exclude candidate - if len(clustered_seqs[seqid]) >= len(clustered_seqs[c[0]]): - picked = (c[0], seqid, c[1]) - # otherwise, exclude query sequence - elif len(clustered_seqs[c[0]]) > len(clustered_seqs[seqid]): - picked = (seqid, c[0], c[1]) - similarities.append(picked) - excluded.append(picked[0]) - - # convert into set first to remove possible duplicates - excluded_seqids[representative] = list(set(excluded)) - excluded_sims[representative] = similarities - - return [excluded_seqids, excluded_sims] + """Identify clustered sequences that are similar. + + Determines the percentage of shared k-mers/minimizers + between sequences in the same cluster and excludes + sequences that are similar to other sequences in the + cluster. + + Parameters + ---------- + clusters : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + a list with tuples as values. Each tuple has + the identifier of a sequence that was added to + the cluster, the percentage of shared + kmers/minimizers and the length of the clustered + sequence. + sequences : dict + Dictionary with sequence identifiers as keys + and sequences as values. + word_size : int + Value k for the kmer size. + intra_filter : float + Similarity threshold value. If two sequences in + the same cluster have a similarity value equal + or greater to this value, the shorter sequence + will be excluded from the cluster. + + Returns + ------- + excluded_seqids : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + a list with the sequence identifiers of sequences + that were excluded from the cluster as values. + excluded_sims : dict + A dictionary with the identifiers of sequences + that are cluster representatives as keys + and a list with tuples as values. Each tuple + contains the sequence identifier and the + similarity value for a match that led to + an exclusion. 
+ """ + excluded_seqids = {} + excluded_sims = {} + for representative, clustered in clusters.items(): + # get identifiers of sequences in the cluster + clustered_ids = [c[0] for c in clustered] + clustered_seqs = {seqid: sequences[seqid] for seqid in clustered_ids} + + # create kmer index + kmers_mapping = im.kmer_index(clustered_seqs, word_size, fasta=False) + + excluded = [] + similarities = [] + # for each sequence in the cluster + for seqid, sequence in clustered_seqs.items(): + if seqid not in excluded: + query_kmers = im.determine_minimizers(sequence, word_size, + word_size, position=False) + + # select sequences with same kmers + sims = select_representatives(query_kmers, + kmers_mapping, + intra_filter) + + if len(sims) > 1: + # exclude current sequence + candidates = [s for s in sims if s[0] != seqid] + for c in candidates: + # if query sequence is longer, keep it and exclude candidate + if len(clustered_seqs[seqid]) >= len(clustered_seqs[c[0]]): + picked = (c[0], seqid, c[1]) + # otherwise, exclude query sequence + elif len(clustered_seqs[c[0]]) > len(clustered_seqs[seqid]): + picked = (seqid, c[0], c[1]) + similarities.append(picked) + excluded.append(picked[0]) + + # convert into set first to remove possible duplicates + excluded_seqids[representative] = list(set(excluded)) + excluded_sims[representative] = similarities + + return [excluded_seqids, excluded_sims] # change name and add parameter to accept k-mer sampling used to cluster? def minimizer_clustering(sorted_sequences, word_size, window_size, position, - offset, clusters, reps_sequences, reps_groups, - seq_num_cluster, clustering_sim, grow): - """Cluster sequences based on shared distinct minimizers. - - Parameters - ---------- - sorted_sequences : dict - Dictionary with sequence identifiers as keys and - sequences as values. Sorted by decreasing sequence - length. - word_size : int - Value k for the kmer size. - window_size : int - Window size used to determine minimizers. - position : bool - If minimizer sequence position should be stored. - offset : int - Value to indicate offset of consecutive kmers. - clusters : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - a list with tuples as values. Each tuple has - the identifier of a sequence that was added to - the cluster, the decimal proportion of shared - distinct minimizers and the length of the clustered - sequence. This dictionary should be empty at the - start of the clustering step during the CreateSchema - process. For the AlleleCall process, the dictionary - should contain the identifiers of the loci - representatives. - reps_sequences : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - their sequences as values. This dictionary should - be empty at the start of the clustering step - during the CreateSchema process. For the AlleleCall - process, the dictionary should contain the - identifiers of the loci representatives as keys - and their sequences as values. - reps_groups : dict - Dictionary with kmers as keys and a list with - identifiers of sequences that contain that kmer - as values. This dictionary should be empty at the - start of the clustering step during the CreateSchema - process. For the AlleleCall process, the dictionary - should contain all kmers contained in the set of - the schema's representatives sequences. - seq_num_cluster : int - Maximum number of clusters that a sequence can be - added to. 
- clustering_sim : float - Similarity threshold to cluster a sequence into - a cluster. - - Returns - ------- - A list with the following elements: - clusters : dict - Dictionary with the identifiers of sequences - that are clusters representatives as keys and - a list with tuples as values. Each tuple has - the identifier of a sequence that was added to - the cluster, the decimal proportion of shared - distinct minimizers and the length of the clustered - sequence. - reps_sequences : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - their sequences as values. - reps_groups : dict - Dictionary with kmers as keys and a list with - identifiers of sequences that contain that kmer - as values. - """ - # several = {} - for protid, protein in sorted_sequences.items(): - - minimizers = im.determine_minimizers(protein, window_size, - word_size, offset=offset, - position=position) - - distinct_minimizers = set(minimizers) - - selected_reps = select_representatives(distinct_minimizers, - reps_groups, - clustering_sim) - - top = (len(selected_reps) - if len(selected_reps) < seq_num_cluster - else seq_num_cluster) - - # Sort to get most similar at index 0 - if len(selected_reps) > 0: - for i in range(0, top): - clusters[selected_reps[i][0]].append((protid, - selected_reps[i][1], - len(protein), - len(minimizers), - len(distinct_minimizers))) + offset, clusters, reps_sequences, reps_groups, + seq_num_cluster, clustering_sim, grow): + """Cluster sequences based on shared distinct minimizers. + + Parameters + ---------- + sorted_sequences : dict + Dictionary with sequence identifiers as keys and + sequences as values. Sorted by decreasing sequence + length. + word_size : int + Value k for the kmer size. + window_size : int + Window size used to determine minimizers. + position : bool + If minimizer sequence position should be stored. + offset : int + Value to indicate offset of consecutive kmers. + clusters : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + a list with tuples as values. Each tuple has + the identifier of a sequence that was added to + the cluster, the decimal proportion of shared + distinct minimizers and the length of the clustered + sequence. This dictionary should be empty at the + start of the clustering step during the CreateSchema + process. For the AlleleCall process, the dictionary + should contain the identifiers of the loci + representatives. + reps_sequences : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + their sequences as values. This dictionary should + be empty at the start of the clustering step + during the CreateSchema process. For the AlleleCall + process, the dictionary should contain the + identifiers of the loci representatives as keys + and their sequences as values. + reps_groups : dict + Dictionary with kmers as keys and a list with + identifiers of sequences that contain that kmer + as values. This dictionary should be empty at the + start of the clustering step during the CreateSchema + process. For the AlleleCall process, the dictionary + should contain all kmers contained in the set of + the schema's representatives sequences. + seq_num_cluster : int + Maximum number of clusters that a sequence can be + added to. + clustering_sim : float + Similarity threshold to cluster a sequence into + a cluster. 
+ + Returns + ------- + A list with the following elements: + clusters : dict + Dictionary with the identifiers of sequences + that are clusters representatives as keys and + a list with tuples as values. Each tuple has + the identifier of a sequence that was added to + the cluster, the decimal proportion of shared + distinct minimizers and the length of the clustered + sequence. + reps_sequences : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + their sequences as values. + reps_groups : dict + Dictionary with kmers as keys and a list with + identifiers of sequences that contain that kmer + as values. + """ + # several = {} + for protid, protein in sorted_sequences.items(): + + minimizers = im.determine_minimizers(protein, window_size, + word_size, offset=offset, + position=position) + + distinct_minimizers = set(minimizers) + + selected_reps = select_representatives(distinct_minimizers, + reps_groups, + clustering_sim) + + top = (len(selected_reps) + if len(selected_reps) < seq_num_cluster + else seq_num_cluster) + + # Sort to get most similar at index 0 + if len(selected_reps) > 0: + for i in range(0, top): + clusters[selected_reps[i][0]].append((protid, + selected_reps[i][1], + len(protein), + len(minimizers), + len(distinct_minimizers))) #### - # if len(selected_reps) > 1: - # for i in range(0, top): - # several.setdefault(protid, []).append(selected_reps[i][0]) + # if len(selected_reps) > 1: + # for i in range(0, top): + # several.setdefault(protid, []).append(selected_reps[i][0]) #### - else: - if grow is True: - for k in distinct_minimizers: - reps_groups.setdefault(k, []).append(protid) + else: + if grow is True: + for k in distinct_minimizers: + reps_groups.setdefault(k, []).append(protid) - clusters[protid] = [(protid, 1.0, len(protein), - len(minimizers), len(distinct_minimizers))] - reps_sequences[protid] = protein + clusters[protid] = [(protid, 1.0, len(protein), + len(minimizers), len(distinct_minimizers))] + reps_sequences[protid] = protein - # print(several) + # print(several) - return [clusters, reps_sequences, reps_groups] + return [clusters, reps_sequences, reps_groups] def clusterer(sorted_sequences, word_size, window_size, - clustering_sim, representatives, grow, - offset, position, seq_num_cluster): - """Cluster sequences based on shared distinct minimizers. - - Parameters - ---------- - sorted_sequences : dict - Dictionary with sequence identifiers as keys and - sequences as values. Sorted by decreasing sequence - length. - word_size : int - Value k for the kmer size. - window_size : int - Window size used to determine minimizers. - clustering_sim : float - Similarity threshold to cluster a sequence into - a cluster. - representatives : dict - Dictionary with kmers as keys and a list with - identifiers of sequences that contain that kmer - as values. - grow : bool - If it is allowed to create new clusters. - offset : int - Value to indicate offset of consecutive kmers. - position : bool - If minimizer sequence position should be stored. - seq_num_cluster : int - Maximum number of clusters that a sequence can be - added to. - - Returns - ------- - A list with the following elements: - clusters : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - a list with tuples as values. Each tuple has - the identifier of a sequence that was added to - the cluster, the decimal proportion of shared - distinct minimizers and the length of the clustered - sequence. 
- reps_sequences : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - their sequences as values. - """ - clusters = {} - reps_sequences = {} - if representatives is None: - reps_groups = {} - else: - reps_groups = representatives - clusters_ids = set(im.flatten_list(list(reps_groups.values()))) - clusters = {rep: [] for rep in clusters_ids} - - cluster_results = minimizer_clustering(sorted_sequences, word_size, - window_size, position, - offset, clusters, - reps_sequences, reps_groups, - seq_num_cluster, clustering_sim, - grow) - - return cluster_results[0:2] + clustering_sim, representatives, grow, + offset, position, seq_num_cluster): + """Cluster sequences based on shared distinct minimizers. + + Parameters + ---------- + sorted_sequences : dict + Dictionary with sequence identifiers as keys and + sequences as values. Sorted by decreasing sequence + length. + word_size : int + Value k for the kmer size. + window_size : int + Window size used to determine minimizers. + clustering_sim : float + Similarity threshold to cluster a sequence into + a cluster. + representatives : dict + Dictionary with kmers as keys and a list with + identifiers of sequences that contain that kmer + as values. + grow : bool + If it is allowed to create new clusters. + offset : int + Value to indicate offset of consecutive kmers. + position : bool + If minimizer sequence position should be stored. + seq_num_cluster : int + Maximum number of clusters that a sequence can be + added to. + + Returns + ------- + A list with the following elements: + clusters : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + a list with tuples as values. Each tuple has + the identifier of a sequence that was added to + the cluster, the decimal proportion of shared + distinct minimizers and the length of the clustered + sequence. + reps_sequences : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + their sequences as values. + """ + clusters = {} + reps_sequences = {} + if representatives is None: + reps_groups = {} + else: + reps_groups = representatives + clusters_ids = set(im.flatten_list(list(reps_groups.values()))) + clusters = {rep: [] for rep in clusters_ids} + + cluster_results = minimizer_clustering(sorted_sequences, word_size, + window_size, position, + offset, clusters, + reps_sequences, reps_groups, + seq_num_cluster, clustering_sim, + grow) + + return cluster_results[0:2] def write_clusters(clusters, outfile): - """Write information about clusters to file. - - Parameters - ---------- - clusters : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - a list with tuples as values. Each tuple has - the identifier of a sequence that was added to - the cluster, the decimal proportion of shared - distinct kmers/minimizers and the length of the - clustered sequence. - outfile : str - Path to the file that will be created to save - information about clusters. 
- """ - cluster_lines = [] - for rep, seqids in clusters.items(): - current_cluster = [] - current_cluster.append('>{0}'.format(rep)) - clustered = [', '.join(['{}']*len(s)).format(*s) - for s in seqids] - current_cluster.extend(clustered) - cluster_lines.append(current_cluster) - - # sort by number of lines to get clusters with more sequences first - cluster_lines = im.sort_iterable(cluster_lines, - sort_key=lambda x: len(x), reverse=True) - cluster_lines = im.flatten_list(cluster_lines) - cluster_text = im.join_list(cluster_lines, '\n') - - fo.write_to_file(cluster_text, outfile, 'w', '\n') + """Write information about clusters to file. + + Parameters + ---------- + clusters : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + a list with tuples as values. Each tuple has + the identifier of a sequence that was added to + the cluster, the decimal proportion of shared + distinct kmers/minimizers and the length of the + clustered sequence. + outfile : str + Path to the file that will be created to save + information about clusters. + """ + cluster_lines = [] + for rep, seqids in clusters.items(): + current_cluster = [] + current_cluster.append('>{0}'.format(rep)) + clustered = [', '.join(['{}']*len(s)).format(*s) + for s in seqids] + current_cluster.extend(clustered) + cluster_lines.append(current_cluster) + + # sort by number of lines to get clusters with more sequences first + cluster_lines = im.sort_iterable(cluster_lines, + sort_key=lambda x: len(x), reverse=True) + cluster_lines = im.flatten_list(cluster_lines) + cluster_text = im.join_list(cluster_lines, '\n') + + fo.write_to_file(cluster_text, outfile, 'w', '\n') def representative_pruner(clusters, sim_cutoff): - """Remove sequences from clusters based on a similarity threshold. - - Parameters - ---------- - clusters : dict - Dictionary with the identifiers of sequences - that are cluster representatives as keys and - a list with tuples as values. Each tuple has - the identifier of a sequence that was added to - the cluster, the percentage of shared - kmers/minimizers and the length of the clustered - sequence. - sim_cutoff : float - Similarity threshold value. Sequences with - equal or greater similarity value are excluded - from clusters. - - Returns - ------- - A list with the following elements: - pruned_clusters : dict - Input dictionary without values for the - sequences that had similarity values - equal or greater that defined threshold. - excluded : list - List with a list per cluster. Each list - has the sequences that were excluded from - a cluster. - """ - excluded = [] - pruned_clusters = {} - for rep, seqids in clusters.items(): - pruned_clusters[rep] = [seqid - for seqid in seqids - if seqid[1] < sim_cutoff] - # do not exclude cluster representatives - excluded.extend([seqid - for seqid in seqids - if seqid[1] >= sim_cutoff and seqid[0] != rep]) - - return [pruned_clusters, excluded] + """Remove sequences from clusters based on a similarity threshold. + + Parameters + ---------- + clusters : dict + Dictionary with the identifiers of sequences + that are cluster representatives as keys and + a list with tuples as values. Each tuple has + the identifier of a sequence that was added to + the cluster, the percentage of shared + kmers/minimizers and the length of the clustered + sequence. + sim_cutoff : float + Similarity threshold value. Sequences with + equal or greater similarity value are excluded + from clusters. 
+
+ Returns
+ -------
+ A list with the following elements:
+ pruned_clusters : dict
+ Input dictionary without values for the
+ sequences that had similarity values
+ equal to or greater than the defined threshold.
+ excluded : list
+ List with a list per cluster. Each list
+ has the sequences that were excluded from
+ a cluster.
+ """
+ excluded = []
+ pruned_clusters = {}
+ for rep, seqids in clusters.items():
+ pruned_clusters[rep] = [seqid
+ for seqid in seqids
+ if seqid[1] < sim_cutoff]
+ # do not exclude cluster representatives
+ excluded.extend([seqid
+ for seqid in seqids
+ if seqid[1] >= sim_cutoff and seqid[0] != rep])
+
+ return [pruned_clusters, excluded]


def cluster_blaster(seqids, sequences, output_directory,
- blast_path, blastdb_path, blastdb_aliastool_path, only_rep=False):
- """Align sequences in the same cluster with BLAST.
-
- Parameters
- ----------
- seqids : list
- List with cluster identifiers.
- sequences : str
- Path to the FASTA file with the protein sequences
- in all clusters.
- output_directory : str
- Path to the directory where FASTA files with
- the sequences in each cluster and files with
- BLAST results will be written to.
- blast_path : str
- Path to BLAST executables
- blastdb_path : str
- Path to a BLAST database.
-
- Returns
- -------
- out_files : list
- List with the paths to the files with the BLAST
- results for each cluster.
- """
- indexed_fasta = fao.index_fasta(sequences)
-
- out_files = []
- for cluster in seqids:
- cluster_id = cluster
- ids_file = os.path.join(output_directory,
- '{0}_ids.txt'.format(cluster_id))
-
- fasta_file = os.path.join(output_directory,
- '{0}_protein.fasta'.format(cluster_id))
-
- if only_rep is False:
- with open(ids_file, 'r') as clstr:
- cluster_ids = [l.strip() for l in clstr.readlines()]
- # create file with protein sequences
- fao.get_sequences_by_id(indexed_fasta, cluster_ids, fasta_file)
- else:
- fao.get_sequences_by_id(indexed_fasta, [cluster_id], fasta_file)
-
- blast_output = os.path.join(output_directory,
- '{0}_blastout.tsv'.format(cluster_id))
+ blast_path, blastdb_path, blastdb_aliastool_path, only_rep=False):
+ """Align sequences in the same cluster with BLAST.
+
+ Parameters
+ ----------
+ seqids : list
+ List with cluster identifiers.
+ sequences : str
+ Path to the FASTA file with the protein sequences
+ in all clusters.
+ output_directory : str
+ Path to the directory where FASTA files with
+ the sequences in each cluster and files with
+ BLAST results will be written to.
+ blast_path : str
+ Path to BLAST executables
+ blastdb_path : str
+ Path to a BLAST database.
+
+ Returns
+ -------
+ out_files : list
+ List with the paths to the files with the BLAST
+ results for each cluster.
+ """ + indexed_fasta = fao.index_fasta(sequences) + + out_files = [] + for cluster in seqids: + cluster_id = cluster + ids_file = os.path.join(output_directory, + '{0}_ids.txt'.format(cluster_id)) + + fasta_file = os.path.join(output_directory, + '{0}_protein.fasta'.format(cluster_id)) + + if only_rep is False: + with open(ids_file, 'r') as clstr: + cluster_ids = [l.strip() for l in clstr.readlines()] + # create file with protein sequences + fao.get_sequences_by_id(indexed_fasta, cluster_ids, fasta_file) + else: + fao.get_sequences_by_id(indexed_fasta, [cluster_id], fasta_file) + + blast_output = os.path.join(output_directory, + '{0}_blastout.tsv'.format(cluster_id)) binary_file = f'{ids_file}.bin' blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path, ids_file, binary_file) ids_file = binary_file - # Use subprocess to capture errors and warnings - blast_std = bw.run_blast(blast_path, blastdb_path, fasta_file, - blast_output, 1, 1, - ids_file) + # Use subprocess to capture errors and warnings + blast_std = bw.run_blast(blast_path, blastdb_path, fasta_file, + blast_output, 1, 1, + ids_file) - out_files.append(blast_output) + out_files.append(blast_output) - return out_files + return out_files def blast_seqids(clusters, output_directory, only_rep): - """Create files with the identifiers of the sequences in each cluster. - - Parameters - ---------- - clusters : dict - Dictionary with the identifiers of cluster - representatives as keys and a list with tuples - as values (each tuple has the identifier of a - sequence that is in the cluster, the decimal - proportion of shared minimizers and the length - of that sequence). - output_directory : str - Path to the directory where files with identifiers - will be created. - - Returns - ------- - ids_to_blast : list - List with the identifiers of all clusters. - """ - ids_to_blast = [] - for rep in clusters: - cluster_file = os.path.join(output_directory, f'{rep}_ids.txt') - if only_rep is False: - cluster_ids = [rep] + [seqid[0] for seqid in clusters[rep]] - else: - cluster_ids = [seqid[0] for seqid in clusters[rep]] - - cluster_lines = im.join_list(cluster_ids, '\n') - fo.write_to_file(cluster_lines, cluster_file, 'w', '') - ids_to_blast.append((rep, len(cluster_ids))) - - return ids_to_blast + """Create files with the identifiers of the sequences in each cluster. + + Parameters + ---------- + clusters : dict + Dictionary with the identifiers of cluster + representatives as keys and a list with tuples + as values (each tuple has the identifier of a + sequence that is in the cluster, the decimal + proportion of shared minimizers and the length + of that sequence). + output_directory : str + Path to the directory where files with identifiers + will be created. + + Returns + ------- + ids_to_blast : list + List with the identifiers of all clusters. 
+ """ + ids_to_blast = [] + for rep in clusters: + cluster_file = os.path.join(output_directory, f'{rep}_ids.txt') + if only_rep is False: + cluster_ids = [rep] + [seqid[0] for seqid in clusters[rep]] + else: + cluster_ids = [seqid[0] for seqid in clusters[rep]] + + cluster_lines = im.join_list(cluster_ids, '\n') + fo.write_to_file(cluster_lines, cluster_file, 'w', '') + ids_to_blast.append((rep, len(cluster_ids))) + + return ids_to_blast diff --git a/CHEWBBACA/utils/sequence_manipulation.py b/CHEWBBACA/utils/sequence_manipulation.py index 2f38b6e3..240061cb 100644 --- a/CHEWBBACA/utils/sequence_manipulation.py +++ b/CHEWBBACA/utils/sequence_manipulation.py @@ -16,632 +16,632 @@ from collections import Counter try: - from utils import (constants as ct, - file_operations as fo, - iterables_manipulation as im, - fasta_operations as fao) + from utils import (constants as ct, + file_operations as fo, + iterables_manipulation as im, + fasta_operations as fao) except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - file_operations as fo, - iterables_manipulation as im, - fasta_operations as fao) + from CHEWBBACA.utils import (constants as ct, + file_operations as fo, + iterables_manipulation as im, + fasta_operations as fao) def translate_sequence(dna_str, table_id): - """Translate a DNA sequence using the BioPython package. + """Translate a DNA sequence using the BioPython package. - Parameters - ---------- - dna_str : str - String representing a DNA sequence. - table_id : int - Translation table identifier. + Parameters + ---------- + dna_str : str + String representing a DNA sequence. + table_id : int + Translation table identifier. - Returns - ------- - protseq : Bio.Seq.Seq - Protein sequence created by translating the - input DNA sequence. - """ - myseq_obj = Seq(dna_str) - protseq = Seq.translate(myseq_obj, table=table_id, cds=True) + Returns + ------- + protseq : Bio.Seq.Seq + Protein sequence created by translating the + input DNA sequence. + """ + myseq_obj = Seq(dna_str) + protseq = Seq.translate(myseq_obj, table=table_id, cds=True) - return protseq + return protseq def translate_dna_aux(dna_sequence, method, table_id): - """Attempt to translate a DNA sequence in specified orientation. - - Attempts to translate an input DNA sequence in specified - orientation and stores exceptions when the input sequence - cannot be translated. - - Parameters - ---------- - dna_sequence : str - String representing a DNA sequence. - method : str - Sequence orientation to attempt translation. - table_id : int - Translation table identifier. - - Returns - ------- - If the sequence can be translated: - protseq : Bio.Seq.Seq - Translated DNA sequence. - myseq : str - String representing the DNA sequence in the - orientation used to translate it. - Otherwise, returns string with the description of the - exception that was raised. 
- """ - myseq = dna_sequence - # try to translate original sequence - if method == 'original': - try: - protseq = translate_sequence(myseq, table_id) - except Exception as argh: - return argh - # try to translate the reverse complement - elif method == 'revcomp': - try: - myseq = im.reverse_complement(myseq, ct.DNA_BASES) - protseq = translate_sequence(myseq, table_id) - except Exception as argh: - return argh - # try to translate the reverse - elif method == 'rev': - try: - myseq = im.reverse_str(myseq) - protseq = translate_sequence(myseq, table_id) - except Exception as argh: - return argh - # try to translate the reverse reverse complement - elif method == 'revrevcomp': - try: - myseq = im.reverse_str(myseq) - myseq = im.reverse_complement(myseq, ct.DNA_BASES) - protseq = translate_sequence(myseq, table_id) - except Exception as argh: - return argh - - return [protseq, myseq] + """Attempt to translate a DNA sequence in specified orientation. + + Attempts to translate an input DNA sequence in specified + orientation and stores exceptions when the input sequence + cannot be translated. + + Parameters + ---------- + dna_sequence : str + String representing a DNA sequence. + method : str + Sequence orientation to attempt translation. + table_id : int + Translation table identifier. + + Returns + ------- + If the sequence can be translated: + protseq : Bio.Seq.Seq + Translated DNA sequence. + myseq : str + String representing the DNA sequence in the + orientation used to translate it. + Otherwise, returns string with the description of the + exception that was raised. + """ + myseq = dna_sequence + # try to translate original sequence + if method == 'original': + try: + protseq = translate_sequence(myseq, table_id) + except Exception as argh: + return argh + # try to translate the reverse complement + elif method == 'revcomp': + try: + myseq = im.reverse_complement(myseq, ct.DNA_BASES) + protseq = translate_sequence(myseq, table_id) + except Exception as argh: + return argh + # try to translate the reverse + elif method == 'rev': + try: + myseq = im.reverse_str(myseq) + protseq = translate_sequence(myseq, table_id) + except Exception as argh: + return argh + # try to translate the reverse reverse complement + elif method == 'revrevcomp': + try: + myseq = im.reverse_str(myseq) + myseq = im.reverse_complement(myseq, ct.DNA_BASES) + protseq = translate_sequence(myseq, table_id) + except Exception as argh: + return argh + + return [protseq, myseq] def translate_dna(dna_sequence, table_id, min_len): - """Determine if a DNA sequence is valid and translate it. - - Checks if sequence is valid and attempts to translate - it, calling several functions to ensure that the sequence - only has 'ACTG', is multiple of 3 and that it can be - translated in any of 4 different orientations. Stores - exceptions so that it is possible to understand why the - sequence could not be translated. - - Parameters - ---------- - dna_sequence : str - String representing a DNA sequence. - table_id : int - Translation table identifier. - min_len : int - Minimum sequence length. Sequences shorter - than this value are not translated. - - Returns - ------- - If the sequence can be translated: - sequence : list - List with two elemets, the protein sequence - and the DNA sequence in the correct orientation. - coding_strand : str - The sequence orientation that codes for the - protein. - Otherwise: - exception_str : str - A string containing the exceptions that - explain why the the sequence could not be - translated. 
- """ - original_seq = dna_sequence.upper() - exception_collector = [] - strands = ['sense', 'antisense', 'revsense', 'revantisense'] - translating_methods = ['original', 'revcomp', 'rev', 'revrevcomp'] - - # check if the sequence has ambiguous bases - valid_dna = im.check_str_alphabet(original_seq, ct.DNA_BASES) - if valid_dna is not True: - return 'ambiguous or invalid characters' - - # check if sequence size is multiple of three - valid_length = im.check_str_multiple(original_seq, 3) - if valid_length is not True: - return 'sequence length is not a multiple of 3' - - # check if sequence is not shorter than the accepted minimum length - if len(original_seq) < min_len: - return 'sequence shorter than {0} nucleotides'.format(min_len) - - # try to translate in 4 different orientations - # or reach the conclusion that the sequence cannot be translated - i = 0 - translated = False - while translated is False: - translated_seq = translate_dna_aux(original_seq, translating_methods[i], table_id) - if not isinstance(translated_seq, list): - exception_collector.append('{0}({1})'.format(strands[i], - translated_seq.args[0])) - - i += 1 - if i == len(strands) or isinstance(translated_seq, list) is True: - translated = True - - coding_strand = strands[i-1] - - # if the sequence could be translated, return list with protein and DNA - # sequence in correct orientation - if isinstance(translated_seq, list): - return [translated_seq, coding_strand] - # if it could not be translated, return the string with all exception - # that were collected - else: - exception_str = ','.join(exception_collector) - return exception_str + """Determine if a DNA sequence is valid and translate it. + + Checks if sequence is valid and attempts to translate + it, calling several functions to ensure that the sequence + only has 'ACTG', is multiple of 3 and that it can be + translated in any of 4 different orientations. Stores + exceptions so that it is possible to understand why the + sequence could not be translated. + + Parameters + ---------- + dna_sequence : str + String representing a DNA sequence. + table_id : int + Translation table identifier. + min_len : int + Minimum sequence length. Sequences shorter + than this value are not translated. + + Returns + ------- + If the sequence can be translated: + sequence : list + List with two elemets, the protein sequence + and the DNA sequence in the correct orientation. + coding_strand : str + The sequence orientation that codes for the + protein. + Otherwise: + exception_str : str + A string containing the exceptions that + explain why the the sequence could not be + translated. 
+ """ + original_seq = dna_sequence.upper() + exception_collector = [] + strands = ['sense', 'antisense', 'revsense', 'revantisense'] + translating_methods = ['original', 'revcomp', 'rev', 'revrevcomp'] + + # check if the sequence has ambiguous bases + valid_dna = im.check_str_alphabet(original_seq, ct.DNA_BASES) + if valid_dna is not True: + return 'ambiguous or invalid characters' + + # check if sequence size is multiple of three + valid_length = im.check_str_multiple(original_seq, 3) + if valid_length is not True: + return 'sequence length is not a multiple of 3' + + # check if sequence is not shorter than the accepted minimum length + if len(original_seq) < min_len: + return 'sequence shorter than {0} nucleotides'.format(min_len) + + # try to translate in 4 different orientations + # or reach the conclusion that the sequence cannot be translated + i = 0 + translated = False + while translated is False: + translated_seq = translate_dna_aux(original_seq, translating_methods[i], table_id) + if not isinstance(translated_seq, list): + exception_collector.append('{0}({1})'.format(strands[i], + translated_seq.args[0])) + + i += 1 + if i == len(strands) or isinstance(translated_seq, list) is True: + translated = True + + coding_strand = strands[i-1] + + # if the sequence could be translated, return list with protein and DNA + # sequence in correct orientation + if isinstance(translated_seq, list): + return [translated_seq, coding_strand] + # if it could not be translated, return the string with all exception + # that were collected + else: + exception_str = ','.join(exception_collector) + return exception_str def determine_duplicated_seqs(sequences): - """Create mapping between sequences and sequence identifiers. - - Parameters - ---------- - sequences : dict - Dictionary with sequence identifiers as keys and - sequences as values. - - Returns - ------- - equal_seqs : dict - Dictionary with sequences as keys and sequence - identifiers that are associated with each - sequence as values. - """ - equal_seqs = {} - for seqid, seq in sequences.items(): - # if protein sequence was already added as key - if seq in equal_seqs: - # append new protid - equal_seqs[seq].append(seqid) - # else add new protein sequence as key and protid - # as value - else: - equal_seqs[seq] = [seqid] - - return equal_seqs + """Create mapping between sequences and sequence identifiers. + + Parameters + ---------- + sequences : dict + Dictionary with sequence identifiers as keys and + sequences as values. + + Returns + ------- + equal_seqs : dict + Dictionary with sequences as keys and sequence + identifiers that are associated with each + sequence as values. + """ + equal_seqs = {} + for seqid, seq in sequences.items(): + # if protein sequence was already added as key + if seq in equal_seqs: + # append new protid + equal_seqs[seq].append(seqid) + # else add new protein sequence as key and protid + # as value + else: + equal_seqs[seq] = [seqid] + + return equal_seqs def determine_longest(seqids, sequences): - """Find the longest sequence in a set of sequences. + """Find the longest sequence in a set of sequences. - Parameters - ---------- - seqids : list - List with sequence identifiers. - sequences : dict - Dictionary with sequence identifiers as keys - and sequences as values. + Parameters + ---------- + seqids : list + List with sequence identifiers. + sequences : dict + Dictionary with sequence identifiers as keys + and sequences as values. - Returns - ------- - chosen : str - Sequence identifier of the longest sequence. 
- """ - seqids_tups = [(seqid, sequences[seqid]) for seqid in seqids] - sorted_tups = sorted(seqids_tups, key=lambda x: len(x[1]), reverse=True) - chosen = sorted_tups[0][0] + Returns + ------- + chosen : str + Sequence identifier of the longest sequence. + """ + seqids_tups = [(seqid, sequences[seqid]) for seqid in seqids] + sorted_tups = sorted(seqids_tups, key=lambda x: len(x[1]), reverse=True) + chosen = sorted_tups[0][0] - return chosen + return chosen def determine_mode(values): - """Compute the mode based on a list of integers. + """Compute the mode based on a list of integers. - Parameters - ---------- - values : list - List with integer values. + Parameters + ---------- + values : list + List with integer values. - Returns - ------- - modes : list - The most frequent integer values. - """ - # Determine frequency of each value - counts = Counter(values) - # Order by frequency - most_common = counts.most_common() - # Get first value and any other value with same frequency - modes = [m[0] for m in most_common - if m[1] == most_common[0][1]] + Returns + ------- + modes : list + The most frequent integer values. + """ + # Determine frequency of each value + counts = Counter(values) + # Order by frequency + most_common = counts.most_common() + # Get first value and any other value with same frequency + modes = [m[0] for m in most_common + if m[1] == most_common[0][1]] - return modes + return modes def mode_filter(sequences, size_threshold): - """Find sequences with length that deviates from the mode value. - - Determines the mode from a set of input sequences and - identifies sequences that have a length value smaller - or greater than the mode based on a threshold. - - Parameters - ---------- - sequences : dict - Dictionary with sequence identifiers as keys and - sequences as values. - size_threshold : float - Sequences with +/- this value * mode will be - reported as above or below the mode. - - Returns - ------- - A list with the following variables: - modes : list - List with mode values determined based on the - length of input sequences. - alm : list - List with the sequence identifiers of the - sequences that are above the mode value by - mode*size_threshold. - asm : list - List with the sequence identifiers of the - sequences that are below the mode value by - mode*size_threshold. - seqs_lengths : dict - Dictionary with sequence identifiers as keys - and sequence lengths as values. - """ - # determine length value of all sequences - seqs_lengths = {seqid: len(seq) for seqid, seq in sequences.items()} - - # determine mode/s - modes = determine_mode(list(seqs_lengths.values())) - - # determine top and bot length value limits - max_mode = max(modes) - top_limit = max_mode + (max_mode*size_threshold) - min_mode = min(modes) - bot_limit = min_mode - (min_mode*size_threshold) - - # determine sequences that are below or above limits - alm = [seqid for seqid, length in seqs_lengths.items() - if length > top_limit] - asm = [seqid for seqid, length in seqs_lengths.items() - if length < bot_limit] - - return [modes, alm, asm, seqs_lengths] + """Find sequences with length that deviates from the mode value. + + Determines the mode from a set of input sequences and + identifies sequences that have a length value smaller + or greater than the mode based on a threshold. + + Parameters + ---------- + sequences : dict + Dictionary with sequence identifiers as keys and + sequences as values. 
+ size_threshold : float + Sequences with +/- this value * mode will be + reported as above or below the mode. + + Returns + ------- + A list with the following variables: + modes : list + List with mode values determined based on the + length of input sequences. + alm : list + List with the sequence identifiers of the + sequences that are above the mode value by + mode*size_threshold. + asm : list + List with the sequence identifiers of the + sequences that are below the mode value by + mode*size_threshold. + seqs_lengths : dict + Dictionary with sequence identifiers as keys + and sequence lengths as values. + """ + # determine length value of all sequences + seqs_lengths = {seqid: len(seq) for seqid, seq in sequences.items()} + + # determine mode/s + modes = determine_mode(list(seqs_lengths.values())) + + # determine top and bot length value limits + max_mode = max(modes) + top_limit = max_mode + (max_mode*size_threshold) + min_mode = min(modes) + bot_limit = min_mode - (min_mode*size_threshold) + + # determine sequences that are below or above limits + alm = [seqid for seqid, length in seqs_lengths.items() + if length > top_limit] + asm = [seqid for seqid, length in seqs_lengths.items() + if length < bot_limit] + + return [modes, alm, asm, seqs_lengths] def get_seqs_dicts(fasta_path, gene_id, table_id, min_len, size_threshold): - """Translate DNA sequences in a FASTA file. - - Translates the set of alleles from a gene. Identifies - sequences that cannot be translated according to the - criteria enforced by chewBBACA. - - Parameters - ---------- - fasta_path : str - Path to the FASTA file with DNA sequences. - gene_id : str - Gene identifier. - table_id : int - Translation table identifier. - min_len : int - Minimum sequence length. Sequences shorter - than this value are not translated. - size_threshold : float - Sequences with +/- this value * mode will be - reported as above or below the mode. - - Returns - ------- - List with following elements: - dna_seqs : dict - Dictionary with sequence identifiers as keys - and DNA sequences as values. - prot_seqs : dict - Dictionary with protein identifiers as keys - and Protein sequences as values. Keys are - consecutive integers to enable alignment - with BLASTp without getting exceptions due - to long sequence identifiers. - invalid : list - List with sequence identifiers of alleles - that are not valid because they could not be - translated. - seqids_map : dict - Dictionary with consecutive integers as keys - and original allele identifiers as values. - total_seqs : int - Total number of sequences that was processed - (including invalid alleles). 
- """ - sequences = fao.import_sequences(fasta_path) - - # Translate sequences - translated_seqs = {k: translate_dna(v, table_id, min_len) - for k, v in sequences.items()} - - # Add locus identifier to headers - # Some headers might only have the allele identifier - seqids = list(translated_seqs.keys()) - new_seqids = {} - for i in seqids: - new_id = '{0}_{1}'.format(gene_id, i.split('_')[-1]) - new_seqids[i] = new_id - - # Switch sequence ids - sequences = {new_seqids[k]: v - for k, v in translated_seqs.items()} - - valid = {k: v - for k, v in sequences.items() - if isinstance(v, list) is True} - invalid = [[k, v] - for k, v in sequences.items() - if isinstance(v, list) is False] - - seqid = 1 - seqids_map = {} - dna_seqs = {} - prot_seqs = {} - for k, v in valid.items(): - seqids_map[str(seqid)] = k - dna_seqs[k] = v[0][1] - prot_seqs[str(seqid)] = str(v[0][0]) - seqid += 1 - - if size_threshold is not None and len(prot_seqs) > 0: - # Remove alleles based on length mode and size threshold - modes, alm, asm, alleles_lengths = mode_filter(dna_seqs, size_threshold) - excluded = set(asm + alm) - - dna_seqs = {seqid: seq - for seqid, seq in dna_seqs.items() - if seqid not in excluded} - prot_seqs = {seqid: seq - for seqid, seq in prot_seqs.items() - if seqids_map[seqid] not in excluded} - - modes_concat = ':'.join(map(str, modes)) - st_percentage = int(size_threshold*100) - invalid += [[s, 'allele greater than {0}% locus length mode ' - '({1}>{2})'.format(st_percentage, - alleles_lengths[s], - modes_concat)] - for s in alm] - invalid += [[s, 'allele smaller than {0}% locus length mode ' - '({1}<{2})'.format(st_percentage, - alleles_lengths[s], - modes_concat)] - for s in asm] - - total_seqs = len(translated_seqs) - - return [dna_seqs, prot_seqs, invalid, seqids_map, total_seqs] + """Translate DNA sequences in a FASTA file. + + Translates the set of alleles from a gene. Identifies + sequences that cannot be translated according to the + criteria enforced by chewBBACA. + + Parameters + ---------- + fasta_path : str + Path to the FASTA file with DNA sequences. + gene_id : str + Gene identifier. + table_id : int + Translation table identifier. + min_len : int + Minimum sequence length. Sequences shorter + than this value are not translated. + size_threshold : float + Sequences with +/- this value * mode will be + reported as above or below the mode. + + Returns + ------- + List with following elements: + dna_seqs : dict + Dictionary with sequence identifiers as keys + and DNA sequences as values. + prot_seqs : dict + Dictionary with protein identifiers as keys + and Protein sequences as values. Keys are + consecutive integers to enable alignment + with BLASTp without getting exceptions due + to long sequence identifiers. + invalid : list + List with sequence identifiers of alleles + that are not valid because they could not be + translated. + seqids_map : dict + Dictionary with consecutive integers as keys + and original allele identifiers as values. + total_seqs : int + Total number of sequences that was processed + (including invalid alleles). 
+ """ + sequences = fao.import_sequences(fasta_path) + + # Translate sequences + translated_seqs = {k: translate_dna(v, table_id, min_len) + for k, v in sequences.items()} + + # Add locus identifier to headers + # Some headers might only have the allele identifier + seqids = list(translated_seqs.keys()) + new_seqids = {} + for i in seqids: + new_id = '{0}_{1}'.format(gene_id, i.split('_')[-1]) + new_seqids[i] = new_id + + # Switch sequence ids + sequences = {new_seqids[k]: v + for k, v in translated_seqs.items()} + + valid = {k: v + for k, v in sequences.items() + if isinstance(v, list) is True} + invalid = [[k, v] + for k, v in sequences.items() + if isinstance(v, list) is False] + + seqid = 1 + seqids_map = {} + dna_seqs = {} + prot_seqs = {} + for k, v in valid.items(): + seqids_map[str(seqid)] = k + dna_seqs[k] = v[0][1] + prot_seqs[str(seqid)] = str(v[0][0]) + seqid += 1 + + if size_threshold is not None and len(prot_seqs) > 0: + # Remove alleles based on length mode and size threshold + modes, alm, asm, alleles_lengths = mode_filter(dna_seqs, size_threshold) + excluded = set(asm + alm) + + dna_seqs = {seqid: seq + for seqid, seq in dna_seqs.items() + if seqid not in excluded} + prot_seqs = {seqid: seq + for seqid, seq in prot_seqs.items() + if seqids_map[seqid] not in excluded} + + modes_concat = ':'.join(map(str, modes)) + st_percentage = int(size_threshold*100) + invalid += [[s, 'allele greater than {0}% locus length mode ' + '({1}>{2})'.format(st_percentage, + alleles_lengths[s], + modes_concat)] + for s in alm] + invalid += [[s, 'allele smaller than {0}% locus length mode ' + '({1}<{2})'.format(st_percentage, + alleles_lengths[s], + modes_concat)] + for s in asm] + + total_seqs = len(translated_seqs) + + return [dna_seqs, prot_seqs, invalid, seqids_map, total_seqs] def translate_coding_sequences(seqids, protein_file, sequences_file, - translation_table, minimum_length): - """Translate coding sequences. - - Parameters - ---------- - seqids : list - List with the sequence identifiers of the sequences - to be translated. - protein_file : str - Path to a file to save protein sequences. - sequences_file : str - Path to the FASTA file that contains the DNA sequences. - translation_table : int - Translation table identifier. - minimum_length : int - The minimum sequence length value. - - Returns - ------- - A list with following elements: - invalid_alleles : list - List with one sublist per invalid allele. - Each sublist contains a sequence identifer - and the exception message returned after - attempting translation. - total_seqs : int - Total number of DNA sequences that were - translated. 
- """ - # define limit of records to keep in memory - total_seqs = 0 - prot_lines = [] - line_limit = 10000 - invalid_alleles = [] - cds_index = fao.index_fasta(sequences_file) - for i, seqid in enumerate(seqids): - sequence = str(cds_index.get(seqid).seq) - - translation = translate_dna(sequence, translation_table, minimum_length) - if isinstance(translation, list): - prot_lines.append('>{0}'.format(seqid)) - prot_lines.append(str(translation[0][0])) - total_seqs += 1 - # if returned value is a string, translation failed and - # string contains exceptions - elif isinstance(translation, str): - invalid_alleles.append([seqid, translation]) - - if len(prot_lines)//2 == line_limit or i+1 == len(seqids): - prot_lines = im.join_list(prot_lines, '\n') - fo.write_to_file(prot_lines, protein_file, 'a', '\n') - prot_lines = [] - - return [invalid_alleles, total_seqs] + translation_table, minimum_length): + """Translate coding sequences. + + Parameters + ---------- + seqids : list + List with the sequence identifiers of the sequences + to be translated. + protein_file : str + Path to a file to save protein sequences. + sequences_file : str + Path to the FASTA file that contains the DNA sequences. + translation_table : int + Translation table identifier. + minimum_length : int + The minimum sequence length value. + + Returns + ------- + A list with following elements: + invalid_alleles : list + List with one sublist per invalid allele. + Each sublist contains a sequence identifer + and the exception message returned after + attempting translation. + total_seqs : int + Total number of DNA sequences that were + translated. + """ + # define limit of records to keep in memory + total_seqs = 0 + prot_lines = [] + line_limit = 10000 + invalid_alleles = [] + cds_index = fao.index_fasta(sequences_file) + for i, seqid in enumerate(seqids): + sequence = str(cds_index.get(seqid).seq) + + translation = translate_dna(sequence, translation_table, minimum_length) + if isinstance(translation, list): + prot_lines.append('>{0}'.format(seqid)) + prot_lines.append(str(translation[0][0])) + total_seqs += 1 + # if returned value is a string, translation failed and + # string contains exceptions + elif isinstance(translation, str): + invalid_alleles.append([seqid, translation]) + + if len(prot_lines)//2 == line_limit or i+1 == len(seqids): + prot_lines = im.join_list(prot_lines, '\n') + fo.write_to_file(prot_lines, protein_file, 'a', '\n') + prot_lines = [] + + return [invalid_alleles, total_seqs] def determine_distinct(sequences_file, unique_fasta, map_ids): - """Identify duplicated sequences in a FASTA file. - - Parameters - ---------- - sequences_file : str - Path to a FASTA file. - unique_fasta : str - Path to a FASTA file that will be created to - store distinct sequences. - map_ids : dict - Dictionary with mapping between genome string - identifiers and genome integer identifiers. - - Returns - ------- - pickle_out : str - Pickled file that contains a dictionary with sequence - hashes as keys and a list with pairs of protein - identifiers and genome identifiers as values - """ - out_seqs = [] - duplicates = {} - exhausted = False - # limit of 10000 Fasta records in memory - out_limit = 10000 - seq_generator = fao.sequence_generator(sequences_file) - while exhausted is False: - record = next(seq_generator, None) - if record is not None: - # seq object has to be converted to string - seqid = record.id - sequence = str(record.seq.upper()) - - # use digest() instead of hexdigest() to reduce memory usage? 
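
On the digest()/hexdigest() question raised in the comment above, a small self-contained comparison (this assumes im.hash_sequence wraps hashlib's SHA-256; the actual helper may use a different algorithm):

import hashlib
import sys

sequence = 'ATGACCGAGTTCCTGAAA'
hex_hash = hashlib.sha256(sequence.encode()).hexdigest()  # 64-character str
raw_hash = hashlib.sha256(sequence.encode()).digest()     # 32 raw bytes
print(sys.getsizeof(hex_hash), sys.getsizeof(raw_hash))   # the bytes object is noticeably smaller
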
- seq_hash = im.hash_sequence(sequence) - - # add unseen sequence to Fasta file with distinct sequences - if seq_hash not in duplicates: - recout = fao.fasta_str_record(ct.FASTA_RECORD_TEMPLATE, [seqid, sequence]) - out_seqs.append(recout) - - # add CDS hash as key - # add genome integer identifier and protein identifier to values list - # genome identifier and protein identifier can be used to fetch sequences - genome_id, protid = seqid.split('-protein') - genome_id = map_ids[genome_id] - duplicates.setdefault(seq_hash, []).extend([int(protid), int(genome_id)]) - else: - exhausted = True - - # write Fasta records to file - if len(out_seqs) == out_limit or exhausted is True: - if len(out_seqs) > 0: - out_seqs = im.join_list(out_seqs, '\n') - fo.write_to_file(out_seqs, unique_fasta, 'a', '\n') - # reset list to avoid writing same records multiple times - out_seqs = [] - - # save dictionary with genome integer identifiers per distinct sequence - # to pickle and only return file path to avoid keeping all dicts from - # parallel processes in memory - pickle_out = unique_fasta.replace('.fasta', '.duplicates') - fo.pickle_dumper(duplicates, pickle_out) - - return pickle_out + """Identify duplicated sequences in a FASTA file. + + Parameters + ---------- + sequences_file : str + Path to a FASTA file. + unique_fasta : str + Path to a FASTA file that will be created to + store distinct sequences. + map_ids : dict + Dictionary with mapping between genome string + identifiers and genome integer identifiers. + + Returns + ------- + pickle_out : str + Pickled file that contains a dictionary with sequence + hashes as keys and a list with pairs of protein + identifiers and genome identifiers as values + """ + out_seqs = [] + duplicates = {} + exhausted = False + # limit of 10000 Fasta records in memory + out_limit = 10000 + seq_generator = fao.sequence_generator(sequences_file) + while exhausted is False: + record = next(seq_generator, None) + if record is not None: + # seq object has to be converted to string + seqid = record.id + sequence = str(record.seq.upper()) + + # use digest() instead of hexdigest() to reduce memory usage? + seq_hash = im.hash_sequence(sequence) + + # add unseen sequence to Fasta file with distinct sequences + if seq_hash not in duplicates: + recout = fao.fasta_str_record(ct.FASTA_RECORD_TEMPLATE, [seqid, sequence]) + out_seqs.append(recout) + + # add CDS hash as key + # add genome integer identifier and protein identifier to values list + # genome identifier and protein identifier can be used to fetch sequences + genome_id, protid = seqid.split('-protein') + genome_id = map_ids[genome_id] + duplicates.setdefault(seq_hash, []).extend([int(protid), int(genome_id)]) + else: + exhausted = True + + # write Fasta records to file + if len(out_seqs) == out_limit or exhausted is True: + if len(out_seqs) > 0: + out_seqs = im.join_list(out_seqs, '\n') + fo.write_to_file(out_seqs, unique_fasta, 'a', '\n') + # reset list to avoid writing same records multiple times + out_seqs = [] + + # save dictionary with genome integer identifiers per distinct sequence + # to pickle and only return file path to avoid keeping all dicts from + # parallel processes in memory + pickle_out = unique_fasta.replace('.fasta', '.duplicates') + fo.pickle_dumper(duplicates, pickle_out) + + return pickle_out def determine_small(sequences_file, minimum_length, variation=0): - """Find protein sequences that are shorter than a specified length. - - Parameters - ---------- - sequences_file : str - Path to a FASTA file. 
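
To make the pickled output of determine_distinct easier to interpret, a hypothetical reader for the .duplicates file (the path is made up, and fo.pickle_dumper is assumed to be a thin wrapper around pickle.dump):

import pickle

# hypothetical path; determine_distinct derives it from the unique FASTA file name
with open('temp/distinct_dna.duplicates', 'rb') as infile:
	duplicates = pickle.load(infile)

# each value is a flat list of alternating protein and genome integer identifiers
for seq_hash, ids in duplicates.items():
	pairs = list(zip(ids[0::2], ids[1::2]))  # [(protid, genome_id), ...]
	print(seq_hash, 'seen in', len(pairs), 'CDSs')
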
- minimum_length : int - Sequences with a length value below this value - are considered small. - variation : float - Accept sequences with length variation of up to - minus (`minimum_length`*`variation`). - - Returns - ------- - small_seqids : list - List with the identifiers of small sequences. - """ - variation = minimum_length - (minimum_length*variation) - seq_generator = fao.sequence_generator(sequences_file) - small_seqids = [] - for record in seq_generator: - if len(record.seq) < variation: - small_seqids.append(record.id) - - return small_seqids + """Find protein sequences that are shorter than a specified length. + + Parameters + ---------- + sequences_file : str + Path to a FASTA file. + minimum_length : int + Sequences with a length value below this value + are considered small. + variation : float + Accept sequences with length variation of up to + minus (`minimum_length`*`variation`). + + Returns + ------- + small_seqids : list + List with the identifiers of small sequences. + """ + variation = minimum_length - (minimum_length*variation) + seq_generator = fao.sequence_generator(sequences_file) + small_seqids = [] + for record in seq_generator: + if len(record.seq) < variation: + small_seqids.append(record.id) + + return small_seqids def apply_bsr(blast_results, fasta_file, bsr): - """Find similar sequences based on the BLAST Score Ratio. - - Parameters - ---------- - blast_results : list - List with the path to a file with BLAST - results in tabular format. - fasta_file : str - Path to a FASTA file that contains the - sequences that were aligned. - bsr : float - The BSR value to use as threshold - - Returns - ------- - excluded_alleles : list - List with the identifiers of the sequences - that were highly similar to other sequences. - """ - # Separate self-results from other results - self_scores = {r[0]: r[-1] for r in blast_results if r[0] == r[4]} - blast_results = [r for r in blast_results if r[0] != r[4]] - - # Determine sequence lengths - lengths = {} - for k in self_scores: - record = fasta_file.get(k) - sequence = str(record.seq) - lengths[k] = len(sequence) - - # Exclude based on BSR - excluded_alleles = [] - for result in blast_results: - query = result[0] - target = result[4] - score = result[-1] - if query not in excluded_alleles: - # Determine sequence to exclude based on BSR - try: - self_blast_score = self_scores[query] - query_length = lengths[query] - target_length = lengths[target] - blast_score_ratio = float(score)/float(self_blast_score) - - # Only proceed if target has not been excluded - if blast_score_ratio >= bsr and target not in excluded_alleles: - # Exclude query if target is bigger - if target_length > query_length and query not in excluded_alleles: - excluded_alleles.append(query) - # Exclude target if query is bigger - elif target_length <= query_length: - excluded_alleles.append(target) - # It might not be possible to determine the self-score for some sequences - # This might be related with composition-based stats being enabled - except Exception: - excluded_alleles.append(query) - - return excluded_alleles + """Find similar sequences based on the BLAST Score Ratio. + + Parameters + ---------- + blast_results : list + List with the path to a file with BLAST + results in tabular format. + fasta_file : str + Path to a FASTA file that contains the + sequences that were aligned. 
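
A short worked example of the BLAST Score Ratio test that apply_bsr performs below; the scores are made up for illustration:

# raw self-alignment score of the query and its score against another allele
self_score = 980.0
alignment_score = 850.0
blast_score_ratio = alignment_score / self_score  # ~0.87

bsr_threshold = 0.6
if blast_score_ratio >= bsr_threshold:
	# the pair is treated as highly similar; the longer sequence is kept
	# and the shorter one goes into the exclusion list
	print('exclude the shorter allele (BSR={0:.2f})'.format(blast_score_ratio))
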
+ bsr : float + The BSR value to use as threshold + + Returns + ------- + excluded_alleles : list + List with the identifiers of the sequences + that were highly similar to other sequences. + """ + # Separate self-results from other results + self_scores = {r[0]: r[-1] for r in blast_results if r[0] == r[4]} + blast_results = [r for r in blast_results if r[0] != r[4]] + + # Determine sequence lengths + lengths = {} + for k in self_scores: + record = fasta_file.get(k) + sequence = str(record.seq) + lengths[k] = len(sequence) + + # Exclude based on BSR + excluded_alleles = [] + for result in blast_results: + query = result[0] + target = result[4] + score = result[-1] + if query not in excluded_alleles: + # Determine sequence to exclude based on BSR + try: + self_blast_score = self_scores[query] + query_length = lengths[query] + target_length = lengths[target] + blast_score_ratio = float(score)/float(self_blast_score) + + # Only proceed if target has not been excluded + if blast_score_ratio >= bsr and target not in excluded_alleles: + # Exclude query if target is bigger + if target_length > query_length and query not in excluded_alleles: + excluded_alleles.append(query) + # Exclude target if query is bigger + elif target_length <= query_length: + excluded_alleles.append(target) + # It might not be possible to determine the self-score for some sequences + # This might be related with composition-based stats being enabled + except Exception: + excluded_alleles.append(query) + + return excluded_alleles diff --git a/CHEWBBACA/utils/uniprot_requests.py b/CHEWBBACA/utils/uniprot_requests.py index ce375dce..96d85509 100644 --- a/CHEWBBACA/utils/uniprot_requests.py +++ b/CHEWBBACA/utils/uniprot_requests.py @@ -17,320 +17,320 @@ from SPARQLWrapper import SPARQLWrapper, JSON try: - from utils import (constants as ct, - file_operations as fo, - fasta_operations as fao, - sequence_manipulation as sm) + from utils import (constants as ct, + file_operations as fo, + fasta_operations as fao, + sequence_manipulation as sm) except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - file_operations as fo, - fasta_operations as fao, - sequence_manipulation as sm) + from CHEWBBACA.utils import (constants as ct, + file_operations as fo, + fasta_operations as fao, + sequence_manipulation as sm) UNIPROT_SERVER = SPARQLWrapper(ct.UNIPROT_SPARQL) def website_availability(url): - """Determine if a website is reachable. + """Determine if a website is reachable. - Parameters - ---------- - url : str - URL to a website. + Parameters + ---------- + url : str + URL to a website. - Returns - ------- - reponse : http.client.HTTPResponse or urllib.error.HTTPError - Response with status code 200 if website is reachable - or a different status code if the website was not reachable. - """ - try: - response = urlopen(url) - except Exception as e: - response = e + Returns + ------- + reponse : http.client.HTTPResponse or urllib.error.HTTPError + Response with status code 200 if website is reachable + or a different status code if the website was not reachable. + """ + try: + response = urlopen(url) + except Exception as e: + response = e - return response + return response def select_name(result): - """Extract an annotation from the result of a query to UniProt. - - Parameters - ---------- - result : dict - A dictionary with the results from querying - the UniProt SPARQL endpoint. - - Returns - ------- - A list with the following elements: - selected_name : str - The annotation description. 
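
For context on the parsing in select_name below, the SPARQL endpoint returns JSON that roughly follows the shape mocked here (all values are fabricated, and the import assumes chewBBACA is installed):

from CHEWBBACA.utils.uniprot_requests import select_name

# minimal mock of a SPARQLWrapper JSON result; real responses carry more fields
result = {
	'results': {
		'bindings': [
			{'fname': {'value': 'Chaperone protein DnaK'},
			 'seq': {'value': 'http://purl.uniprot.org/uniprot/P12345'},
			 'label': {'value': 'dnaK'}},
			{'sname2': {'value': 'Uncharacterized protein'},
			 'seq': {'value': 'http://purl.uniprot.org/uniprot/P67890'}}
		]
	}
}

name, url, label = select_name(result)
print(name, url, label)  # the first informative description is selected
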
- selected_url : str - The URL to the UniProt page about the protein record. - selected_label : str - A label that has descriptive value. - """ - url = '' - name = '' - label = '' - selected_name = '' - selected_url = '' - selected_label = '' - - i = 0 - found = False - # Get the entries with results - try: - aux = result['results']['bindings'] - # Response does not contain annotation data - except Exception as e: - aux = {} - - # only check results that are not empty - if len(aux) > 0: - # iterate over all results to find suitable - while found is False: - current_res = aux[i] - res_keys = aux[i].keys() - - # annotation name can be associated - # to different keys - if 'fname' in res_keys: - name = str(current_res['fname']['value']) - elif 'sname2' in res_keys: - name = str(current_res['sname2']['value']) - elif 'label' in res_keys: - name = str(current_res['label']['value']) - - if 'label' in res_keys: - label = str(current_res['label']['value']) - else: - label = name - - # get UniProt URL - if 'uri' in res_keys: - url = str(current_res['seq']['value']) - elif 'seq' in res_keys: - url = str(current_res['seq']['value']) - - if any([term in name for term in ct.UNIPROT_UNINFORMATIVE]) is False: - selected_name = name - selected_url = url - selected_label = label - found = True - else: - if selected_name == '': - selected_name = name - selected_url = url - selected_label = label - - i += 1 - if i == len(aux): - found = True - - return [selected_name, selected_url, selected_label] + """Extract an annotation from the result of a query to UniProt. + + Parameters + ---------- + result : dict + A dictionary with the results from querying + the UniProt SPARQL endpoint. + + Returns + ------- + A list with the following elements: + selected_name : str + The annotation description. + selected_url : str + The URL to the UniProt page about the protein record. + selected_label : str + A label that has descriptive value. + """ + url = '' + name = '' + label = '' + selected_name = '' + selected_url = '' + selected_label = '' + + i = 0 + found = False + # Get the entries with results + try: + aux = result['results']['bindings'] + # Response does not contain annotation data + except Exception as e: + aux = {} + + # only check results that are not empty + if len(aux) > 0: + # iterate over all results to find suitable + while found is False: + current_res = aux[i] + res_keys = aux[i].keys() + + # annotation name can be associated + # to different keys + if 'fname' in res_keys: + name = str(current_res['fname']['value']) + elif 'sname2' in res_keys: + name = str(current_res['sname2']['value']) + elif 'label' in res_keys: + name = str(current_res['label']['value']) + + if 'label' in res_keys: + label = str(current_res['label']['value']) + else: + label = name + + # get UniProt URL + if 'uri' in res_keys: + url = str(current_res['seq']['value']) + elif 'seq' in res_keys: + url = str(current_res['seq']['value']) + + if any([term in name for term in ct.UNIPROT_UNINFORMATIVE]) is False: + selected_name = name + selected_url = url + selected_label = label + found = True + else: + if selected_name == '': + selected_name = name + selected_url = url + selected_label = label + + i += 1 + if i == len(aux): + found = True + + return [selected_name, selected_url, selected_label] def uniprot_query(sequence): - """Construct a SPARQL query to search for exact matches in UniProt. - - Parameters - ---------- - sequence : str - The Protein sequence that will be added to - the query/searched for. 
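
The preference logic above depends on a list of uninformative terms (ct.UNIPROT_UNINFORMATIVE, defined in constants.py and not shown in this diff); a self-contained sketch of the same idea with a hypothetical term list:

# hypothetical terms; the real list lives in CHEWBBACA/utils/constants.py
UNINFORMATIVE = ['uncharacterized', 'hypothetical', 'duf']

candidates = ['Uncharacterized protein', 'DNA gyrase subunit B']
selected = ''
for name in candidates:
	if not any(term in name.lower() for term in UNINFORMATIVE):
		selected = name   # informative description, stop searching
		break
	elif selected == '':
		selected = name   # keep the first hit as a fallback
print(selected)  # DNA gyrase subunit B
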
- - Returns - ------- - query : str - The SPARQL query that will allow to search for - exact matches in the UniProt database. - """ - query = ('PREFIX rdf: ' - 'PREFIX up: ' - 'select ?seq ?fname ?sname2 ?label where {' - '{?b a up:Simple_Sequence; rdf:value ' - '"'+sequence+'". ?seq up:sequence ?b. ' - 'OPTIONAL{?seq up:submittedName ?sname. ?sname up:fullName ?sname2} ' - 'OPTIONAL{?seq up:recommendedName ?rname.?rname up:fullName ?fname} }' - 'UNION{?seq a up:Sequence; rdf:value "'+sequence+'"; ' - 'rdfs:label ?label. }}') - - return query + """Construct a SPARQL query to search for exact matches in UniProt. + + Parameters + ---------- + sequence : str + The Protein sequence that will be added to + the query/searched for. + + Returns + ------- + query : str + The SPARQL query that will allow to search for + exact matches in the UniProt database. + """ + query = ('PREFIX rdf: ' + 'PREFIX up: ' + 'select ?seq ?fname ?sname2 ?label where {' + '{?b a up:Simple_Sequence; rdf:value ' + '"'+sequence+'". ?seq up:sequence ?b. ' + 'OPTIONAL{?seq up:submittedName ?sname. ?sname up:fullName ?sname2} ' + 'OPTIONAL{?seq up:recommendedName ?rname.?rname up:fullName ?fname} }' + 'UNION{?seq a up:Sequence; rdf:value "'+sequence+'"; ' + 'rdfs:label ?label. }}') + + return query def get_data(sparql_query): - """Send requests to query UniProts's SPARQL endpoint. - - Parameters - ---------- - sparql_query : str - SPARQL query. - - Returns - ------- - result : dict - Dictionary with data retrieved from UniProt. - """ - tries = 0 - failed = [] - max_tries = ct.MAX_RETRIES - success = False - while success is False and tries < max_tries: - try: - UNIPROT_SERVER.setQuery(sparql_query) - UNIPROT_SERVER.setReturnFormat(JSON) - UNIPROT_SERVER.setTimeout(60) - result = UNIPROT_SERVER.query().convert() - success = True - # Might fail if sequence is too long for GET method(URITooLong) - except Exception as e: - failed.append(e) - tries += 1 - time.sleep(1) - - if success is False: - result = {} - - return [result, failed] + """Send requests to query UniProts's SPARQL endpoint. + + Parameters + ---------- + sparql_query : str + SPARQL query. + + Returns + ------- + result : dict + Dictionary with data retrieved from UniProt. + """ + tries = 0 + failed = [] + max_tries = ct.MAX_RETRIES + success = False + while success is False and tries < max_tries: + try: + UNIPROT_SERVER.setQuery(sparql_query) + UNIPROT_SERVER.setReturnFormat(JSON) + UNIPROT_SERVER.setTimeout(60) + result = UNIPROT_SERVER.query().convert() + success = True + # Might fail if sequence is too long for GET method(URITooLong) + except Exception as e: + failed.append(e) + tries += 1 + time.sleep(1) + + if success is False: + result = {} + + return [result, failed] def get_proteomes(proteome_ids, output_dir): - """Download reference proteomes from UniProt's FTP. - - Parameters - ---------- - proteomes : list - List with a sublist per proteome to download. - Each sublist has the information about a proteome - that was contained in the README file with the list - of UniProt's reference proteomes. - output_dir : str - Path to the output directory where downloaded - proteomes will be saved to. - - Returns - ------- - proteomes_files : list - Local paths to the downloaded proteomes. 
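
The two PREFIX declarations in the query strings above presumably expand to the standard rdf and UniProt core namespaces; a hedged, standalone sketch of an equivalent query sent through SPARQLWrapper (the endpoint URL and namespace IRIs are assumptions, since ct.UNIPROT_SPARQL is not shown in this diff):

from SPARQLWrapper import SPARQLWrapper, JSON

endpoint = SPARQLWrapper('https://sparql.uniprot.org/sparql')  # assumed value of ct.UNIPROT_SPARQL
sequence = 'MASNKLDT'  # placeholder protein sequence

query = (
	'PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> '
	'PREFIX up:<http://purl.uniprot.org/core/> '
	'SELECT ?seq ?fname ?sname2 ?label WHERE {'
	'{?b a up:Simple_Sequence; rdf:value "' + sequence + '". ?seq up:sequence ?b. '
	'OPTIONAL{?seq up:submittedName ?sname. ?sname up:fullName ?sname2} '
	'OPTIONAL{?seq up:recommendedName ?rname. ?rname up:fullName ?fname} }'
	'UNION{?seq a up:Sequence; rdf:value "' + sequence + '"; rdfs:label ?label.}}')

endpoint.setQuery(query)
endpoint.setReturnFormat(JSON)
endpoint.setTimeout(60)
result = endpoint.query().convert()  # dict with a 'results' -> 'bindings' list
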
- """ - print('Downloading reference proteomes...') - # construct FTP URLs for each proteome - downloaded = 0 - proteomes_files = [] - for pid in proteome_ids: - domain = '{0}{1}'.format(pid[3][0].upper(), pid[3][1:]) - proteome_id = '{0}_{1}'.format(pid[0], pid[1]) - proteome_file = '{0}.fasta.gz'.format(proteome_id) - local_proteome_file = fo.join_paths(output_dir, [proteome_file]) - proteome_url = fo.join_paths(ct.UNIPROT_PROTEOMES_FTP, [domain, pid[0], proteome_file]) - res = fo.download_file(proteome_url, local_proteome_file) - proteomes_files.append(local_proteome_file) - downloaded += 1 - print('\r', 'Downloaded {0}/{1}'.format(downloaded, len(proteome_ids)), end='') - time.sleep(0.1) - - return proteomes_files + """Download reference proteomes from UniProt's FTP. + + Parameters + ---------- + proteomes : list + List with a sublist per proteome to download. + Each sublist has the information about a proteome + that was contained in the README file with the list + of UniProt's reference proteomes. + output_dir : str + Path to the output directory where downloaded + proteomes will be saved to. + + Returns + ------- + proteomes_files : list + Local paths to the downloaded proteomes. + """ + print('Downloading reference proteomes...') + # construct FTP URLs for each proteome + downloaded = 0 + proteomes_files = [] + for pid in proteome_ids: + domain = '{0}{1}'.format(pid[3][0].upper(), pid[3][1:]) + proteome_id = '{0}_{1}'.format(pid[0], pid[1]) + proteome_file = '{0}.fasta.gz'.format(proteome_id) + local_proteome_file = fo.join_paths(output_dir, [proteome_file]) + proteome_url = fo.join_paths(ct.UNIPROT_PROTEOMES_FTP, [domain, pid[0], proteome_file]) + res = fo.download_file(proteome_url, local_proteome_file) + proteomes_files.append(local_proteome_file) + downloaded += 1 + print('\r', 'Downloaded {0}/{1}'.format(downloaded, len(proteome_ids)), end='') + time.sleep(0.1) + + return proteomes_files def get_annotation(gene, translation_table): - """Retrieve and select best annotation for a locus. - - Parameters - ---------- - gene : str - Path to a FASTA file with the DNA sequences for - the alleles of the locus. - translation_table : int - Translation table used to translate DNA sequences. - - Returns - ------- - gene : str - Path to a FASTA file with the DNA sequences for - the alleles of the locus. - selected_name : str - Product name selected from the terms retrieved from UniProt. - selected_url : str - URL for the page with information about the selected terms. - """ - selected_url = '' - selected_name = '' - # Import locus alleles - sequences = fao.import_sequences(gene) - queried = [] - for seqid, sequence in sequences.items(): - # Translate allele - protein_sequence = str(sm.translate_sequence(sequence, - table_id=translation_table)) - - if protein_sequence not in queried: - query = uniprot_query(protein_sequence) - result, failed = get_data(query) - queried.append(protein_sequence) - if len(result) > 0: - name, url, label = select_name(result) - - lowercase_name = name.lower() - if any([term in lowercase_name for term in ct.UNIPROT_UNINFORMATIVE]) is True: - if selected_name == '': - selected_name = name - selected_url = url - continue - elif name == '': - continue - else: - selected_name = name - selected_url = url - break - - return [gene, selected_name, selected_url, failed] + """Retrieve and select best annotation for a locus. + + Parameters + ---------- + gene : str + Path to a FASTA file with the DNA sequences for + the alleles of the locus. 
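
To illustrate how get_proteomes assembles each download URL, a worked example with a fabricated proteome record; the base URL stands in for ct.UNIPROT_PROTEOMES_FTP, which is not part of this diff:

# fields follow the layout used above: [proteome id, taxonomy id, ..., superkingdom]
pid = ['UP000000625', '83333', '4403', 'bacteria']
base_url = ('https://ftp.uniprot.org/pub/databases/uniprot/'
            'current_release/knowledgebase/reference_proteomes')  # assumed

domain = '{0}{1}'.format(pid[3][0].upper(), pid[3][1:])            # 'Bacteria'
proteome_file = '{0}_{1}.fasta.gz'.format(pid[0], pid[1])          # 'UP000000625_83333.fasta.gz'
proteome_url = '/'.join([base_url, domain, pid[0], proteome_file])
print(proteome_url)
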
+ translation_table : int + Translation table used to translate DNA sequences. + + Returns + ------- + gene : str + Path to a FASTA file with the DNA sequences for + the alleles of the locus. + selected_name : str + Product name selected from the terms retrieved from UniProt. + selected_url : str + URL for the page with information about the selected terms. + """ + selected_url = '' + selected_name = '' + # Import locus alleles + sequences = fao.import_sequences(gene) + queried = [] + for seqid, sequence in sequences.items(): + # Translate allele + protein_sequence = str(sm.translate_sequence(sequence, + table_id=translation_table)) + + if protein_sequence not in queried: + query = uniprot_query(protein_sequence) + result, failed = get_data(query) + queried.append(protein_sequence) + if len(result) > 0: + name, url, label = select_name(result) + + lowercase_name = name.lower() + if any([term in lowercase_name for term in ct.UNIPROT_UNINFORMATIVE]) is True: + if selected_name == '': + selected_name = name + selected_url = url + continue + elif name == '': + continue + else: + selected_name = name + selected_url = url + break + + return [gene, selected_name, selected_url, failed] def extract_proteome_terms(header_items): - """Extract information from the header of a proteome record. - - Extracts the sequence identifier, product name, gene name - and species name fields from the sequence header of a - reference proteome from Uniprot. - - Parameters - ---------- - header_items : dict - Dictionary with the keys and values from a Biopython - Bio.SeqRecord.SeqRecord object. Created by passing the - Biopython Bio.SeqRecord.SeqRecord object to the vars - function. - - Returns - ------- - seqid : str - Sequence identifier of the record. Composed - of the db, UniqueIdentifier and EntryName fields. - record_product : str - ProteinName field value in the sequence header. - record_gene_name : str - GeneName field (GN) in the sequence header. - record_species : str - OrganismName field (OS) in the sequence header. - """ - # some tags might be missing - seqid = header_items.get('id', 'not_found') - record_description = header_items.get('description', 'not_found') - if record_description != '': - if 'OS=' in record_description: - # get organism name - record_species = (record_description.split('OS=')[1]).split(' OX=')[0] - record_product = (record_description.split(seqid+' ')[1]).split(' OS=')[0] - else: - record_species = 'not_found' - record_product = 'not_found' - - if 'GN=' in record_description: - record_gene_name = (record_description.split('GN=')[1]).split(' PE=')[0] - else: - record_gene_name = 'not_found' - - return [seqid, record_product, record_gene_name, record_species] + """Extract information from the header of a proteome record. + + Extracts the sequence identifier, product name, gene name + and species name fields from the sequence header of a + reference proteome from Uniprot. + + Parameters + ---------- + header_items : dict + Dictionary with the keys and values from a Biopython + Bio.SeqRecord.SeqRecord object. Created by passing the + Biopython Bio.SeqRecord.SeqRecord object to the vars + function. + + Returns + ------- + seqid : str + Sequence identifier of the record. Composed + of the db, UniqueIdentifier and EntryName fields. + record_product : str + ProteinName field value in the sequence header. + record_gene_name : str + GeneName field (GN) in the sequence header. + record_species : str + OrganismName field (OS) in the sequence header. 
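
For reference, a typical UniProt proteome FASTA header and the substrings that the field checks described here capture (the record itself is fabricated):

header = ('sp|P12345|EXAMPLE_ECOLI Example protein name '
          'OS=Escherichia coli OX=562 GN=exaA PE=1 SV=1')

seqid = header.split(' ')[0]                               # 'sp|P12345|EXAMPLE_ECOLI'
species = header.split('OS=')[1].split(' OX=')[0]          # 'Escherichia coli'
product = header.split(seqid + ' ')[1].split(' OS=')[0]    # 'Example protein name'
gene_name = header.split('GN=')[1].split(' PE=')[0]        # 'exaA'
print(seqid, product, gene_name, species)
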
+ """ + # some tags might be missing + seqid = header_items.get('id', 'not_found') + record_description = header_items.get('description', 'not_found') + if record_description != '': + if 'OS=' in record_description: + # get organism name + record_species = (record_description.split('OS=')[1]).split(' OX=')[0] + record_product = (record_description.split(seqid+' ')[1]).split(' OS=')[0] + else: + record_species = 'not_found' + record_product = 'not_found' + + if 'GN=' in record_description: + record_gene_name = (record_description.split('GN=')[1]).split(' PE=')[0] + else: + record_gene_name = 'not_found' + + return [seqid, record_product, record_gene_name, record_species]
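
A minimal end-to-end usage sketch, assuming chewBBACA and Biopython are installed and a decompressed reference proteome sits at the hypothetical path below:

from Bio import SeqIO
from CHEWBBACA.utils.uniprot_requests import extract_proteome_terms

# hypothetical path to a decompressed reference proteome FASTA file
for record in SeqIO.parse('proteomes/UP000000625_83333.fasta', 'fasta'):
	seqid, product, gene_name, species = extract_proteome_terms(vars(record))
	print(seqid, product, gene_name, species)
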