diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..ac5c292 Binary files /dev/null and b/.DS_Store differ diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..a3ed7f4 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,14 @@ +# Codecov configuration to make it a bit less noisy +coverage: + status: + patch: false + project: + default: + threshold: 50% +comment: + layout: "header" + require_changes: false + branches: null + behavior: default + flags: null + paths: null \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f20f8f6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,106 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +.pytest_cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# profraw files from LLVM? 
Unclear exactly what triggers this +# There are reports this comes from LLVM profiling, but also Xcode 9. +*profraw diff --git a/.lgtm.yml b/.lgtm.yml new file mode 100644 index 0000000..e2d0de7 --- /dev/null +++ b/.lgtm.yml @@ -0,0 +1,12 @@ +# Configure LGTM for this package + +extraction: + python: # Configure Python + python_setup: # Configure the setup + version: 3 # Specify Version 3 +path_classifiers: + library: + - versioneer.py # Set Versioneer.py to an external "library" (3rd party code) + - devtools/* + generated: + - metapredict/_version.py diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..e16f772 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,51 @@ +language: python + +# Run jobs on container-based infrastructure, can be overridden per job + +matrix: + include: + # Extra includes for OSX since python language is not available by default on OSX + - os: osx + language: generic + env: PYTHON_VER=3.6 + - os: osx + language: generic + env: PYTHON_VER=3.7 + + + - os: linux + language: generic # No need to set Python version since its conda + env: PYTHON_VER=3.6 + - os: linux + language: generic + env: PYTHON_VER=3.7 + + +before_install: + # Additional info about the build + - uname -a + - df -h + - ulimit -a + + # Install the Python environment + - source devtools/travis-ci/before_install.sh + - python -V + +install: + + # Create test environment for package + - python devtools/scripts/create_conda_env.py -n=test -p=$PYTHON_VER devtools/conda-envs/test_env.yaml + # Activate the test environment + - conda activate test + # Build and install package + - python setup.py develop --no-deps + + +script: + - pytest -v --cov=metapredict metapredict/tests/ + +notifications: + email: false + +after_success: + - codecov diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..c9e41ad --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,77 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of 
fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, +body size, disability, ethnicity, gender identity and expression, level of +experience, nationality, personal appearance, race, religion, or sexual +identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +Moreover, project maintainers will strive to offer feedback and advice to +ensure quality and consistency of contributions to the code. 
Contributions +from outside the group of project maintainers are strongly welcomed but the +final decision as to whether commits are merged into the codebase rests with +the team of project maintainers. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an +appointed representative at an online or offline event. Representation of a +project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at 'remenecker@wustl.edu'. The project team will +review and investigate all complaints, and will respond in a way that it deems +appropriate to the circumstances. The project team is obligated to maintain +confidentiality with regard to the reporter of an incident. Further details of +specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 1.4, available at +[http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7d010d3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ + +MIT License + +Copyright (c) 2020 Ryan Emenecker - Holehouse Lab - WUSM + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..06ed1e1 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,6 @@ +include LICENSE +include MANIFEST.in +include versioneer.py + +graft metapredict +global-exclude *.py[cod] __pycache__ *.so \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..9c11224 --- /dev/null +++ b/README.md @@ -0,0 +1,234 @@ +# metapredict: A machine learning based tool for predicting protein disorder. + +**metapredict** uses a bidirectional recurrent neural network trained on the consensus disorder values from 8 disorder predictors from 12 proteomes that were obtained from MobiDB. The creation of metapredict was made possible by IDP-parrot. + +This package will allow for predicting disorder for any amino acid sequence, and predictions can be output as graphs or as raw values. Additionally, this package allows for predicting disorder values for protein sequences from .fasta files either from a Python IDE or from the command line. + +## Installation: + +metapredict is currently only available through GitHub. + +To clone the GitHub repository and gain the ability to modify a local copy of the code, run + + $ git clone https://github.com/allOfTheGoodUsernamesWereTaken/metapredict.git + $ cd metapredict + $ pip install . + +This will install metapredict locally. + +## Usage: + +There are two ways you can use metapredict: +1. Directly from the command line +2. From within Python + +## Command line usage: + +### predicting disorder +The predict-disorder command from the command line takes a .fasta file as input and returns a .csv file containing rows where the first column in the row is the uniprot header and all following columns are predicted disorder values for each residue in the amino acid sequence associated with the fasta header. + + $ predict-disorder <path to .fasta file> <path to output directory> <output file name> + +This will save a .csv file to the location specified by `<path to output directory>`. 
The name given as the final argument (`<output file name>`) will be the name of the output file followed by .csv. The .csv extension is automatically added to the output file name. +**Example** + + $ predict-disorder /Users/thisUser/Desktop/interestingProteins.fasta /Users/thisUser/Desktop/DisorderPredictions/ myCoolPredictions + +**Additional Usage** +**Get raw prediction values** +By default, this will output prediction values that are normalized between 0 and 1. However, some of the raw values from the predictor are slightly less than 0 and slightly greater than 1. The negative values are simply replaced with 0 and the values greater than 1 are replaced with 1 by default. However, if you want raw values, simply add the flag --no_normalization. + +**Example** + + $ predict-disorder /Users/thisUser/Desktop/interestingProteins.fasta /Users/thisUser/Desktop/DisorderPredictions/ myCoolPredictions --no_normalization + + +### graphing disorder +The graph-disorder command from the command line takes a .fasta file as input and returns a .png for every sequence within the .fasta file. The .png files for each sequence will be saved to wherever the user specifies as the output location. Each file will be named as predicted_disorder_ followed by the first 10 characters of the .fasta header (which is typically the unique identifier for the protein). For example, a fasta header of >sp|Q8N6T3|ARFG1_HUMAN will return a file saved as *predicted_disorder_sp|Q8N6T3|.png*. Additionally, the title of each graph is automatically generated and will have the title Predicted Protein Disorder followed by the first 10 characters of the .fasta header. In the previous example, the graph would be titled *Predicted Protein Disorder sp|Q8N6T3|*. + + $ graph-disorder <path to .fasta file> <path to output directory> + +**Example** + + $ graph-disorder /Users/thisUser/Desktop/interestingProteins.fasta /Users/thisUser/Desktop/DisorderGraphsFolder/ + +**WARNING** +This command will generate a .png file for ***every*** sequence in the .fasta file. 
If you have 1,000 sequences in a .fasta file, it will generate **1,000** files. Therefore, I recommend saving the output to a dedicated folder (or at least not your Desktop...). + +**Additional Usage** +**Changing resolution of saved graphs** +By default, the output files have a DPI of 150. However, the user can change the DPI of the output (higher values have greater resolution but take up more space). To change the DPI simply add the flag -D followed by the desired DPI value. + +**Example** + + $ graph-disorder /Users/thisUser/Desktop/interestingProteins.fasta /Users/thisUser/Desktop/DisorderGraphsFolder/ -D 300 + +**Remove non-alphabetic characters from file name** +By default, the output files contain characters that are non-alphabetic (example *predicted_disorder_sp|Q8N6T3|.png*). This is not a problem on some operating systems, while others do not allow files to have names that contain certain characters. To get around this, you can add the --remove_characters flag. This will remove all non-alphabetic characters from the .fasta header when saving the file. The previous example with the header >sp|Q8N6T3|ARFG1_HUMAN would now save as *predicted_disorder_spQ8N6T3AR.png*. + +**Example** + + $ graph-disorder /Users/thisUser/Desktop/interestingProteins.fasta /Users/thisUser/Desktop/DisorderGraphsFolder/ --remove_characters + + +## Using in Python: +In addition to using metapredict from the command line, you can also use it directly in Python. + +First import metapredict - + + import metapredict + from metapredict import meta + +Once metapredict is imported you can work with individual sequences or .fasta files. + +### predicting disorder +The predict_disorder function will return a list of predicted disorder values, one for each residue of the input sequence. The input sequence should be a string. 
+ + meta.predict_disorder("DAPTSQEHTQAEDKERDSKTHPQKKQSPS") + +By default, the values are normalized between 0 and 1, but the user can get the raw prediction values by specifying + + meta.predict_disorder("DAPTSQEHTQAEDKERDSKTHPQKKQSPS", normalized=False) + + +### graphing disorder +The graph_disorder function will show a plot of the predicted disorder values across the input amino acid sequence. + + meta.graph_disorder("DAPTSQEHTQAEDKERDSKTHPQKKQSPS") + +**additional usage** +**Changing title of generated graph** +There are two parameters that the user can change easily for graphing disorder. The first is the name of the title for the generated graph. The name by default is blank and the title of the graph is simply *Predicted Protein Disorder*. However, the name can be specified in order to add the name of the protein after the default title. For example, specifying name = "- PAB1" would result in a title of *Predicted Protein Disorder - PAB1*. + +**Example** + + meta.graph_disorder("DAPPTSQEHTQAEDKERD", name="Name of this nonexistent protein") + +**Changing the resolution of the generated graph** +By default, the output graph has a DPI of 150. However, the user can change the DPI of the generated graph (higher values have greater resolution). To do so, simply specify DPI=# where # is an integer. + +**Example** + + meta.graph_disorder("DAPPTSQEHTQAEDKERD", DPI=300) + + +### Calculating percent disorder +The percent_disorder function will return the percent of disordered residues in a sequence (as a decimal value). + +**Example** + + meta.percent_disorder("DAPPTSQEHTQAEDKERD") + +By default, this uses a cutoff predicted value of equal to or greater than 0.5 for a residue to be considered disordered. + +**additional usage** +**Changing cutoff value** +If you want to be more strict in what you consider to be disordered for calculating percent disorder of an input sequence, you can simply specify the cutoff value. 
+ + +**Example** + + meta.percent_disorder("DAPPTSQEHTQAEDKERD", cutoff=0.8) + +A residue is considered disordered only if its predicted value is greater than or equal to the cutoff, so a higher cutoff value is a stricter criterion when calculating the final percent disorder for the input sequence. + + +### Predicting disorder from a .fasta file +Similar to the command line, you can predict disorder values for the amino acid sequences in a .fasta file. By default, this function will return a dictionary where the keys in the dictionary are the fasta headers and the values are the disorder predictions of the amino acid sequence associated with each fasta header in the original .fasta file. + +**Example** + + meta.predict_disorder_fasta("file path to .fasta file/fileName.fasta") + +An actual filepath would look something like: + + meta.predict_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta") + + +**additional usage** +**Save the output values** +By default the predict_disorder_fasta function will immediately return a dictionary. However, you can also save them to a .csv file by specifying *save=True* and output_path="location you want to save the file to". This will save a file called *predicted_disorder_values.csv* to the location you specify for the output_path. + +**Example** + + meta.predict_disorder_fasta("file path to .fasta file/fileName.fasta", save=True, output_path="file path where the output .csv should be saved") + +An actual filepath would look something like: + + meta.predict_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", save=True, output_path="/Users/thisUser/Desktop/") + +**Specifying the name of the output file** +By default, the generated .csv file will save as *predicted_disorder_values.csv*. However, you can change the default by specifying output_name="my_cool_file". 
+ +**Example** + + meta.predict_disorder_fasta("file path to .fasta file/fileName.fasta", save=True, output_path="file path where the output .csv should be saved", output_name="name of file") + +An actual filepath would look something like: + + meta.predict_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", save=True, output_path="/Users/thisUser/Desktop/", output_name="my_predictions") + +Importantly, you do not need to add the .csv file extension to your file name specified in output_name. However, if you do specify .csv as a file extension, everything should still work. + +**Get raw prediction values** +By default, this will output prediction values that are normalized between 0 and 1. However, some of the raw values from the predictor are slightly less than 0 and slightly greater than 1. The negative values are simply replaced with 0 and the values greater than 1 are replaced with 1 by default. If you want the raw values simply specify normalized=False. + +**Example** + + meta.predict_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", normalized=False) + + +### Generating graphs from a .fasta file +Similar to the command line, you can graph predicted disorder values for the amino acid sequence in a .fasta file. The graph_disorder_fasta function takes a .fasta file as input and returns a .png for every sequence within the .fasta file. The .png files for each sequence will be saved to wherever the user specifies as the output location. Each file will be named as predicted_disorder_ followed by the first 10 characters of the .fasta header (which is typically the unique identifier for the protein). For example, a fasta header of >sp|Q8N6T3|ARFG1_HUMAN will return a file saved as *predicted_disorder_sp|Q8N6T3|.png*. Additionally, the title of each graph is automatically generated and will have the title Predicted Protein Disorder followed by the first 10 characters of the .fasta header. 
In the previous example, the graph would be titled *Predicted Protein Disorder sp|Q8N6T3|*. + +**WARNING** +This function will generate a .png file for ***every*** sequence in the .fasta file. If you have 1,000 sequences in a .fasta file, it will generate **1,000** files. Therefore, I recommend saving the output to a dedicated folder (or at least not your Desktop...). + +**Example** + + meta.graph_disorder_fasta("file path to .fasta file/fileName.fasta", output_path="file path of where to save output graphs") + +An actual filepath would look something like: + + meta.graph_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", output_path="/Users/thisUser/Desktop/folderForGraphs") + + + +**Additional Usage** +**Changing resolution of saved graphs** +By default, the output files have a DPI of 150. However, the user can change the DPI of the output files (higher values have greater resolution but take up more space). To change the DPI, specify DPI=# where # is a whole number. + +**Example** + + meta.graph_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", DPI=300, output_path="/Users/thisUser/Desktop/folderForGraphs") + +**Remove non-alphabetic characters from file name** +By default, the output files contain characters that are non-alphabetic (example *predicted_disorder_sp|Q8N6T3|.png*). This is not a problem on some operating systems, while others do not allow files to have names that contain certain characters. To get around this, you can specify remove_characters=True. This will remove all non-alphabetic characters from the .fasta header when saving the file. The previous example with the header >sp|Q8N6T3|ARFG1_HUMAN would now save as *predicted_disorder_spQ8N6T3AR.png*. 
+ +**Example** + + meta.graph_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", DPI=300, output_path="/Users/thisUser/Desktop/folderForGraphs", remove_characters=True) + +**Viewing generated graphs without saving** +The default behavior for the graph_disorder_fasta function is to save the generated graphs for viewing elsewhere. However, the user can choose to view the generated graphs without saving them. + +**WARNING** +If you choose to view the generated graphs instead of saving them, you can only view one at a time and each must be closed before the next will open. This is not a problem if you only have around 10 sequences in your .fasta file. However, if you have 1,000 sequences in a .fasta file, you will have to close out ***1,000*** graphs. This isn't a problem if you don't mind clicking... a lot. + +**Example** + + meta.graph_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", save=False) + + + +### Copyright + +Copyright (c) 2020, Holehouse Lab - WUSM + +#### Acknowledgements +IDP-Parrot, created by Dan Griffith, was used to generate the network used for metapredict. See https://pypi.org/project/idptools-parrot/ for some very cool machine learning stuff. + +In addition to using Dan Griffith's tool for creating metapredict, the code for brnn_architecture.py and encode_sequence.py was written by Dan (originally for IDP-Parrot). + +Project based on the +[Computational Molecular Science Python Cookiecutter](https://github.com/molssi/cookiecutter-cms) version 1.3. diff --git a/devtools/README.md b/devtools/README.md new file mode 100644 index 0000000..b2966ad --- /dev/null +++ b/devtools/README.md @@ -0,0 +1,60 @@ +# Development, testing, and deployment tools + +This directory contains a collection of tools for running Continuous Integration (CI) tests, +conda installation, and other development tools not directly related to the coding process. 
+ + +## Manifest + +### Continuous Integration + +You should test your code, but do not feel compelled to use these specific programs. You also may not need Unix and +Windows testing if you only plan to deploy on specific platforms. These are just to help you get started + +* `travis-ci`: Linux and OSX based testing through [Travis-CI](https://about.travis-ci.com/) + * `before_install.sh`: Pip/Miniconda pre-package installation script for Travis +* `appveyor`: Windows based testing through [AppVeyor](https://www.appveyor.com/) (there are no files directly related to this) + +### Conda Environment: + +This directory contains the files to setup the Conda environment for testing purposes + +* `conda-envs`: directory containing the YAML file(s) which fully describe Conda Environments, their dependencies, and those dependency provenance's + * `test_env.yaml`: Simple test environment file with base dependencies. Channels are not specified here and therefore respect global Conda configuration + +### Additional Scripts: + +This directory contains OS agnostic helper scripts which don't fall in any of the previous categories +* `scripts` + * `create_conda_env.py`: Helper program for spinning up new conda environments based on a starter file with Python Version and Env. Name command-line options + + +## How to contribute changes +- Clone the repository if you have write access to the main repo, fork the repository if you are a collaborator. 
+- Make a new branch with `git checkout -b {your branch name}` +- Make changes and test your code +- Ensure that the test environment dependencies (`conda-envs`) line up with the build and deploy dependencies (`conda-recipe/meta.yaml`) +- Push the branch to the repo (either the main or your fork) with `git push -u origin {your branch name}` + * Note that `origin` is the default name assigned to the remote, yours may be different +- Make a PR on GitHub with your changes +- We'll review the changes and get your code into the repo after lively discussion! + + +## Checklist for updates +- [ ] Make sure there is an/are issue(s) opened for your specific update +- [ ] Create the PR, referencing the issue +- [ ] Debug the PR as needed until tests pass +- [ ] Tag the final, debugged version + * `git tag -a X.Y.Z [latest pushed commit] && git push --follow-tags` +- [ ] Get the PR merged in + +## Versioneer Auto-version +[Versioneer](https://github.com/warner/python-versioneer) will automatically infer what version +is installed by looking at the `git` tags and how many commits ahead this version is. The format follows +[PEP 440](https://www.python.org/dev/peps/pep-0440/) and has the regular expression of: +```regexp +\d+.\d+.\d+(?\+\d+-[a-z0-9]+) +``` +If the version of this commit is the same as a `git` tag, the installed version is the same as the tag, +e.g. `metapredict-0.1.2`, otherwise it will be appended with `+X` where `X` is the number of commits +ahead from the last tag, and then `-YYYYYY` where the `Y`'s are replaced with the `git` commit hash. 
diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml new file mode 100644 index 0000000..529b25e --- /dev/null +++ b/devtools/conda-envs/test_env.yaml @@ -0,0 +1,15 @@ +name: test +channels: +dependencies: + # Base depends + - python + - pip + + # Testing + - pytest + - pytest-cov + + # Pip-only installs + - pip: + - codecov + diff --git a/devtools/scripts/create_conda_env.py b/devtools/scripts/create_conda_env.py new file mode 100644 index 0000000..b51adc8 --- /dev/null +++ b/devtools/scripts/create_conda_env.py @@ -0,0 +1,95 @@ +import argparse +import os +import re +import glob +import shutil +import subprocess as sp +from tempfile import TemporaryDirectory +from contextlib import contextmanager +# YAML imports +try: + import yaml # PyYAML + loader = yaml.load +except ImportError: + try: + import ruamel_yaml as yaml # Ruamel YAML + except ImportError: + try: + # Load Ruamel YAML from the base conda environment + from importlib import util as import_util + CONDA_BIN = os.path.dirname(os.environ['CONDA_EXE']) + ruamel_yaml_path = glob.glob(os.path.join(CONDA_BIN, '..', + 'lib', 'python*.*', 'site-packages', + 'ruamel_yaml', '__init__.py'))[0] + # Based on importlib example, but only needs to load_module since its the whole package, not just + # a module + spec = import_util.spec_from_file_location('ruamel_yaml', ruamel_yaml_path) + yaml = spec.loader.load_module() + except (KeyError, ImportError, IndexError): + raise ImportError("No YAML parser could be found in this or the conda environment. " + "Could not find PyYAML or Ruamel YAML in the current environment, " + "AND could not find Ruamel YAML in the base conda environment through CONDA_EXE path. 
" + "Environment not created!") + loader = yaml.YAML(typ="safe").load # typ="safe" avoids odd typing on output + + +@contextmanager +def temp_cd(): + """Temporary CD Helper""" + cwd = os.getcwd() + with TemporaryDirectory() as td: + try: + os.chdir(td) + yield + finally: + os.chdir(cwd) + + +# Args +parser = argparse.ArgumentParser(description='Creates a conda environment from file for a given Python version.') +parser.add_argument('-n', '--name', type=str, + help='The name of the created Python environment') +parser.add_argument('-p', '--python', type=str, + help='The version of the created Python environment') +parser.add_argument('conda_file', + help='The file for the created Python environment') + +args = parser.parse_args() + +# Open the base file +with open(args.conda_file, "r") as handle: + yaml_script = loader(handle.read()) + +python_replacement_string = "python {}*".format(args.python) + +try: + for dep_index, dep_value in enumerate(yaml_script['dependencies']): + if re.match('python([ ><=*]+[0-9.*]*)?$', dep_value): # Match explicitly 'python' and its formats + yaml_script['dependencies'].pop(dep_index) + break # Making the assumption there is only one Python entry, also avoids need to enumerate in reverse +except (KeyError, TypeError): + # Case of no dependencies key, or dependencies: None + yaml_script['dependencies'] = [] +finally: + # Ensure the python version is added in. 
Even if the code does not need it, we assume the env does + yaml_script['dependencies'].insert(0, python_replacement_string) + +# Figure out conda path +if "CONDA_EXE" in os.environ: + conda_path = os.environ["CONDA_EXE"] +else: + conda_path = shutil.which("conda") +if conda_path is None: + raise RuntimeError("Could not find a conda binary in CONDA_EXE variable or in executable search path") + +print("CONDA ENV NAME {}".format(args.name)) +print("PYTHON VERSION {}".format(args.python)) +print("CONDA FILE NAME {}".format(args.conda_file)) +print("CONDA PATH {}".format(conda_path)) + +# Write to a temp directory which will always be cleaned up +with temp_cd(): + temp_file_name = "temp_script.yaml" + with open(temp_file_name, 'w') as f: + f.write(yaml.dump(yaml_script)) + sp.call("{} env create -n {} -f {}".format(conda_path, args.name, temp_file_name), shell=True) diff --git a/devtools/travis-ci/before_install.sh b/devtools/travis-ci/before_install.sh new file mode 100755 index 0000000..c2807e2 --- /dev/null +++ b/devtools/travis-ci/before_install.sh @@ -0,0 +1,39 @@ +# Temporarily change directory to $HOME to install software +pushd . +cd $HOME +# Make sure some level of pip is installed +python -m ensurepip + +# Install Miniconda +if [ "$TRAVIS_OS_NAME" == "osx" ]; then + # Make OSX md5 mimic md5sum from linux, alias does not work + md5sum () { + command md5 -r "$@" + } + MINICONDA=Miniconda3-latest-MacOSX-x86_64.sh +else + MINICONDA=Miniconda3-latest-Linux-x86_64.sh +fi +MINICONDA_HOME=$HOME/miniconda +MINICONDA_MD5=$(wget -qO- https://repo.anaconda.com/miniconda/ | grep -A3 $MINICONDA | sed -n '4p' | sed -n 's/ *\(.*\)<\/td> */\1/p') +wget -q https://repo.anaconda.com/miniconda/$MINICONDA +if [[ $MINICONDA_MD5 != $(md5sum $MINICONDA | cut -d ' ' -f 1) ]]; then + echo "Miniconda MD5 mismatch" + exit 1 +fi +bash $MINICONDA -b -p $MINICONDA_HOME + +# Configure miniconda +export PIP_ARGS="-U" +# New to conda >=4.4 +echo ". 
$MINICONDA_HOME/etc/profile.d/conda.sh" >> ~/.bashrc # Source the profile.d file +echo "conda activate" >> ~/.bashrc # Activate conda +source ~/.bashrc # source file to get new commands +#export PATH=$MINICONDA_HOME/bin:$PATH # Old way, should not be needed anymore + +conda config --set always_yes yes +conda install conda conda-build jinja2 anaconda-client +conda update --quiet --all + +# Restore original directory +popd diff --git a/docs/.DS_Store b/docs/.DS_Store new file mode 100644 index 0000000..c3bebe8 Binary files /dev/null and b/docs/.DS_Store differ diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..97f9c61 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = metapredict +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..8cc89eb --- /dev/null +++ b/docs/README.md @@ -0,0 +1,18 @@ +# Compiling metapredict's Documentation + +The docs for this project are built with [Sphinx](http://www.sphinx-doc.org/en/master/). +To compile the docs, first ensure that Sphinx and the ReadTheDocs theme are installed. 
+ + +```bash +conda install sphinx sphinx_rtd_theme +``` + + +Once installed, you can use the `Makefile` in this directory to compile static HTML pages by +```bash +make html +``` + +The compiled docs will be in the `_build` directory and can be viewed by opening `index.html` (which may itself +be inside a directory called `html/` depending on what version of Sphinx is installed). \ No newline at end of file diff --git a/docs/_static/README.md b/docs/_static/README.md new file mode 100644 index 0000000..2f0cf84 --- /dev/null +++ b/docs/_static/README.md @@ -0,0 +1,16 @@ +# Static Doc Directory + +Add any paths that contain custom static files (such as style sheets) here, +relative to the `conf.py` file's directory. +They are copied after the builtin static files, +so a file named "default.css" will overwrite the builtin "default.css". + +The path to this folder is set in the Sphinx `conf.py` file in the line: +```python +html_static_path = ['_static'] +``` + +## Examples of files to add to this directory +* Custom Cascading Style Sheets +* Custom JavaScript code +* Static logo images diff --git a/docs/_templates/README.md b/docs/_templates/README.md new file mode 100644 index 0000000..3f4f804 --- /dev/null +++ b/docs/_templates/README.md @@ -0,0 +1,14 @@ +# Templates Doc Directory + +Add any paths that contain templates here, relative to +the `conf.py` file's directory. +They are copied after the builtin template files, +so a file named "page.html" will overwrite the builtin "page.html". + +The path to this folder is set in the Sphinx `conf.py` file in the line: +```python +templates_path = ['_templates'] +``` + +## Examples of files to add to this directory +* HTML extensions of stock pages like `page.html` or `layout.html` diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..cac8a20 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,35 @@ +Module Documentation +==================== + +brnn_architecture.py +-------------------- + +.. 
automodule:: metapredict.brnn_architecture + :noindex: + :members: + +encode_sequence.py +------------------ + +.. automodule:: metapredict.encode_sequence + :members: + +metapredict.py +--------------------- + +.. automodule:: metapredict.meta + :members: + +meta_graph.py +---------------- + +.. automodule:: metapredict.meta_graph + :members: + +meta_predict_disorder.py +------------------------ + +.. automodule:: metapredict.meta_predict_disorder + :noindex: + :members: + diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..a34c03b --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/stable/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. + +# Incase the project was not installed +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + +import metapredict + + +# -- Project information ----------------------------------------------------- + +project = 'metapredict' +copyright = ("2020, Ryan Emenecker - Holehouse Lab - WUSM. Project structure based on the " + "Computational Molecular Science Python Cookiecutter version 1.3") +author = 'Ryan Emenecker - Holehouse Lab - WUSM' + +# The short X.Y version +version = '' +# The full version, including alpha/beta/rc tags +release = '' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. 
They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autosummary', + 'sphinx.ext.autodoc', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', + 'sphinx.ext.intersphinx', + 'sphinx.ext.extlinks', + 'sphinx.ext.napoleon' +] + +autosummary_generate = True +napoleon_google_docstring = False +napoleon_use_param = False +napoleon_use_ivar = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'default' + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. 
They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'metapredictdoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'metapredict.tex', 'metapredict Documentation', + 'metapredict', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'metapredict', 'metapredict Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'metapredict', 'metapredict Documentation', + author, 'metapredict', 'A protein disorder predictor based on a BRNN (IDP-Parrot) trained on the consensus disorder values from 8 disorder predictors from 12 proteomes.', + 'Miscellaneous'), +] + + +# -- Extension configuration ------------------------------------------------- diff --git a/docs/getting_started.rst b/docs/getting_started.rst new file mode 100644 index 0000000..c610acb --- /dev/null +++ b/docs/getting_started.rst @@ -0,0 +1,35 @@ +Getting Started with metapredict +================================ + +Installation +------------ + +metapredict is available through GitHub or the Python Package Index (PyPI). To install through PyPI, run + +.. code-block:: bash + + $ pip install metapredict + +To clone the GitHub repository and gain the ability to modify a local copy of the code, run + +.. code-block:: bash + + $ git clone https://github.com/idptools/metapredict.git + $ cd metapredict + $ pip install . + +This will install metapredict locally. If you modify the source code in the local repository, be sure to reinstall with pip. + +Testing +------- + +To see if your installation of metapredict is working properly, you can run the unit test included in the package by navigating to the metapredict/tests folder within the installation directory and running: + +.. code-block:: bash + + $ pytest -v + +Example datasets +---------------- + +Example data that can be used with metapredict can be found in the metapredict/data folder on GitHub. The example data set is just a .fasta file containing 5 protein sequences. \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..ba4b4f3 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,28 @@ +.. 
metapredict documentation master file, created by + sphinx-quickstart on Thu Mar 15 13:55:56 2018. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to metapredict's documentation! +========================================================= + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + getting_started + usage/graph-disorder + usage/predict-disorder + usage/using-in-python + + getting_started + api + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..43da3b0 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build +set SPHINXPROJ=metapredict + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/usage/graph-disorder.rst b/docs/usage/graph-disorder.rst new file mode 100644 index 0000000..62e9be2 --- /dev/null +++ b/docs/usage/graph-disorder.rst @@ -0,0 +1,38 @@ +graph-disorder +============== + +**graphing disorder** +``graph-disorder`` is a command from that takes a .fasta file as input and returns a .png for every sequence within the .fasta file. 
The .png files for each sequence will be saved to wherever the user specifies as the output location. Each file will be named as predicted_disorder_ followed by the first 10 characters of the .fasta header (which is typically the unique identifier for the protein). For example, a fasta header of >sp|Q8N6T3|ARFG1_HUMAN will return a file saved as *predicted_disorder_sp|Q8N6T3|.png*. Additionally, the title of each graph is automatically generated and will have the title Predicted Protein Disorder followed by the first 10 characters of the .fasta header. In the previous example, the graph would be titled *Predicted Protein Disorder sp|Q8N6T3|*. + +Once metapredict is installed, the user can run ``graph-disorder`` from the command line: + +.. code-block:: bash + + $ graph-disorder PATH_TO_FASTA_FILE PATH_TO_OUTPUT_FOLDER + +**Example** +.. code-block:: bash + + $ graph-disorder /Users/thisUser/Desktop/interestingProteins.fasta /Users/thisUser/Desktop/DisorderGraphsFolder/ + +**WARNING** +This command will generate a .png file for ***every*** sequence in the .fasta file. If you have 1,000 sequences in a .fasta file, it will generate **1,000** files. Therefore, I recommend saving the output to a dedicated folder (or at least not your Desktop...). + +**Additional Usage** +**Changing resolution of saved graphs** +* ``--DPI`` / ``-D`` +By default, the output files have a DPI of 150. However, the user can change the DPI of the output (higher values have greater resolution but take up more space). To change the DPI, simply add the flag -D followed by the desired DPI value. + +**Example** +.. code-block:: bash + + $ graph-disorder /Users/thisUser/Desktop/interestingProteins.fasta /Users/thisUser/Desktop/DisorderGraphsFolder/ -D 300 + +**Remove non-alphabetic characters from file name** +* ``--remove_characters`` +By default, the output files contain characters that are non-alphabetic (example *predicted_disorder_sp|Q8N6T3|.png*). 
This is not a problem on some operating systems, while others do not allow files to have names that contain certain characters. To get around this, you can add the --remove_characters flag. This will remove all non-alphabetic characters from the .fasta header when saving the file. The previous example with the header >sp|Q8N6T3|ARFG1_HUMAN would now save as *predicted_disorder_spQ8N6T3AR.png*. + +**Example** +.. code-block:: bash + + $ graph-disorder /Users/thisUser/Desktop/interestingProteins.fasta /Users/thisUser/Desktop/DisorderGraphsFolder/ --remove_characters diff --git a/docs/usage/predict-disorder.rst b/docs/usage/predict-disorder.rst new file mode 100644 index 0000000..ee5e742 --- /dev/null +++ b/docs/usage/predict-disorder.rst @@ -0,0 +1,26 @@ +predict-disorder +================ + +**predicting disorder** +``predict-disorder`` is a command that takes a .fasta file as input and returns a .csv file containing rows where the first column in the row is the uniprot header and all following columns are predicted disorder values for each residue in the amino acid sequence associated with the fasta header. +Once metapredict is installed, the user can run ``predict-disorder`` from the command line: + +.. code-block:: bash + + $ predict-disorder PATH_TO_FASTA_FILE PATH_TO_OUTPUT_FOLDER OUTPUT_FILE_NAME + +This will save a .csv file to the location specified by PATH_TO_OUTPUT_FOLDER. The name specified in OUTPUT_FILE_NAME will be the name of the output file followed by .csv. The .csv extension is automatically added to the output file name. +**Example** +.. code-block:: bash + + $ predict-disorder /Users/thisUser/Desktop/interestingProteins.fasta /Users/thisUser/Desktop/DisorderPredictions/ myCoolPredictions + +**Additional Usage** +* ``--no_normalization`` +**Get raw prediction values** +By default, this will output prediction values that are normalized between 0 and 1. However, some of the raw values from the predictor are slightly less than 0 and slightly greater than 1. 
The negative values are simply replaced with 0 and the values greater than 1 are replaced with 1 by default. However, if you want raw values, simply add the flag --no_normalization. + +**Example** +.. code-block:: bash + + $ predict-disorder /Users/thisUser/Desktop/interestingProteins.fasta /Users/thisUser/Desktop/DisorderPredictions/ myCoolPredictions --no_normalization diff --git a/docs/usage/using-in-python.rst b/docs/usage/using-in-python.rst new file mode 100644 index 0000000..64ff5f1 --- /dev/null +++ b/docs/usage/using-in-python.rst @@ -0,0 +1,172 @@ +metapredict in Python +===================== + +In addition to using metapredict from the command line, you can also use it directly in Python. + +First import metapredict - +.. code-block:: python + + import metapredict + from metapredict import meta + +Once metapredict is imported you can work with individual sequences or .fasta files. + +**predicting disorder** +The predict_disorder function will return a list of predicted disorder values for each residue of the input sequence. The input sequence should be a string. + +.. code-block:: python + + meta.predict_disorder("DAPTSQEHTQAEDKERDSKTHPQKKQSPS") + +By default, the values are normalized between 0 and 1, but the user can get the raw prediction values by specifying + +.. code-block:: python + + meta.predict_disorder("DAPTSQEHTQAEDKERDSKTHPQKKQSPS", normalized=False) + + +**graphing disorder** +The graph_disorder function will show a plot of the predicted disorder values across the input amino acid sequence. + +.. code-block:: python + + meta.graph_disorder("DAPTSQEHTQAEDKERDSKTHPQKKQSPS") + +**additional usage** +**Changing title of generated graph** +There are two parameters that the user can change easily for graphing disorder. The first is the name of the title for the generated graph. The name by default is blank and the title of the graph is simply *Predicted Protein Disorder*. 
However, the name can be specified in order to add the name of the protein after the default title. For example, specifying name = "- PAB1" would result in a title of *Predicted Protein Disorder - PAB1*. + +**Example** +.. code-block:: python + + meta.graph_disorder("DAPPTSQEHTQAEDKERD", name="Name of this nonexistent protein") + +**Changing the resolution of the generated graph** +By default, the output graph has a DPI of 150. However, the user can change the DPI of the generated graph (higher values have greater resolution). To do so, simply specify DPI=number, where number is an integer. + +**Example** +.. code-block:: python + + meta.graph_disorder("DAPPTSQEHTQAEDKERD", DPI=300) + + +**Calculating percent disorder** +The percent_disorder function will return the percent of disordered residues in a sequence (as a decimal value). + +**Example** +.. code-block:: python + + meta.percent_disorder("DAPPTSQEHTQAEDKERD") + +By default, this uses a cutoff predicted value of equal to or greater than 0.5 for a residue to be considered disordered. + +**additional usage** +**Changing cutoff value** +If you want to be more strict in what you consider to be disordered for calculating percent disorder of an input sequence, you can simply specify the cutoff value. + + +**Example** +.. code-block:: python + + meta.percent_disorder("DAPPTSQEHTQAEDKERD", cutoff=0.8) + +The higher the cutoff value, the higher the predicted value for any given residue must be for that residue to be considered disordered when calculating the final percent disorder for the input sequence. + + +**Predicting disorder from a .fasta file** +Similar to the command line, you can predict disorder values for the amino acid sequence in a .fasta file. By default, this function will return a dictionary where the keys in the dictionary are the fasta headers and the values are the disorder predictions of the amino acid sequence associated with each fasta header in the original .fasta file. 
+ +**Example** +.. code-block:: python + + meta.predict_disorder_fasta("file path to .fasta file/fileName.fasta") + +An actual filepath would look something like: +.. code-block:: python + + meta.predict_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta") + + +**additional usage** +**Save the output values** +By default the predict_disorder_fasta function will immediately return a dictionary. However, you can also save the predictions to a .csv file by specifying *save=True* and output_path="location you want to save the file to". This will save a file called *predicted_disorder_values.csv* to the location you specify for the output_path. + +**Example** +.. code-block:: python + + meta.predict_disorder_fasta("file path to .fasta file/fileName.fasta", save=True, output_path="file path where the output .csv should be saved") + +An actual filepath would look something like: +.. code-block:: python + + meta.predict_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", save=True, output_path="/Users/thisUser/Desktop/") + +**Specifying the name of the output file** +By default, the generated .csv file will save as *predicted_disorder_values.csv*. However, you can change the default by specifying output_name="my_cool_file". + +**Example** +.. code-block:: python + + meta.predict_disorder_fasta("file path to .fasta file/fileName.fasta", save=True, output_path="file path where the output .csv should be saved", output_name="name of file") + +An actual filepath would look something like: +.. code-block:: python + + meta.predict_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", save=True, output_path="/Users/thisUser/Desktop/", output_name="my_predictions") + +Importantly, you do not need to add the .csv file extension to your file name specified in output_name. However, if you do specify .csv as a file extension, everything should still work. + +**Get raw prediction values** +By default, this will output prediction values that are normalized between 0 and 1. 
However, some of the raw values from the predictor are slightly less than 0 and slightly greater than 1. The negative values are simply replaced with 0 and the values greater than 1 are replaced with 1 by default. If you want the raw values simply specify normalized=False. + +**Example** +.. code-block:: python + + meta.predict_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", normalized=False) + + +**Generating graphs from a .fasta file** +Similar to the command line, you can graph predicted disorder values for the amino acid sequence in a .fasta file. The graph_disorder_fasta function takes a .fasta file as input and returns a .png for every sequence within the .fasta file. The .png files for each sequence will be saved to wherever the user specifies as the output location. Each file will be named as predicted\_disorder\_ followed by the first 10 characters of the .fasta header (which is typically the unique identifier for the protein). For example, a fasta header of >sp|Q8N6T3|ARFG1_HUMAN will return a file saved as *predicted_disorder_sp|Q8N6T3|.png*. Additionally, the title of each graph is automatically generated and will have the title Predicted Protein Disorder followed by the first 10 characters of the .fasta header. In the previous example, the graph would be titled *Predicted Protein Disorder sp|Q8N6T3|*. + +**WARNING** +This command will generate a .png file for ***every*** sequence in the .fasta file. If you have 1,000 sequences in a .fasta file, it will generate **1,000** files. Therefore, I recommend saving the output to a dedicated folder (or at least not your Desktop...). + +**Example** +.. code-block:: python + + meta.graph_disorder_fasta("file path to .fasta file/fileName.fasta", output_path="file path of where to save output graphs") + +An actual filepath would look something like: +.. 
code-block:: python + + meta.graph_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", output_path="/Users/thisUser/Desktop/folderForGraphs") + + + +**Additional Usage** +**Changing resolution of saved graphs** +By default, the output files have a DPI of 150. However, the user can change the DPI of the output files (higher values have greater resolution but take up more space). To change the DPI, specify DPI=# where # is a whole number. + +**Example** +.. code-block:: python + + meta.graph_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", DPI=300, output_path="/Users/thisUser/Desktop/folderForGraphs") + +**Remove non-alphabetic characters from file name** +By default, the output files contain characters that are non-alphabetic (example *predicted_disorder_sp|Q8N6T3|.png*). This is not a problem on some operating systems, while others do not allow files to have names that contain certain characters. To get around this, you can specify remove_characters=True. This will remove all non-alphabetic characters from the .fasta header when saving the file. The previous example with the header >sp|Q8N6T3|ARFG1_HUMAN would now save as *predicted_disorder_spQ8N6T3AR.png*. + +**Example** +.. code-block:: python + + meta.graph_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", DPI=300, output_path="/Users/thisUser/Desktop/folderForGraphs", remove_characters=True) + +**Viewing generated graphs without saving** +The default behavior for the graph_disorder_fasta function is to save the generated graphs for viewing elsewhere. However, the user can choose to view the generated graphs without saving them. + +**WARNING** +If you choose to view the generated graphs instead of saving them, you can only view one at a time and each must be closed before the next will open. This is not a problem if you only have around 10 sequences in your .fasta file. 
However, if you have 1,000 sequences in a .fasta file, you will have to close out ***1,000*** graphs. This isn't a problem if you don't mind clicking... a lot. + +**Example** +.. code-block:: python + + meta.graph_disorder_fasta("/Users/thisUser/Desktop/coolSequences.fasta", save=False) diff --git a/metapredict/.DS_Store b/metapredict/.DS_Store new file mode 100644 index 0000000..8a51b55 Binary files /dev/null and b/metapredict/.DS_Store differ diff --git a/metapredict/ReadMe.txt b/metapredict/ReadMe.txt new file mode 100644 index 0000000..03e40d0 --- /dev/null +++ b/metapredict/ReadMe.txt @@ -0,0 +1 @@ +This predictor uses a network generated by IDP-Parrot. The network is a BRNN machine learning network. It was generated by assessing consensus values of disorder from 12 different proteomes from 8 different predictors from MobiDB. diff --git a/metapredict/__init__.py b/metapredict/__init__.py new file mode 100644 index 0000000..b478044 --- /dev/null +++ b/metapredict/__init__.py @@ -0,0 +1,22 @@ +""" +metapredict +A protein disorder predictor based on a BRNN (IDP-Parrot) trained on the consensus disorder values from 8 disorder predictors from 12 proteomes. +""" + +# Add imports here +from .meta import * +from .backend.brnn_architecture import * +from .backend.encode_sequence import * +from .backend.meta_graph import * +from .backend.meta_predict_disorder import * + + + + + +# Handle versioneer +from ._version import get_versions +versions = get_versions() +__version__ = versions['version'] +__git_revision__ = versions['full-revisionid'] +del get_versions, versions diff --git a/metapredict/_version.py b/metapredict/_version.py new file mode 100644 index 0000000..3953f47 --- /dev/null +++ b/metapredict/_version.py @@ -0,0 +1,520 @@ + +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). 
Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). + git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "" + cfg.parentdir_prefix = "None" + cfg.versionfile_source = "metapredict/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert 
isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. 
When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. 
We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} diff --git a/metapredict/backend/brnn_architecture.py b/metapredict/backend/brnn_architecture.py new file mode 100644 index 0000000..14ac920 --- /dev/null +++ b/metapredict/backend/brnn_architecture.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python +""" +Code from Dan Griffith's IDP-Parrot tool from the Holehouse Lab. +All credit for this code should go to Dan. +""" + +import torch +import torch.nn as nn + +class BRNN_MtM(nn.Module): + """A PyTorch many-to-many bidirectional recurrent neural network + + A class containing the PyTorch implementation of a BRNN. The network consists + of repeating LSTM units in the hidden layers that propogate sequence information + in both the foward and reverse directions. A final fully connected layer + aggregates the deepest hidden layers of both directions and produces the + outputs. + + "Many-to-many" refers to the fact that the network will produce outputs + corresponding to every item of the input sequence. For example, an input + sequence of length 10 will produce 10 sequential outputs. + + Attributes + ---------- + device : str + String describing where the network is physically stored on the computer. + Should be either 'cpu' or 'cuda' (GPU). 
+ hidden_size : int + Size of hidden vectors in the network + num_layers : int + Number of hidden layers (for each direction) in the network + num_classes : int + Number of classes for the machine learning task. If it is a regression + problem, `num_classes` should be 1. If it is a classification problem, + it should be the number of classes. + lstm : PyTorch LSTM object + The bidirectional LSTM layer(s) of the recurrent neural network. + fc : PyTorch Linear object + The fully connected linear layer of the recurrent neural network. Across + the length of the input sequence, this layer aggregates the output of the + LSTM nodes from the deepest forward layer and deepest reverse layer and + returns the output for that residue in the sequence. + + Methods + ------- + forward(x) + Propogate input sequences through the network to produce outputs + """ + + def __init__(self, input_size, hidden_size, num_layers, num_classes, device): + """ + Parameters + ---------- + input_size : int + Length of the input vectors at each timestep + hidden_size : int + Size of hidden vectors in the network + num_layers : int + Number of hidden layers (for each direction) in the network + num_classes : int + Number of classes for the machine learning task. If it is a regression + problem, `num_classes` should be 1. If it is a classification problem, + it should be the number of classes. + device : str + String describing where the network is physically stored on the computer. + Should be either 'cpu' or 'cuda' (GPU). 
+ """ + + super(BRNN_MtM, self).__init__() + self.device = device + self.hidden_size = hidden_size + self.num_layers = num_layers + self.num_classes = num_classes + self.lstm = nn.LSTM(input_size, hidden_size, num_layers, + batch_first=True, bidirectional=True) + self.fc = nn.Linear(in_features=hidden_size*2, # *2 for bidirection + out_features=num_classes) + + def forward(self, x): + """Propogate input sequences through the network to produce outputs + + Parameters + ---------- + x : 3-dimensional PyTorch IntTensor + Input sequence to the network. Should be in the format: + [batch_dim X sequence_length X input_size] + + Returns + ------- + 3-dimensional PyTorch FloatTensor + Output after propogating the sequences through the network. Will + be in the format: + [batch_dim X sequence_length X num_classes] + """ + + # Set initial states + # h0 and c0 dimensions: [num_layers*2 X batch_size X hidden_size] + h0 = torch.zeros(self.num_layers*2, # *2 for bidirection + x.size(0), self.hidden_size).to(self.device) + c0 = torch.zeros(self.num_layers*2, + x.size(0), self.hidden_size).to(self.device) + + # Forward propagate LSTM + # out: tensor of shape: [batch_size, seq_length, hidden_size*2] + out, (h_n, c_n) = self.lstm(x, (h0, c0)) + + # Decode the hidden state for each time step + fc_out = self.fc(out) + return fc_out diff --git a/metapredict/backend/encode_sequence.py b/metapredict/backend/encode_sequence.py new file mode 100644 index 0000000..6702d61 --- /dev/null +++ b/metapredict/backend/encode_sequence.py @@ -0,0 +1,78 @@ +""" +Code from Dan Griffith's parrot from the Holehouse lab. +All credit for this code should go to Dan. + +File containing functions for encoding a string of amino acids into a numeric vector. +............................................................................. +parrot was developed by the Holehouse lab + Original release ---- 2020 + +Question/comments/concerns? 
Raise an issue on github: +https://github.com/idptools/parrot + +Licensed under the MIT license. +""" + +import sys +import os + +import numpy as np +import torch + +ONE_HOT = {'A':0, 'C':1, 'D':2, 'E':3, 'F':4, 'G':5, 'H':6, 'I':7, 'K':8, 'L':9, + 'M':10,'N':11,'P':12,'Q':13,'R':14,'S':15,'T':16,'V':17,'W':18,'Y':19} + +def one_hot(seq): + """Convert an amino acid sequence to a PyTorch tensor of one-hot vectors + + Each amino acid is represented by a length 20 vector with a single 1 and + 19 0's Inputing a sequence with a nono-canonical amino acid letter will + cause the program to exit. + + E.g. Glutamic acid (E) is encoded: [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + + Parameters + ---------- + seq : str + An uppercase sequence of amino acids (single letter code) + + Returns + ------- + torch.IntTensor + a PyTorch tensor representing the encoded sequence + """ + + l = len(seq) + m = np.zeros((l, 20)) + try: + for i in range(l): + m[i, ONE_HOT[seq[i]]] = 1 + except: + error_str = 'Invalid amino acid detected: ' + seq[i] + raise ValueError(error_str) + return torch.from_numpy(m) + +def rev_one_hot(seq_vectors): + """Decode a list of one-hot sequence vectors into amino acid sequences + + Parameters + ---------- + seq_vectors : list of numpy arrays + A list containing sequence vectors + + Returns + ------- + list + Strings of amino acid sequences + """ + + REV_ONE_HOT = 'ACDEFGHIKLMNPQRSTVWY' + sequences = [] + + for seq_vector in seq_vectors: + seq = [] + for residue in seq_vector: + seq.append(REV_ONE_HOT[np.argmax(residue)]) + sequences.append("".join(seq)) + + return sequences \ No newline at end of file diff --git a/metapredict/backend/metaDisorder.pt b/metapredict/backend/metaDisorder.pt new file mode 100644 index 0000000..13cd081 Binary files /dev/null and b/metapredict/backend/metaDisorder.pt differ diff --git a/metapredict/backend/meta_graph.py b/metapredict/backend/meta_graph.py new file mode 100644 index 0000000..3342e65 --- /dev/null +++ 
b/metapredict/backend/meta_graph.py @@ -0,0 +1,79 @@ +#code for graphing IDRs. +#Import stuff +import numpy as np +import matplotlib +import matplotlib.pyplot as plt + +from metapredict.backend import meta_predict_disorder +from metapredict.backend.meta_predict_disorder import meta_predict as predict + +def graph(sequence, name = " ", line_color = "blue", DPI = 150, save_fig = False, output_file = "./predicted_disorder.png"): + """ + Function for graphing predicted disorder. By default, this function will show a graph. + However, if saveFig = True, then it will save the figure (by default) to the location + where the script is (which isn't ideal). However, you can specify outputFile as the + file path followed by the name of the saved file with the proper extension (.png by default). + This is the backend for the meta.py graphing functions. + + Arguments + --------- + sequence - Input amino acid sequence (as string) to be predicted. + + name (optional) - setting the value of name will change the title of the + graph. By default, the title is "Predicted Protein Disorder", so if you + for example set name = "- PAB1", the title on the graph will be "Predicted + Protein Disorder - PAB1". + + line_color (optional) - set the color of the predicted disorder values line. Default is blue. + + DPI (optional) - default value is 150. Increasing this value will increase + the resolution of the output graph. Decreasing this value will decrease + the resolution. + + save_fig (optional) - by default will not save the figure and will instead show it immediately. + Set save_fig = True in order to save the figure. + ***important*** + If you set save_fig = True, you must specify an output file! + + output_file - the path to where the output graph should be saved followed by the file name. + For example, on MacOS: + output_file="Users/thisUser/Desktop/folder_of_cool_graphs/my_cool_protein.png" + + This code is meant to be backend code for meta.py. 
+ """ + #set yValues equal to the predicted disorder from the sequence (normalized) + yValues = predict(sequence) + #set title of figure to Predicted Protein Disorder followed by the name if given + Title = "Predicted Protein Disorder {}".format(name) + #if a name is set, the figure will hold that name as the identifier + fig = plt.figure(num = name, figsize = [8, 3], dpi = DPI, edgecolor = 'black') + axes = fig.add_axes([0.15, 0.15, 0.75, 0.75]) + axes.set_title(Title) + axes.set_xlabel("Position Across Protein Sequence") + axes.set_ylabel("Ordered -----------> Disordered") + #make x values for each residue with predicted disorder + xValues = np.arange(0, len(yValues)) + #graph the disorder values of each residue at each point along the x-axis + axes.plot(xValues, yValues, color = line_color, linewidth = '1.6') + #set x limit as the number of residues + axes.set_xlim(0, len(xValues)) + #set y limit as 0-1 since the predictor data is normalized from 0 to 1. + axes.set_ylim(-0.003, 1.003) + + #graph "disorder cutoff line at 0.5" + disorderValues = [] + for i in range(0, len(yValues)): + disorderValues.append(0.5) + axes.plot(xValues, disorderValues, color = "black", linewidth = "1.25") + #add dashed lines at 0.2 intervals + cutoffLines = [0.2, 0.4, 0.6, 0.8] + for i in cutoffLines: + tempList = [] + for j in range(0, len(yValues)): + tempList.append(i) + axes.plot(xValues, tempList, color = "black", linestyle = "dashed", linewidth = "0.75") + if save_fig == False: + plt.show() + else: + plt.savefig(fname = output_file, dpi = DPI) + plt.close() diff --git a/metapredict/backend/meta_predict_disorder.py b/metapredict/backend/meta_predict_disorder.py new file mode 100644 index 0000000..1d20c2a --- /dev/null +++ b/metapredict/backend/meta_predict_disorder.py @@ -0,0 +1,122 @@ +""" +Backend of the IDR machine learning predictor. Based partly +on code from Dan Griffith's IDP-Parrot from the Holehouse lab +(specifically the test_unlabeled_data function in train_network.py). 
+""" + +#import packages for predictor +import sys +import os + +import numpy as np +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader + + +#import modules that predictor depends on +from metapredict.backend import encode_sequence +from metapredict.backend import brnn_architecture + + +#set path for location of predictor. Using this in case I want to update the predictor or +#eventually make multiple predictors. +PATH = os.path.dirname(os.path.realpath(__file__)) + +#Setting predictor equal to location of weighted values. +predictor = "{}/metaDisorder.pt".format(PATH) + +################################################################################################## +#hyperparameters used by when metapredict was trained. Manually setting them here for clarity. +################################################################################################## + +device = 'cpu' +hidden_size = 5 +num_layers = 1 +dtype = 'residues' +num_classes = 1 +encoding_scheme = 'onehot' +input_size = 20 +problem_type = 'regression' + +#set location of saved_weights for load_state_dict +saved_weights = predictor + +############################################################################### +# Initialize network architecture using previously defined hyperparameters +############################################################################### +brnn_network = brnn_architecture.BRNN_MtM(input_size, hidden_size, num_layers, num_classes, device).to(device) +brnn_network.load_state_dict(torch.load(saved_weights, map_location=torch.device(device))) +############################################################################### + + +def meta_predict(sequence, normalized=True, network=brnn_network, device=device, encoding_scheme=encoding_scheme): + """ + The actual executing function for predicting the disorder of a sequence using metapredict. + Returns a list containing predicted disorder values for the input sequence. 
+ + Arguments + --------- + sequence - the amino acid sequence to be predicted + + normalized (optional) - by default, negative values are set to be equal to 0 and values greater + than 1 are set to be equal to 1. User can set normalized=False to get raw prediction values. + + network - the network used by the predictor. See brnn_architecture BRNN_MtM for more info. + + device - String describing where the network is physically stored on the computer. + Should be either 'cpu' or 'cuda' (GPU). + + encoding_scheme - encoding scheme used when metapredict was trained. The encoding scheme was onehot. + """ + #set seq_vector equal to converted amino acid sequence that is a PyTorch tensor of one-hot vectors + seq_vector = encode_sequence.one_hot(sequence) + seq_vector=seq_vector.view(1, len(seq_vector), -1) + + #get output values from the seq_vector based on the network (brnn_network) + outputs = network(seq_vector.float()).detach().numpy()[0] + + #make empty list to add in outputs + output_values = [] + #for the values 'i' in outputs + for i in outputs: + #append each value (which is the predicted disorder value) to output values as a float. + #round each value to six digits. + output_values.append(round(float(i),6)) + + #if normalized=True (defualt) + if normalized == True: + #initialize empty list to populate normalized values + normalized_IDR_values = [] + #determine the lowest value in the output_values list + min_IDR = min(output_values) + #if the lowset value is less than 0, normalize the list by replacing negative values with 0. + if min_IDR < 0: + for j in range(0, len(output_values)): + cur_value = output_values[j] + if cur_value < 0: + normalized_IDR_values.append(0) + else: + normalized_IDR_values.append(round(cur_value, 6)) + #overwrite output_values with normalized_IDR_values (which are now all non-negative). 
+ output_values = normalized_IDR_values + #overwrite normalized_IDR_values with an empty list + normalized_IDR_values = [] + #determine the greatest value in the ouputValues list + max_IDR = max(output_values) + #if the greatest value is greater than 1, replace values greater than 1 with 1. + if max_IDR > 1: + for k in range (0, len(output_values)): + cur_value = output_values[k] + if cur_value > 1: + normalized_IDR_values.append(1) + else: + normalized_IDR_values.append(round(cur_value, 6)) + #overwrite output_values with normalized_IDR_values (which are now all less than or equal to 1). + output_values = normalized_IDR_values + #return output_values + return output_values + #if normalized=False, just return the output_values. + else: + return output_values + diff --git a/metapredict/data/README.md b/metapredict/data/README.md new file mode 100644 index 0000000..3408874 --- /dev/null +++ b/metapredict/data/README.md @@ -0,0 +1,3 @@ +# Sample Package Data + +This directory contains sample data to test out functions in metapredict. Notably, there isn't a lot of sample data you will need to use metapredict. Nonetheless, I included a .fasta file called test_data.fasta. 
\ No newline at end of file diff --git a/metapredict/data/test_data.fasta b/metapredict/data/test_data.fasta new file mode 100644 index 0000000..f9772e3 --- /dev/null +++ b/metapredict/data/test_data.fasta @@ -0,0 +1,50 @@ +>sp|Q8N6T3|ARFG1_HUMAN ADP-ribosylation factor GTPase-activating protein 1 OS=Homo sapiens OX=9606 GN=ARFGAP1 PE=1 SV=2 +MASPRTRKVLKEVRVQDENNVCFECGAFNPQWVSVTYGIWICLECSGRHRGLGVHLSFVR +SVTMDKWKDIELEKMKAGGNAKFREFLESQEDYDPCWSLQEKYNSRAAALFRDKVVALAE +GREWSLESSPAQNWTPPQPRTLPSMVHRVSGQPQSVTASSDKAFEDWLNDDLGSYQGAQG +NRYVGFGNTPPPQKKEDDFLNNAMSSLYSGWSSFTTGASRFASAAKEGATKFGSQASQKA +SELGHSLNENVLKPAQEKVKEGKIFDDVSSGVSQLASKVQGVGSKGWRDVTTFFSGKAEG +PLDSPSEGHSYQNSGLDHFQNSNIDQSFWETFGSAEPTKTRKSPSSDSWTCADTSTERRS +SDSWEVWGSASTNRNSNSDGGEGGEGTKKAVPPAVPTDDGWDNQNW +>sp|O43150|ASAP2_HUMAN Arf-GAP with SH3 domain ANK repeat and PH domain-containing protein 2 OS=Homo sapiens OX=9606 GN=ASAP2 PE=1 SV=3 +MPDQISVSEFVAETHEDYKAPTASSFTTRTAQCRNTVAAIEEALDVDRMVLYKMKKSVKA +INSSGLAHVENEEQYTQALEKFGGNCVCRDDPDLGSAFLKFSVFTKELTALFKNLIQNMN +NIISFPLDSLLKGDLKGVKGDLKKPFDKAWKDYETKITKIEKEKKEHAKLHGMIRTEISG +AEIAEEMEKERRFFQLQMCEYLLKVNEIKIKKGVDLLQNLIKYFHAQCNFFQDGLKAVES +LKPSIETLSTDLHTIKQAQDEERRQLIQLRDILKSALQVEQKEDSQIRQSTAYSLHQPQG +NKEHGTERNGSLYKKSDGIRKVWQKRKCSVKNGFLTISHGTANRPPAKLNLLTCQVKTNP +EEKKCFDLISHDRTYHFQAEDEQECQIWMSVLQNSKEEALNNAFKGDDNTGENNIVQELT +KEIISEVQRMTGNDVCCDCGAPDPTWLSTNLGILTCIECSGIHRELGVHYSRMQSLTLDV +LGTSELLLAKNIGNAGFNEIMECCLPAEDSVKPNPGSDMNARKDYITAKYIERRYARKKH +ADNAAKLHSLCEAVKTRDIFGLLQAYADGVDLTEKIPLANGHEPDETALHLAVRSVDRTS +LHIVDFLVQNSGNLDKQTGKGSTALHYCCLTDNAECLKLLLRGKASIEIANESGETPLDI +AKRLKHEHCEELLTQALSGRFNSHVHVEYEWRLLHEDLDESDDDMDEKLQPSPNRREDRP +ISFYQLGSNQLQSNAVSLARDAANLAKEKQRAFMPSILQNETYGALLSGSPPPAQPAAPS +TTSAPPLPPRNVGKVQTASSANTLWKTNSVSVDGGSRQRSSSDPPAVHPPLPPLRVTSTN +PLTPTPPPPVAKTPSVMEALSQPSKPAPPGISQIRPPPLPPQPPSRLPQKKPAPGADKST +PLTNKGQPRGPVDLSATEALGPLSNAMVLQPPAPMPRKSQATKLKPKRVKALYNCVADNP +DELTFSEGDVIIVDGEEDQEWWIGHIDGDPGRKGAFPVSFVHFIAD +>sp|Q94JM3|ARFB_ARATH Auxin response factor 2 
OS=Arabidopsis thaliana OX=3702 GN=ARF2 PE=1 SV=2 +MASSEVSMKGNRGGDNFSSSGFSDPKETRNVSVAGEGQKSNSTRSAAAERALDPEAALYR +ELWHACAGPLVTVPRQDDRVFYFPQGHIEQVEASTNQAAEQQMPLYDLPSKLLCRVINVD +LKAEADTDEVYAQITLLPEANQDENAIEKEAPLPPPPRFQVHSFCKTLTASDTSTHGGFS +VLRRHADECLPPLDMSRQPPTQELVAKDLHANEWRFRHIFRGQPRRHLLQSGWSVFVSSK +RLVAGDAFIFLRGENGELRVGVRRAMRQQGNVPSSVISSHSMHLGVLATAWHAISTGTMF +TVYYKPRTSPSEFIVPFDQYMESVKNNYSIGMRFKMRFEGEEAPEQRFTGTIVGIEESDP +TRWPKSKWRSLKVRWDETSSIPRPDRVSPWKVEPALAPPALSPVPMPRPKRPRSNIAPSS +PDSSMLTREGTTKANMDPLPASGLSRVLQGQEYSTLRTKHTESVECDAPENSVVWQSSAD +DDKVDVVSGSRRYGSENWMSSARHEPTYTDLLSGFGTNIDPSHGQRIPFYDHSSSPSMPA +KRILSDSEGKFDYLANQWQMIHSGLSLKLHESPKVPAATDASLQGRCNVKYSEYPVLNGL +STENAGGNWPIRPRALNYYEEVVNAQAQAQAREQVTKQPFTIQEETAKSREGNCRLFGIP +LTNNMNGTDSTMSQRNNLNDAAGLTQIASPKVQDLSDQSKGSKSTNDHREQGRPFQTNNP +HPKDAQTKTNSSRSCTKVHKQGIALGRSVDLSKFQNYEELVAELDRLFEFNGELMAPKKD +WLIVYTDEENDMMLVGDDPWQEFCCMVRKIFIYTKEEVRKMNPGTLSCRSEEEAVVGEGS +DAKDAKSASNPSLSSAGNS +>sp|Q64364|ARF_MOUSE Tumor suppressor ARF OS=Mus musculus OX=10090 GN=Cdkn2a PE=1 SV=1 +MGRRFLVTVRIQRAGRPLQERVFLVKFVRSRRPRTASCALAFVNMLLRLERILRRGPHRN +PGPGDDDGQRSRSSSSAQLRCRFELRGPHYLLPPGARRSAGRLPGHAGGAARVRGSAGCA +RCLGSPAARLGPRAGTSRHRAIFAFRWVLFVFRWVVFVYRWERRPDRRA +>sp|P40946|ARF6_DROME ADP-ribosylation factor 6 OS=Drosophila melanogaster OX=7227 GN=Arf51F PE=1 SV=3 +MGKLLSKIFGNKEMRILMLGLDAAGKTTILYKLKLGQSVTTIPTVGFNVETVTYKNVKFN +VWDVGGQDKIRPLWRHYYTGTQGLIFVVDCADRDRIDEARTELHRIINDREMRDAIILIF +ANKQDLPDAMKPHEIQEKLGLTRIRDRNWYVQPSCATSGDGLSEGLIWLTSNHKL \ No newline at end of file diff --git a/metapredict/meta.py b/metapredict/meta.py new file mode 100644 index 0000000..4a388e9 --- /dev/null +++ b/metapredict/meta.py @@ -0,0 +1,365 @@ +""" +meta.py +A protein disorder predictor based on a BRNN (IDP-Parrot) trained +on the consensus disorder values from 8 disorder predictors from +12 proteomes (see https://mobidb.bio.unipd.it) as of 08/2020. 
+ +Handles the primary functions +""" +import os +import sys + +#import protfasta to read .fasta files +import protfasta +import pandas as pd + +#import stuff for IDR predictor from backend +from metapredict.backend import meta_predict_disorder +from metapredict.backend.meta_predict_disorder import meta_predict + +#import stuff for graphing from backend +from metapredict.backend import meta_graph +from metapredict.backend.meta_graph import graph + + + +def predict_disorder(sequence, normalized=True): + """ + Function to return disorder of a single input sequence. Returns the + predicted values as a list. + + Arguments: + ---------- + sequence - Input amino acid sequence (as string) to be predicted. + + normalized (optional) - by default predictor returns values normalized between 0 and 1. + This is because the BRNN will output some negative values and some values greater + than 1. Setting normalized=False will result in returning the raw predicted values. + """ + #make all residues upper case + sequence=sequence.upper() + #return predicted values of disorder for sequence + return meta_predict(sequence, normalized=normalized) + + +def graph_disorder(sequence, name = " ", DPI=150): + """ + Function to plot the disorder of an input sequece. Displays immediately. + + Arguments: + ---------- + sequence - Input amino acid sequence (as string) to be predicted. + + name (optional) - setting the value of name will change the title of the + graph. By default, the title is "Predicted Protein Disorder", so if you + for example set name = "- PAB1", the title on the graph will be "Predicted + Protein Disorder - PAB1". + + DPI (optional) - default value is 150. Increasing this value will increase + the resolution of the output graph. Decreasing this value will decrease + the resolution. 
+ """ + #make all residues upper case + sequence=sequence.upper() + #graph sequence + graph(sequence = sequence, name = name, DPI=DPI) + + + +def percent_disorder(sequence, cutoff=0.5): + """ + function to return the percent disorder for any given protein. + By default, uses 0.5 as a cutoff (values greater than or equal + to 0.5 will be considred disordered). + + Arguments: + ---------- + sequence - Input amino acid sequence (as string) to be predicted. + + cutoff (optional) the cutoff for the predicted value of an individual + residue to be considered disordered. By default this value is 0.5. Increasing + this value will make the cutoff more "strict" in that a higher predicted + vallue will be required for a residue to be considered disordered. + + Returns the percent disorder for the input sequence as a decimal. + 1.0 = 100% disordered, + 0.9 = 90% disordered, + and so on. + """ + #make all residues upper case + sequence=sequence.upper() + #set dis equal to the predicted disorder for the input sequence + dis = meta_predict(sequence) + #set arbitrarily chosen variable n to equal 0 + n = 0 + #for predicted disorder values in dis: + for i in dis: + #if predicted value is greater than cutoff, add one to n + if i >= cutoff: + n += 1 + #else continue through the values. + else: + continue + """ + percent disorder is equal to n (number of residues with predicted + value >= cutoff) divided by the total number of residues in the + input sequence. + """ + percent_disordered = (n / len(dis)) + #return percent_disordered + return(percent_disordered) + + + +#./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\ +#./\./\./\./\./\./\./\./\./\./\./\./\.FASTA STUFF./\./\./\./\./\./\./\./\./\./\./\./\ +#./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\./\ + +#Various functions for working with fasta files to make everyones life easier. 
+ + +def predict_disorder_fasta(filepath, save=False, output_path = "", output_name = "predicted_disorder_values", normalized=True): + """ + Function to read in a .fasta file from a specified filepath. + Returns a dictionary of disorder values where the key is the + fasta header and the values are the predicted disorder values. + + Arguments: + ---------- + filepath - the path to where the .fasta file is located. The filepath + should end in the file name. For example (on MacOS): + filepath="/Users/thisUser/Desktop/folder_of_seqs/interesting_proteins.fasta" + + save (optional) - by default, a dictionary of predicted values is + returned immediately. However, you can specify save=True in order to + save the output as a .csv file. + ***important*** + If you specify save=True, then output_path is a required argument! + + output_path - the path to where the output .csv file should be saved. + For example, on MacOS: + output_path="Users/thisUser/Desktop/folder_of_cool_results/" + ***important*** + You cannot specify the output file name here! By default, the file name will + be predicted_disorder_values.csv. However, you can change the output file name by + specifying output_name (see below). + + output_name (optional) - by default is set to equal "predicted_disorder_values". However, + the user can specify output_name="my_cool_output_name" in order to specify the + name of the output file. + ***important*** + Do not add a file extension to output_name. The .csv file extension is added + automatically. If you do by accident, it will (hopefully) be removed. + + normalized - decide whether the values are normalized from 0 to 1. By default, the + values are normalized. However, occasionally the raw values from the predictor can be + negative or greater than 1. If you want these values, set normalized=False. 
+ """ + + #set variable protfasta_seqs equal to output from protfasta (with correction of invalid sequence values) + """ + Importantly, by default this function corrects invalid residue + values using protfasta.read_fasta() because the disorder predictor + cannot have non-amino acid values as an input. + """ + # Test to see if the data_file exists + test_data_file = os.path.abspath(filepath) + if not os.path.isfile(test_data_file): + raise FileNotFoundError('Datafile does not exist.') + + protfasta_seqs = protfasta.read_fasta(filepath, invalid_sequence_action = "convert", return_list = True) + #initialize empty dictionary to be populated with the fasta headers (key) + #and the predicted disorder values (value) + disorder_dict = {} + #for the sequences in the protfasta_seqs list: + for seqs in protfasta_seqs: + #set cur_header equal to the fasta header + cur_header = seqs[0] + #set cur_seq equal to the sequence associated with the fasta header + cur_seq = seqs[1] + #make all values for cur_seq uppercase so they work with predictor + cur_seq = cur_seq.upper() + #set cur_disorder equal to the predicted values for cur_seq + cur_disorder = meta_predict(cur_seq, normalized=normalized) + disorder_dict[cur_header] = cur_disorder + + #if save=False (default), immediately return the dictionary disorder_dict + if save == False: + return disorder_dict + + #if save=True, save the disorder_dict to the specified output_path + else: + # Test to see that the output path is valid + test_output_path = os.path.abspath(output_path) + if not os.path.exists(test_output_path): + raise FileNotFoundError('Output path is not valid.') + + #Check if there is a .csv in output_name (which there shouldn't be) + #set try_output_name = output_name + try_output_name = output_name + #if there is .csv in try_output_name + if ".csv" in try_output_name: + #split try_output_name and set output_file_name equal to everything before .csv + output_file_name = try_output_name.split(".csv")[0] + else: + #if 
.csv is not in try_output_name, set output_file_name equal to output_name + output_file_name = output_name + + """ + Make sure output_path ends in / (mac) or \\ (windows). + This is necessary because earlier when testing the output path using OS, + a valid output path can still not end in a / or \\. When the path does not + end in a / or \\, then the file does not get saved correctly but pandas + doesn't raise an error. I'm not sure this is totally necessary, but + I made this mistake a few times while testing this stuff so I figured it might + be a nice feature. + """ + #Set final_output_path_character = last character in output_path (NOTE(review): output_path[-1] raises IndexError when output_path is the empty-string default) + final_output_path_character = output_path[-1] + #if / in output_path (user is using MacOS or linux) + if "/" in output_path: + #if the final character in the path does not equal a / + if final_output_path_character != "/": + #add in a / to complete file path + output_path += "/" + #if \ in output_path (user is using Windows) + elif "\\" in output_path: + #if the last character does not equal \ + if final_output_path_character != "\\": + #add in a \ to output path + output_path += "\\" + + #make pandas dataframe from disorder_dict + df = pd.DataFrame.from_dict(disorder_dict, orient="index") + #set variable final_output equal to the output_path followed by output_file_name + #(by default output_name is equal to predicted_disorder_values). + final_output = "{}{}.csv".format(output_path, output_file_name) + #export the dataframe to a csv file at the location final_output + df.to_csv(final_output, header=False) + + + +def graph_disorder_fasta(filepath, DPI=150, save=True, output_path="", remove_characters=False): + """ + Function to make graphs of predicted disorder from the sequences + in a specified .fasta file. By default will save the generated + graphs to the location specified by output_path. 
+ + Arguments: + ---------- + filepath - the path to where the .fasta file is located. 
The filepath + should end in the file name. For example (on MacOS): + filepath="/Users/thisUser/Desktop/folder_of_seqs/interesting_proteins.fasta" + + DPI (optional) - default value is 150. Increasing this value will increase + the resolution of the output graph. Decreasing this value will decrease + the resolution. + + save (optional) - by default, the generated graphs are saved. This can be set + to False, which will result in the graphs being sequentially shown. + ***important*** + It is inadvisable to set save=False if you are inputting a large .fasta file! This + is because each graph must be closed individually before the next will appear. Therefore, + you will spend a bunch of time closing each graph. + + output_path - the path to where the output graphs should be saved. + For example, on MacOS: + output_path="Users/thisUser/Desktop/folder_of_cool_results/" + ***important*** + You cannot specify the output file name here! By default, the file name will + be predicted_disorder followed by the first 10 characters from the .fasta header (which ideally + should be a unique identifier that can be input into uniprot) followed by .png. + For example, predicted_disorder_sp|O43150|.png + + remove_characters (optional) - allows common non-alphabetic characters (>, |, =, - and spaces) to be removed + from the output file names. If user is on an OS that doesn't allow specific characters + to be used in file names, this should be a way to get around that. + """ + + # Test to see if the data_file exists + test_data_file = os.path.abspath(filepath) + if not os.path.isfile(test_data_file): + raise FileNotFoundError('Datafile does not exist.') + + #use protfasta to read in fasta file + """ + Importantly, by default this function corrects invalid residue + values using protfasta.read_fasta() because the disorder predictor + cannot have non-amino acid values as an input. 
+ """ + sequences = protfastaSeqs = protfasta.read_fasta(filepath, invalid_sequence_action = "convert") + + #for key, value in sequences.items (which are the items in the dict returned by protfasta) + for i, v in sequences.items(): + #set title of graph equal to the first 10 characters of the fasta header after > + title = i[0:10] + #if remove_characters is False: + if remove_characters == False: + #set name equal to the first 10 characters of the fasta header after > + name = (i[0:10]) + else: + #initializing empty string + empty_name = "" + #removing the characters >, |, =, - and space from the fasta header + for j in i: + if j==">": + continue + elif j=="|": + continue + elif j=="=": + continue + elif j=="-": + continue + elif j==" ": + continue + else: + empty_name += j + #set final name equal to the first 10 characters from the fasta header with + #the various characters removed. + name = empty_name[0:10] + + #set the sequence equal to the amino acid sequence associated with + #the fasta header and make all amino acids uppercase. + sequence = v.upper() + #If save is True (default) + if save == True: + # Test to see that the output path is valid + test_output_path = os.path.abspath(output_path) + if not os.path.exists(test_output_path): + raise FileNotFoundError('Output path is not valid.') + + + """ + Make sure output_path ends in / (mac) or \\ (windows). + This is necessary because earlier when testing the output path using OS, + a valid output path can still not end in a / or \\. When the path does not + end in a / or \\, then the file does not get saved correctly but pandas + doesn't raise an error. I'm not sure this is totally necessary, but + I made this mistake a few times while testing this stuff so I figured it might + be a nice feature. 
+ """ + #Set final_output_path_character = last character in output_path (NOTE(review): output_path[-1] raises IndexError when output_path is the empty-string default) + final_output_path_character = output_path[-1] + #if / in output_path (user is using MacOS or linux) + if "/" in output_path: + #if the final character in the path does not equal a / + if final_output_path_character != "/": + #add in a / to complete file path + output_path += "/" + #if \ in output_path (user is using Windows) + elif "\\" in output_path: + #if the last character does not equal \ + if final_output_path_character != "\\": + #add in a \ to output path + output_path += "\\" + + #set variable output equal to the output_path followed by the file name which is + #predicted_disorder_{name}.png, where name is the name specified earlier (first 10 characters + #of the fasta header either as is or with characters removed if remove_characters is set to true). + output = "{}predicted_disorder_{}.png".format(output_path, name) + #use the graph function (specified in meta_graph from the backend) to save the graph. + graph(sequence = sequence, name = title, DPI = DPI, save_fig = True, output_file = output) + else: + #if save was set to False, then just graph the sequences from the .fasta file and show them immediately. + graph(sequence = sequence, name = title, DPI = DPI) + diff --git a/metapredict/tests/__init__.py b/metapredict/tests/__init__.py new file mode 100644 index 0000000..325e75d --- /dev/null +++ b/metapredict/tests/__init__.py @@ -0,0 +1,3 @@ +""" +Empty init file in case you choose a package besides PyTest such as Nose which may look for such a file +""" diff --git a/metapredict/tests/test_metapredict.py b/metapredict/tests/test_metapredict.py new file mode 100644 index 0000000..8b1bdb6 --- /dev/null +++ b/metapredict/tests/test_metapredict.py @@ -0,0 +1,14 @@ +""" +Unit and regression test for the metapredict package. + +This is extremely underdone at this point... 
Sorry about that :'( +""" + +# Import package, test suite, and other packages as needed +import metapredict +import pytest +import sys + +def test_metapredict_imported(): + """Sample test, will always pass so long as import statement worked""" + assert "metapredict" in sys.modules diff --git a/scripts/graph-disorder b/scripts/graph-disorder new file mode 100644 index 0000000..fe98b64 --- /dev/null +++ b/scripts/graph-disorder @@ -0,0 +1,48 @@ +#!/usr/bin/env python + +#executing script for IDR predictor in command line. + +#import stuff for making CLI +import os +import sys +import argparse + +#import pandas as pd +import pandas as pd +#import protfasta +import protfasta + +#from metapredict import meta +from metapredict import meta + +#Parse command line arguments. +parser = argparse.ArgumentParser(description='Graph predicted disorder of amino acid sequences.') +parser.add_argument('data_file', help='Path to fasta file containing sequences to be predicted.') +parser.add_argument('output_path', help='Path for the returned disorder graphs.') +parser.add_argument('-D', '--DPI', default=150, type=int, metavar='DPI', + help='Optional. Set DPI to change resolution of output graphs. Default is 150.') +parser.add_argument('--remove_characters', action='store_true', + help='Use if you want to avoid using any non-alphabetic characters in the fasta headers as file names') + +args = parser.parse_args() + +DPI=args.DPI + +# Test to see if the data_file exists +test_data_file = os.path.abspath(args.data_file) +if not os.path.isfile(test_data_file): + raise FileNotFoundError('Datafile does not exist.') + +# Test to see that the output path is valid +test_output_path = os.path.abspath(args.output_path) +if not os.path.exists(test_output_path): + raise FileNotFoundError('Output path is not valid.') + +if args.remove_characters: + remove_char = True +else: + remove_char = False + +#run graph_disorder_fasta. For more info, see the graph_disorder_fasta function in meta.py. 
+meta.graph_disorder_fasta(filepath = args.data_file, DPI=DPI, output_path = args.output_path, remove_characters=remove_char) +#graph_disorder_fasta(filepath, DPI=150, save=True, output_path="", remove_characters=False): \ No newline at end of file diff --git a/scripts/predict-disorder b/scripts/predict-disorder new file mode 100644 index 0000000..f37f190 --- /dev/null +++ b/scripts/predict-disorder @@ -0,0 +1,58 @@ +#!/usr/bin/env python + +#executing script for IDR predictor in command line. + +#import stuff for making CLI +import os +import sys +import argparse + +import pandas as pd +import protfasta + +from metapredict import meta + +#Parse command line arguments. +parser = argparse.ArgumentParser(description='Predict intrinsic disorder of amino acid sequences.') +parser.add_argument('data_file', help='Path to fasta file containing sequences to be predicted.') +parser.add_argument('output_path', help='Path for the returned disorder file.') +parser.add_argument('output_name', + help='Name of the final output file. Do not add a file extension to output_name. 
The .csv file extension is added automatically.') +parser.add_argument('--no_normalization', action='store_true', + help='Use if you want to get raw values from predictor (not normalized from 0 to 1)') + +args = parser.parse_args() + +# Test to see if the data_file exists +test_data_file = os.path.abspath(args.data_file) +if not os.path.isfile(test_data_file): + raise FileNotFoundError('Datafile does not exist.') + +# Test to see that the output path is valid +test_output_path = os.path.abspath(args.output_path) +if not os.path.exists(test_output_path): + raise FileNotFoundError('Output path is not valid.') + +#if --no_normalization flag is set by user +if args.no_normalization: + #set normalization to false + normalization = False +else: + #otherwise normalize the predictor values from 0 to 1 + normalization = True + +#Check if there is a .csv in output_name (which there shouldn't be) +#set try_output_name = args.output_name +try_output_name = args.output_name +#if there is .csv in try_output_name +if ".csv" in try_output_name: + #split try_output_name and set output_file_name equal to everything before .csv + output_file_name = try_output_name.split(".csv")[0] +else: + #if .csv is not in try_output_name, set final output_final_name equal to args.output_name + output_file_name = args.output_name + + +#run predict disorder fasta +meta.predict_disorder_fasta(filepath = args.data_file, save=True, output_path = args.output_path, output_name = output_file_name, normalized=normalization) + diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..c186634 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,30 @@ +# Helper file to handle all configs + +[coverage:run] +# .coveragerc to control coverage.py and pytest-cov +omit = + # Omit the tests + */tests/* + # Omit generated versioneer + metapredict/_version.py + +[yapf] +# YAPF, in .style.yapf files this shows up as "[style]" header +COLUMN_LIMIT = 119 +INDENT_WIDTH = 4 +USE_TABS = False + +[flake8] +# Flake8, 
PyFlakes, etc +max-line-length = 119 + +[versioneer] +# Automatic version numbering scheme +VCS = git +style = pep440 +versionfile_source = metapredict/_version.py +versionfile_build = metapredict/_version.py +tag_prefix = '' + +[aliases] +test = pytest diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..784ad3d --- /dev/null +++ b/setup.py @@ -0,0 +1,66 @@ +""" +metapredict +A protein disorder predictor based on a BRNN (IDP-Parrot) trained on the consensus disorder values from 8 disorder predictors from 12 proteomes. +""" +import sys +from setuptools import setup, find_packages +import versioneer + +short_description = __doc__.split("\n") + +# from https://github.com/pytest-dev/pytest-runner#conditional-requirement +needs_pytest = {'pytest', 'test', 'ptr'}.intersection(sys.argv) +pytest_runner = ['pytest-runner'] if needs_pytest else [] + +try: + with open("README.md", "r") as handle: + long_description = handle.read() +except: + long_description = "\n".join(short_description[2:]) + + +setup( + # Self-descriptive entries which should always be present + name='metapredict', + author='Ryan Emenecker - Holehouse Lab - WUSM', + author_email='remenecker@wustl.edu', + description=short_description[0], + long_description=long_description, + long_description_content_type="text/markdown", + version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), + license='MIT', + + # Which Python importable modules should be included when your package is installed + # Handled automatically by setuptools. 
Use 'exclude' to prevent some specific + # subpackage(s) from being added, if needed + packages=find_packages(), + + # Optional include package data to ship with your package + # Customize MANIFEST.in if the general case does not suit your needs + # Comment out this line to prevent the files from being packaged with your software + include_package_data=True, + + # Allows `setup.py test` to work correctly with pytest + setup_requires=[] + pytest_runner, + scripts=['scripts/graph-disorder', + 'scripts/predict-disorder'], + + # Additional entries you may want simply uncomment the lines you want and fill in the data + # url='http://www.my_package.com', # Website + install_requires=[ + 'torch', + 'numpy', + 'matplotlib', + 'pandas', + 'protfasta'], # Required packages, pulls from pip if needed; do not use for Conda deployment + # platforms=['Linux', + # 'Mac OS-X', + # 'Unix', + # 'Windows'], # Valid platforms your code works on, adjust to your flavor + python_requires=">=3.5", # Python version restrictions + + # Manual control if final package is compressible or not, set False to prevent the .egg from being made + # zip_safe=False, + +) diff --git a/versioneer.py b/versioneer.py new file mode 100644 index 0000000..64fea1c --- /dev/null +++ b/versioneer.py @@ -0,0 +1,1822 @@ + +# Version: 0.18 + +"""The Versioneer - like a rocketeer, but for versions. + +The Versioneer +============== + +* like a rocketeer, but for versions! +* https://github.com/warner/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy +* [![Latest Version] +(https://pypip.in/version/versioneer/badge.svg?style=flat) +](https://pypi.python.org/pypi/versioneer/) +* [![Build Status] +(https://travis-ci.org/warner/python-versioneer.png?branch=master) +](https://travis-ci.org/warner/python-versioneer) + +This is a tool for managing a recorded version number in distutils-based +python projects. 
The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. + + +## Quick Install + +* `pip install versioneer` to somewhere to your $PATH +* add a `[versioneer]` section to your setup.cfg (see below) +* run `versioneer install` in your source tree, commit the results + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). 
Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes). + +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. + +## Version-String Flavors + +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. 
+ +Both functions return a dictionary with different flavors of version +information: + +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. + +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". + +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. + +* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None + +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". + +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. + +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions + +## Styles + +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. 
+ +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". + +Other styles are available. See [details.md](details.md) in the Versioneer +source tree for descriptions. + +## Debugging + +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). + +## Known Limitations + +Some situations are known to cause problems for Versioneer. This details the +most significant ones. More can be found on Github +[issues page](https://github.com/warner/python-versioneer/issues). + +### Subprojects + +Versioneer has limited support for source trees in which `setup.py` is not in +the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are +two common reasons why `setup.py` might not be in the root: + +* Source trees which contain multiple subprojects, such as + [Buildbot](https://github.com/buildbot/buildbot), which contains both + "master" and "slave" subprojects, each with their own `setup.py`, + `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI + distributions (and upload multiple independently-installable tarballs). 
+* Source trees whose main purpose is to contain a C library, but which also + provide bindings to Python (and perhaps other languages) in subdirectories. + +Versioneer will look for `.git` in parent directories, and most operations +should get the right version string. However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). + +`pip install --editable .` should work correctly. `setup.py install` might +work too. + +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. + +[Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking +this issue. The discussion in +[PR #61](https://github.com/warner/python-versioneer/pull/61) describes the +issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly. + +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. + +### Editable installs with setuptools <= 18.5 + +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change. + +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. + +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. 
This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. + +[Bug #83](https://github.com/warner/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. + +### Unicode version strings + +While Versioneer works (and is continually tested) with both Python 2 and +Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. +Newer releases probably generate unicode version strings on py2. It's not +clear that this is wrong, but it may be surprising for applications which then +write these strings to a network connection or include them in bytes-oriented +APIs like cryptographic checksums. + +[Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates +this question. + + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +## Future Directions + +This tool is designed to be easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. 
It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. + + +## License + +To make Versioneer easier to embed, all its code is dedicated to the public +domain. The `_version.py` that it creates is also in the public domain. +Specifically, both are released under the Creative Commons "Public Domain +Dedication" license (CC0-1.0), as described in +https://creativecommons.org/publicdomain/zero/1.0/ . + +""" + +from __future__ import print_function +try: + import configparser +except ImportError: + import ConfigParser as configparser +import errno +import json +import os +import re +import subprocess +import sys + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_root(): + """Get the project root directory. + + We require that all commands are run from the project root, i.e. the + directory that contains setup.py, setup.cfg, and versioneer.py . + """ + root = os.path.realpath(os.path.abspath(os.getcwd())) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + # allow 'python path/to/setup.py COMMAND' + root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + err = ("Versioneer was unable to run the project root directory. 
" + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND').") + raise VersioneerBadRootError(err) + try: + # Certain runtime workflows (setup.py install/develop in a setuptools + # tree) execute all dependencies in a single python process, so + # "versioneer" may be imported multiple times, and python's shared + # module-import table will cache the first one. So we can't use + # os.path.dirname(__file__), as that will find whichever + # versioneer.py was first imported, even in later projects. + me = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(me)[0]) + vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) + if me_dir != vsr_dir: + print("Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py)) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise EnvironmentError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg . 
+ setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.SafeConfigParser() + with open(setup_cfg, "r") as f: + parser.readfp(f) + VCS = parser.get("versioneer", "VCS") # mandatory + + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = get(parser, "style") or "" + cfg.versionfile_source = get(parser, "versionfile_source") + cfg.versionfile_build = get(parser, "versionfile_build") + cfg.tag_prefix = get(parser, "tag_prefix") + if cfg.tag_prefix in ("''", '""'): + cfg.tag_prefix = "" + cfg.parentdir_prefix = get(parser, "parentdir_prefix") + cfg.verbose = get(parser, "verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if 
sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +LONG_VERSION_PY['git'] = ''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = 
stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. 
However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. 
"2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse 
describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + me = __file__ + if me.endswith(".pyc") or me.endswith(".pyo"): + me = os.path.splitext(me)[0] + ".py" + versioneer_file = os.path.relpath(me) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + f = open(".gitattributes", "r") + for line in f.readlines(): + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + f.close() + except EnvironmentError: + pass + if not present: + f = open(".gitattributes", "a+") + f.write("%s export-subst\n" % versionfile_source) + f.close() + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.18) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. 
+ +import json + +version_json = ''' +%s +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except EnvironmentError: + raise NotThisMethod("unable to read _version.py") + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set %s to '%s'" % (filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + + Returns dict with keys: version, full-revisionid, dirty, error, date.
+ """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. + + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print("got version from file %s %s" % (versionfile_abs, ver)) + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + 
print("unable to compute version") + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version", + "date": None} + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(): + """Get the custom setuptools/distutils subclasses used by Versioneer.""" + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to its pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too.
+ # Also see https://github.com/warner/python-versioneer/issues/52 + + cmds = {} + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? + # pip install: + # copies source tree to a tempdir before running egg_info/etc + # if .git isn't copied too, 'git describe' will fail + # then does setup.py bdist_wheel, or sometimes setup.py install + # setup.py egg_info -> ? 
+ + # we override different "build_py" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string + # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. + # setup(console=[{ + # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION + # "product_version": versioneer.get_version(), + # ... + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + if 'py2exe' in sys.modules: # py2exe enabled? 
+ try: + from py2exe.distutils_buildexe import py2exe as _py2exe # py3 + except ImportError: + from py2exe.build_exe import py2exe as _py2exe # py2 + + class cmd_py2exe(_py2exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _py2exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["py2exe"] = cmd_py2exe + + # we override different "sdist" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + self._versioneer_generated_versions = versions + # unless we update this, the command will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, + self._versioneer_generated_versions) + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. 
You need +a section like: + + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- + +You will also need to edit your setup.py to use the results: + + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) + +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. + +[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = + +""" + +INIT_PY_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + + +def do_setup(): + """Main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (EnvironmentError, configparser.NoSectionError, + configparser.NoOptionError) as e: + if isinstance(e, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", + file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), + "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy, "r") as f: + old = f.read() + 
except EnvironmentError: + old = "" + if INIT_PY_SNIPPET not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(INIT_PY_SNIPPET) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. + manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in, "r") as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except EnvironmentError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print(" appending versionfile_source ('%s') to MANIFEST.in" % + cfg.versionfile_source) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. 
+ do_vcs_install(manifest_in, cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py", "r") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1)