diff --git a/AUTHORS.rst b/AUTHORS.rst deleted file mode 100644 index 7bbef57..0000000 --- a/AUTHORS.rst +++ /dev/null @@ -1,16 +0,0 @@ -======= -Credits -======= - -Development Team ----------------- - -* William Boag -* Kevin Wacome -* Tristan Naumann -* Anna Rumshisky - -Contributors ------------- - -None yet. Why not be the first? diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst deleted file mode 100644 index 64d4416..0000000 --- a/CONTRIBUTING.rst +++ /dev/null @@ -1,111 +0,0 @@ -============ -Contributing -============ - -Contributions are welcome, and they are greatly appreciated! Every -little bit helps, and credit will always be given. - -You can contribute in many ways: - -Types of Contributions ----------------------- - -Report Bugs -~~~~~~~~~~~ - -Report bugs at https://github.com/mitmedg/CliCon/issues. - -If you are reporting a bug, please include: - -* Your operating system name and version. -* Any details about your local setup that might be helpful in troubleshooting. -* Detailed steps to reproduce the bug. - -Fix Bugs -~~~~~~~~ - -Look through the GitHub issues for bugs. Anything tagged with "bug" -is open to whoever wants to implement it. - -Implement Features -~~~~~~~~~~~~~~~~~~ - -Look through the GitHub issues for features. Anything tagged with "feature" -is open to whoever wants to implement it. - -Write Documentation -~~~~~~~~~~~~~~~~~~~ - -CliCon could always use more documentation, whether as part of the -official CliCon docs, in docstrings, or even on the web in blog posts, -articles, and such. - -Submit Feedback -~~~~~~~~~~~~~~~ - -The best way to send feedback is to file an issue at https://github.com/mitmedg/CliCon/issues. - -If you are proposing a feature: - -* Explain in detail how it would work. -* Keep the scope as narrow as possible, to make it easier to implement. -* Remember that this is a volunteer-driven project, and that contributions - are welcome :) - -Get Started! ------------- - -Ready to contribute? 
Here's how to set up `clicon` for local development. - -1. Fork the `clicon` repo on GitHub. -2. Clone your fork locally:: - - $ git clone git@github.com:your_name_here/clicon.git - -3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: - - $ mkvirtualenv clicon - $ cd clicon/ - $ python setup.py develop - -4. Create a branch for local development:: - - $ git checkout -b name-of-your-bugfix-or-feature - - Now you can make your changes locally. - -5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: - - $ flake8 clicon tests - $ python setup.py test - $ tox - - To get flake8 and tox, just pip install them into your virtualenv. - -6. Commit your changes and push your branch to GitHub:: - - $ git add . - $ git commit -m "Your detailed description of your changes." - $ git push origin name-of-your-bugfix-or-feature - -7. Submit a pull request through the GitHub website. - -Pull Request Guidelines ------------------------ - -Before you submit a pull request, check that it meets these guidelines: - -1. The pull request should include tests. -2. If the pull request adds functionality, the docs should be updated. Put - your new functionality into a function with a docstring, and add the - feature to the list in README.rst. -3. The pull request should work for Python 2.6, 2.7, and 3.3, 3.4, and for PyPy. Check - https://travis-ci.org/tnaumann/clicon/pull_requests - and make sure that the tests pass for all supported Python versions. 
- -Tips ----- - -To run a subset of tests:: - - $ python -m unittest tests.test_clicon \ No newline at end of file diff --git a/COPYRIGHT b/COPYRIGHT deleted file mode 100644 index f2c1843..0000000 --- a/COPYRIGHT +++ /dev/null @@ -1 +0,0 @@ -Copyright (C) 2014 MEDG at MIT CSAIL & Text Machine Lab for NLP at UMass Lowell diff --git a/HISTORY.rst b/HISTORY.rst deleted file mode 100644 index 2ad74df..0000000 --- a/HISTORY.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. :changelog: - -History -------- - -0.1 (YYYY-MM-DD) --------------------- - -* First release on PyPI. diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 5c304d1..0000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ -Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/LSTM_parameters.txt b/LSTM_parameters.txt new file mode 100644 index 0000000..c9c4fcd --- /dev/null +++ b/LSTM_parameters.txt @@ -0,0 +1,22 @@ +token_pretrained_embedding_filepath vectors2.txt +load_all_pretrained_token_embeddings False +load_only_pretrained_token_embeddings False +tagging_format bio +use_character_lstm True +use_crf True +Use_LSTM True +use_features_before_final_lstm False +character_embedding_dimension 25 +character_lstm_hidden_state_dimension 25 +token_embedding_dimension 100 +freeze_token_embeddings False +token_lstm_hidden_state_dimension 100 +optimizer sgd +gradient_clipping_value 5.0 +remap_unknown_tokens_to_unk True +learning_rate 0.005 +check_for_lowercase True +check_for_digits_replaced_with_zeros True +model_folder ./models/NN_models/Test_November +conll_like_result_folder ./RESULTS/TEST_SAVER/NOVEMBER_DEBUG/ +model_name model_00001.ckpt diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 6fd9409..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,11 +0,0 @@ -include AUTHORS.rst -include CONTRIBUTING.rst -include HISTORY.rst -include LICENSE -include README.rst - -recursive-include tests * -recursive-exclude * __pycache__ -recursive-exclude * *.py[co] - -recursive-include docs *.rst conf.py Makefile make.bat \ No newline at end of file diff --git a/Makefile b/Makefile deleted file mode 100644 index ff3aa43..0000000 --- a/Makefile +++ 
/dev/null @@ -1,57 +0,0 @@ -.PHONY: clean-pyc clean-build docs clean - -help: - @echo "clean-build - remove build artifacts" - @echo "clean-pyc - remove Python file artifacts" - @echo "lint - check style with flake8" - @echo "test - run tests quickly with the default Python" - @echo "test-all - run tests on every Python version with tox" - @echo "coverage - check code coverage quickly with the default Python" - @echo "docs - generate Sphinx HTML documentation, including API docs" - @echo "release - package and upload a release" - @echo "dist - package" - -clean: clean-build clean-pyc - rm -fr htmlcov/ - -clean-build: - rm -fr build/ - rm -fr dist/ - rm -fr *.egg-info - -clean-pyc: - find . -name '*.pyc' -exec rm -f {} + - find . -name '*.pyo' -exec rm -f {} + - find . -name '*~' -exec rm -f {} + - -lint: - flake8 clicon tests - -test: - python setup.py test - -test-all: - tox - -coverage: - coverage run --source clicon setup.py test - coverage report -m - coverage html - open htmlcov/index.html - -docs: - rm -f docs/clicon.rst - rm -f docs/modules.rst - sphinx-apidoc -o docs/ clicon - $(MAKE) -C docs clean - $(MAKE) -C docs html - open docs/_build/html/index.html - -release: clean - python setup.py sdist upload - python setup.py bdist_wheel upload - -dist: clean - python setup.py sdist - python setup.py bdist_wheel - ls -l dist \ No newline at end of file diff --git a/README.rst b/README.rst index ab6a85e..733985f 100644 --- a/README.rst +++ b/README.rst @@ -1,289 +1,117 @@ +This repository has been archived, and is now read-only. + =============================== CliNER =============================== -Clinical Named Entity Recognition system (CliNER) is an open-source natural language processing system for named entity recognition in clinical text of electronic health records. CliNER system is designed to follow best practices in clinical concept extraction, as established in i2b2 2010 shared task. 
+Clinical Named Entity Recognition system (CliNER) is an open-source natural language processing system for named entity recognition in clinical text of electronic health records. CliNER system is designed to follow best practices in clinical concept extraction, as established in i2b2 2010 shared task. -CliNER is implemented as a two-pass machine learning system for named entity recognition, currently using a Conditional Random Fields (CRF) classifier to establish concept boundaries and a Support Vector Machine (SVM) classifier to establish the type of concept. +CliNER is implemented as a sequence classification task, where every token is predicted IOB-style as either: Problem, Test, Treatment, or None. Command line flags let you specify two different sequence classification algorithms: + 1. CRF (default) - with linguistic and domain-specific features + 2. LSTM -Please note that for optimal performance, CliNER requires the users to obtain a Unified Medical Language System (UMLS) license, since UMLS Metathesaurus is used as one of the knowledge sources for the above classifiers. +Please note that for optimal performance, CliNER requires the users to obtain a Unified Medical Language System (UMLS) license, since UMLS Metathesaurus is used as one of the knowledge sources for the above classifiers. * Free software: Apache v2.0 license -* Documentation: http://clicon.readthedocs.org. - +* See the CliNER Wiki page for additional resources. + + https://github.com/text-machine-lab/CliNER/wiki + -Installation +Out-of-the-Box Model -------- -**Cloning the CliCon git repository:** - -:: - - user@your-machine:~$ git clone https://github.com/mitmedg/CliCon.git - Cloning into 'CliCon'... - remote: Counting objects: 1296, done. - remote: Compressing objects: 100% (503/503), done. - remote: Total 1296 (delta 812), reused 1253 (delta 781) - Receiving objects: 100% (1296/1296), 1001.14 KiB | 759 KiB/s, done. - Resolving deltas: 100% (812/812), done. 
- - -**Using an installation script** - -Linux users can use an installation script to download and install all the components of this project, including third-party dependencies. Note that it can not get tools and data that require special use agreements (including the i2b2 data and the UMLS tables), which have to be obtained separately. - -The following packages need to be on the system for the script to work: - -:: - - python-pip - python-virtualenv - python-dev - - - - -Some of python modules used by CliNER have the following dependencies, which also need to be installed on the system: - -:: - - g++ - gfortran - libopenblas-dev - liblapack-dev - -For Ubuntu users, the above are the names of the packages that need to be installed. A typical command to install a python module on a Debian-flavored Linux is: - -:: - - apt-get install - - -Although the script is able to build python dependencies via pip, this is a slow process. It would be much faster to obtain binaries of certain python modules and then run the script: - -:: - - numpy - scipy - scikit-learn (version 0.14) - - -To invoke the script, ensure you are running a bash shell. Next, ``cd`` into the ``CliCon`` directory: - -:: - - user@your-machine:~$ cd CliCon - user@your-machine:~/CliCon$ source install.sh - - -If the installation script encounters issues, please see the README section corresponding to the failure message. - -If you opt not to use the provided script, you must follow the steps described below, starting with setting up virtual environments and environment variables. Not terribly difficult (we hope!). - -Please email wboag@cs.uml.edu with your installation questions. - - -**Step-by-step installation instructions:** - - -(1) Set up virtualenv - - Setup a virtual environent. You must re-enable the virtual environment every new session. 
- - :: - - user@your-machine:~$ virtualenv venv_clicon - user@your-machine:~$ source venv_clicon/bin/activate - - - reference - https://virtualenv.pypa.io/en/latest/ - - - -(2) Set the CLICON_DIR environment variable - - In order to run CliNER, you must define the CLICON_DIR environment variable. - - **This variable must be the path of the directory created by git.** - - :: - - user@your-machine:~$ export CLICON_DIR=$(pwd)/CliCon - - - -(3) Install dependencies - +Although i2b2 licensing prevents us from releasing our cliner models trained on i2b2 data, we generated some comparable models from automatically-annotated MIMIC II text. - Ensure the following packages are installed on the system (they are used for building the required Python dependencies): +This silver MIMIC model can be found at http://text-machine.cs.uml.edu/cliner/models/silver.crf - Linux: - * python-pip - * python-virtualenv - * python-dev - * g++ - * gfortran - * libopenblas-dev - * liblapack-dev - - - Mac OSX (e.g. using [Homebrew](http://brew.sh/)): - * python - * gfortran - - - Ensure the following python modules are installed: - * nose - * numpy - * scikit-learn (version 0.14) - * scipy - * python-crfsuite - * nltk (AND run the NLTK downloader) - - - :: - - (venv_clicon)user@your-machine:~/CliCon$ sudo apt-get install python-pip python-virtualenv python-dev g++ gfortran libopenblas-dev liblapack-dev -y - (venv_clicon)user@your-machine:~/CliCon$ pip install nose numpy scikit-learn scipy nltk python-crfsuite - (venv_clicon)user@your-machine:~/CliCon$ python -m nltk.downloader maxent_treebank_pos_tagger wordnet - - - - -(4) Get i2b2 2010 shared task data - - The Data Use and Confidentiality Agreement (DUA) with i2b2 forbids us from redistributing the i2b2 data. In order to gain access to the data, you must go to: - - https://www.i2b2.org/NLP/DataSets/AgreementAR.php - - to register and sign the DUA. Then you will be able to request the data through them. 
- - - Although we cannot provide i2b2 data, there is a sample to demonstrate how the data is formatted (not actual data from i2b2, though). **Here is a very basic description of the data formats.** It is by no means a complete tutorial. - - * $CLICON_DIR/examples/pretend.txt - - This is a text file. Discharge summaries are written out in plaintext, just like this. It is paired with a concept file, which has its annotations. - - * $CLICON_DIR/examples/pretend.con - - This is a concept file. It provides annotations for the concepts (problem, treatment, test) of the text file. The format is as follows - each instance of a concept has one line. The line describes the word span, the line number and token numbers of the span (delimited by white space), and the label of the concept. - - * $CLICON_DIR/examples/pretend.xml - - This is an alternative way to annotate concepts from a discharge summary. Unlike the text/concept files, this format is not in a pair - it provides both the text and annotations for the discharge summary. This format is easier to read. - - - - - -(5) Install GENIA tagger (optional) - - This is an optional part of installation. Adding the GENIA tagger will improve results of the system's predictions, but it could run without it. - - Steps - - 1. First you must download the sources for GENIA. Do that with ``wget http://www.nactem.ac.uk/tsujii/GENIA/tagger/geniatagger-3.0.1.tar.gz`` +Installation +-------- - 2. In order to compile the sources, you may need to edit a C++ so that it has an additional include directive. This should be able to be accomplished by enterring the geniatagger-3.0.1/ directory and running ``echo "$(sed '1i#include ' morph.cpp)" > morph.cpp`` + $ git clone https://github.com/text-machine-lab/CliNER.git - 3. Compile GENIA. 
Just run ``make`` + $ pip install -r requirements.txt + +Verifying Installation +-------- + + $ wget http://text-machine.cs.uml.edu/cliner/models/silver.crf + + $ mv silver.crf models/silver.crf + + $ python cliner predict --txt data/examples/ex_doc.txt --out data/predictions --model models/silver.crf --format i2b2 - 4. If you do not have any errors, then the tagger has been built successfully. If there were compile errors, try to resolve them (it'd be one of those "well it works for me" scenarios). +If you *do not* run into build errors, then your installation is complete. - 5. Set the file "$CLICON_DIR/config.txt" so that the line that has "GENIA None" is replaced with "GENIA '. This file is how CliCon is able to find and run the tagger. +Example Data +-------- +Although we cannot provide i2b2 data, there is a sample to demonstrate how the data is formatted (not actual data from i2b2, though). + data/examples/ex_doc.txt -(6) Get UMLS tables (optional) +This is a text file. Discharge summaries are written out in plaintext, just like this. It is paired with a concept file, which has its annotations. - This is an optional part of installation. Adding the UMLS tables will improve results of the system's predictions, but it could run without it. + data/examples/ex_doc.con - In order to use the UMLS tables, you must request a license. See: +This is a concept file. It provides annotations for the concepts (problems, treatments, and tests) of the text file. The format is as follows - each instance of a concept has one line. The line shows the text span, the line number, token numbers of the span (delimited by white space), and the label of the concept. 
- http://www.nlm.nih.gov/databases/umls.html +**Please note that the example data is simply one of many examples that can found online.** - You will need to get following tables: **MRREL, MRCON, MRSTY** +Usage +-------- - **Put these tables in the $CLICON_DIR/umls_tables directory.** +Here are some use cases: - In order to tell CliNER that the tables are there, you must edit the file "$CLICON_DIR/config.txt" and change the line saying "UMLS None" to "UMLS True". This command will do that ``sed -i "s/UMLS None/UMLS True/g" $CLICON_DIR/config.txt`` +(1) Help - **The database will be built from the tables when CliNER is run for the first time.** +This help message will list the options available to run (train/predict/evaluate) + python cliner --help +(2) Training -(7) Create 'clicon' executable script for command-line use +These examples demonstrate how to build a CliNER model which can then be used for predicting concepts in text files. - In order to run CliNER (as done in the usage examples), you must run setup.py. + python cliner train --txt data/examples/ex_doc.txt --annotations data/examples/ex_doc.con --format i2b2 --model models/foo.model - As long as the Python dependencies are properly installed, you should be able to run the setup script. +This example trains a very simple CliNER model. The (pretend.txt, pretend.con) pair form as the only document for learning to identify concepts. We must specify that these files are i2b2 format (even though the .con extension implies i2b2 format, you can never be too careful). The CliNER model is then serialized to models/foo.model as specified. 
- If it works, you should see a brief help message when invoking clicon with the ``--help`` option: +**Please note that multiple files could be passed by enclosing them as a glob within "" quotes.** - :: +(3) Prediction - (venv_clicon)user@your-machine:~/CliCon$ python $CLICON_DIR/setup.py install - (venv_clicon)user@your-machine:~/CliCon$ clicon --help +Once your CliNER model is built, you can use it to predict concepts in text files. + python cliner predict --txt data/examples/ex_doc.txt --out data/test_predictions/ --format i2b2 --model models/foo.model +In this example, we use the models/foo.model CliNER model that we built up above. This model is used to predict concepts in i2b2 format for the "ex_doc.txt" file. This generates a file named "ex_doc.con" and stores it in the specified output directory. -(8) Run unit tests +(4) Evaluation - [this section is under construction] +This allows us to evaluate how well CliNER does by comparing it against a gold standard. + python cliner evaluate --txt data/examples/ex_doc.txt --gold examples --predictions data/test_predictions/ --format i2b2 +Evaluate how well the system predictions did. Both sets of data must be in the same format, and that format must be specified. This means that both the examples and data/test_predictions directories contain the file pretend.con. -Usage Examples +Optional Resources -------- - Demo Script - :: - user@your-machine:~/CliCon$ source install.sh - (venv_clicon)user@your-machine:~/CliCon$ bash examples/demo.sh - - - i2b2 format - - Train model on i2b2-formatted data - :: - (venv_clicon)user@your-machine:~/CliCon$ clicon train $CLICON_DIR/examples/pretend.txt --annotations $CLICON_DIR/examples/pretend.con - - Train model on i2b2-formatted data with SVM grid search (NOTE: Currently does not work with sample data because the sample data is too small for cross validation). 
- :: - (venv_clicon)user@your-machine:~/CliCon$ clicon train $CLICON_DIR/examples/pretend.txt --annotations $CLICON_DIR/examples/pretend.con --grid-search - - Predict concepts and output in i2b2 format - :: - (venv_clicon)user@your-machine:~/CliCon$ clicon predict $CLICON_DIR/examples/pretend.txt --out $CLICON_DIR/data/test_predictions/ - - Evaluation - :: - (venv_clicon)user@your-machine:~/CliCon$ clicon evaluate $CLICON_DIR/examples/pretend.txt --gold $CLICON_DIR/examples --predictions $CLICON_DIR/data/test_predictions/ --format i2b2 - - Change Format - :: - (venv_clicon)user@your-machine:~/CliCon$ clicon format $CLICON_DIR/examples/pretend.txt --annotations $CLICON_DIR/data/test_predictions/pretend.con --format xml - +There are a few external resources that are not packaged with CliNER but can improve prediction performance for feature extraction with the CRF. - xml format +**GENIA** - Train model on xml-formatted data - :: - (venv_clicon)user@your-machine:~/CliCon$ clicon train $CLICON_DIR/examples/pretend.txt --annotations $CLICON_DIR/examples/pretend.xml --format xml +*Why would I want this?* - Predict concepts and output in xml format - :: - (venv_clicon)user@your-machine:~/CliCon$ clicon predict $CLICON_DIR/examples/pretend.txt --out $CLICON_DIR/data/test_predictions/ --format xml +The GENIA tagger is a tool similar to CliNER but designed for biomedical text. Depending on the domain of your data, this tool's pretrained model may or may not be able to improve performance for CliNER as it detects concepts. 
- Evaluation - :: - (venv_clicon)user@your-machine:~/CliCon$ clicon evaluate $CLICON_DIR/examples/pretend.txt --gold $CLICON_DIR/examples --predictions $CLICON_DIR/data/test_predictions/ --format xml +**UMLS** - Change Format - :: - (venv_clicon)user@your-machine:~/CliCon$ clicon format $CLICON_DIR/data/test_predictions/pretend.xml --format i2b2 +*Why would I want this?* +The UMLS, or Unified Medical Language System, is a very comprehensive database of various medical terms and concepts. Access to it would allow CliNER to leverage domain-specific knowledge. +For installation of optional resources, please refer to the CliNER Wiki: + https://github.com/text-machine-lab/CliNER/wiki#optional-resources diff --git a/clicon/cli.py b/clicon/cli.py deleted file mode 100644 index 94a96e0..0000000 --- a/clicon/cli.py +++ /dev/null @@ -1,187 +0,0 @@ -###################################################################### -# CliNER - cli.py # -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Command Line Interface for working with CliNER. # -###################################################################### - - -__author__ = 'Willie Boag' -__date__ = 'Oct. 5, 2014' - - - -import click -import os -import sys -import subprocess -import glob - -sys.path.append( os.environ['CLICON_DIR'] + "/clicon/notes" ) - -from note import Note - -@click.group() -def clicon(): - pass - - -supported_formats_help = "Data format ( " + ' | '.join(Note.supportedFormats()) + " )" - - -# Train -@clicon.command() -@click.option('--annotations' , help='Concept files for training.' ) -@click.option('--model' , help='Model output by train.' 
) -@click.option('--format' , help=supported_formats_help ) -@click.option('--grid/--no-grid', help='Flag that enables grid search', - default=False) -@click.option('--crf/--no-crf' , help='Flag that enables crfsuite' , - default=True) -@click.argument('input') -def train(annotations, model, format, grid, crf, input): - - # training data needs concept file annotations - if not annotations: - print >>sys.stderr, '\n\tError: Must provide annotations for text files' - print >>sys.stderr, '' - exit(1) - - # Base directory - BASE_DIR = os.environ.get('CLICON_DIR') - if not BASE_DIR: - raise Exception('Environment variable CLICON_DIR must be defined') - - # Executable - runable = os.path.join(BASE_DIR, 'clicon/train.py') - - # Build command - cmd = ['python', runable, '-t', input] - - # Arguments - if annotations: - cmd += ['-c', annotations] - if model: - cmd += ['-m', model] - if format: - cmd += ['-f', format] - if grid: - cmd += ['-g'] - if not crf: - cmd += ['-no-crf'] - - # Execute train.py - subprocess.call(cmd) - - - - -# Predict -@clicon.command() -@click.option('--out' , help='The directory to write the output') -@click.option('--model' , help='Model used to predict on files' ) -@click.option('--format', help=supported_formats_help ) -@click.argument('input') -def predict(model, out, format, input): - - # Base directory - BASE_DIR = os.environ.get('CLICON_DIR') - if not BASE_DIR: - raise Exception('Environment variable CLICON_DIR must be defined') - - # Executable - runable = os.path.join(BASE_DIR,'clicon/predict.py') - - # Build command - cmd = ['python', runable, '-i', input] - - # Optional arguments - if out: - cmd += ['-o', out] - if model: - cmd += ['-m', model] - if format: - cmd += ['-f', format] - - # Execute train.py - subprocess.call(cmd) - - - - - -# Evaluate -@clicon.command() -@click.option('--predictions', help='Directory where predictions are stored.') -@click.option('--gold' , help='Directory where gold standard is stored.') 
-@click.option('--out' , help='Output file' ) -@click.option('--format' , help=supported_formats_help ) -@click.argument('input') -def evaluate(predictions, gold, out, format, input): - - # Base directory - BASE_DIR = os.environ.get('CLICON_DIR') - if not BASE_DIR: - raise Exception('Environment variable CLICON_DIR must be defined') - - # Executable - runable = os.path.join(BASE_DIR,'clicon/evaluate.py') - - # Build command - cmd = ['python', runable, '-t', input] - - # Optional arguments - if predictions: - cmd += ['-c', predictions] - if gold: - cmd += ['-r', gold] - if out: - cmd += ['-o', out] - if format: - cmd += ['-f', format] - - # Execute train.py - subprocess.call(cmd) - - - - - -# Format -@clicon.command() -@click.option('--annotations', help='Concept files for training.') -@click.option('--format' , help=supported_formats_help ) -@click.option('--out' , help='File to write the output.' ) -@click.argument('input') -def format(annotations, format, out, input): - - # Base directory - BASE_DIR = os.environ.get('CLICON_DIR') - if not BASE_DIR: - raise Exception('Environment variable CLICON_DIR must be defined') - - # Executable - runable = os.path.join(BASE_DIR,'clicon/format.py') - - # Build command - cmd = ['python', runable, flag, input] - - # Optional arguments - if annotations: - cmd += ['-a', annotations] - if out: - cmd += ['-o', out] - if format: - cmd += ['-f', format] - - # Execute train.py - subprocess.call(cmd) - - - - -if __name__ == '__main__': - clicon() - - diff --git a/clicon/evaluate.py b/clicon/evaluate.py deleted file mode 100644 index 8bf7e91..0000000 --- a/clicon/evaluate.py +++ /dev/null @@ -1,471 +0,0 @@ -###################################################################### -# CliNER - evaluate.py # -# # -# Kevin Wacome kwacome@cs.uml.edu # -# # -# Purpose: Evaluate predictions of concept labels against gold. # -###################################################################### - - -__author__ = 'Kevin Wacome' -__date__ = 'Aug. 
20, 2014' - - - -import os -import sys -import argparse -import glob -import helper -from copy import deepcopy - -from notes.note import Note -from notes.note import concept_labels as labels - -def containsSpan(s1, s2): - return (s1[0] <= s2[0]) and (s2[1] <= s1[1]) - -def spanOverlap(s1, s2): - if s2[0] <= s1[0] <= s2[1]: return True - if s2[0] <= s1[1] <= s2[1]: return True - if s1[0] <= s2[0] <= s1[1]: return True - if s1[0] <= s2[1] <= s1[1]: return True - return False - -def getConceptSpans(boundaries, classifications): - - conceptSpans = {} - - for lineIndex, span in enumerate(boundaries): - for boundaryIndex, boundary in enumerate(span): - if boundary == 'B': - - concept = classifications[lineIndex][boundaryIndex] - beginning = boundaryIndex - end = boundaryIndex - - if conceptSpans.has_key(lineIndex) == False: - conceptSpans[lineIndex] = {} - - for possibleEnd in span[boundaryIndex+1:]: - if possibleEnd == 'B' or possibleEnd == 'O': - break - if possibleEnd == 'I': - end += 1 - conceptSpans[lineIndex].update({(beginning,end):concept}) - - return conceptSpans - -def evaluate(referenceSpans, predictedSpans, exactMatch=False, reportSeperately=False): - - #used to generate a dictionary of dictionaries - #of the form measuresForClasses["treatment"]["True Positives"] -> Number of True Positives for treatment classes - classes = [ - "treatment", - "problem", - "test" - ] - - measures = { - "True Positives":0, - "False Negatives":0, - "False Positives":0 - } - - confusion = [[0] * len(labels) for e in labels] - - #TO DO: figure out how to report concepts seperately and have it work with reporting together as well. 
- - measuresForClasses = {classKey:deepcopy(measures) for classKey in dict.fromkeys(classes)} - - falseNegs = [] - - for line in referenceSpans: - - #if the line does not exist for whatever reason for all spans on that line - #mark them as false negative - if line not in predictedSpans: - for spanNotInPredictedSpan in referenceSpans[line]: - classification = referenceSpans[line][spanNotInPredictedSpan] - measuresForClasses[classification]["False Negatives"] +=1 - confusion[labels[classification]][labels['none']] += 1 - continue - - if exactMatch == True: - - for span in referenceSpans[line]: - - classInRefSpan = referenceSpans[line][span] - - # if the span exists & concept matches, then true positive - if span in predictedSpans[line]: - - classInPredSpan = predictedSpans[line][span] - - if referenceSpans[line][span] == predictedSpans[line][span]: - measuresForClasses[classInRefSpan]["True Positives"] += 1 - else: - measuresForClasses[classInRefSpan]["False Negatives"] += 1 - confusion[labels[classInRefSpan]][labels[classInPredSpan]] += 1 - predictedSpans[line].pop(span) - - else: - - overlap = False - for pSpan,v in predictedSpans[line].items(): - #if containsSpan(span, pSpan): - if spanOverlap(span, pSpan): # or spanOverlap(pSpan,span): - overlap = True - - if not overlap: - measuresForClasses[classInRefSpan]["False Negatives"] += 1 - confusion[labels[classInRefSpan]][labels['none']] += 1 - - else: - - #find true positives for inexact spans - accountedFor = {} - for span in referenceSpans[line]: - - accountedFor[span] = False - classInRefSpan = referenceSpans[line][span] - - longestMatchingSpanWithMatchingClassification = {"Predicted Span":None, "Predicted Span length":None, "Predicted Concept":None} - - - for predictedSpan in predictedSpans[line]: - - classinPredSpan = predictedSpans[line][predictedSpan] - - #FIND LONGEST OVERLAP - if predictedSpan[0] >= span[0] and predictedSpan[0] <= span[1] \ - or predictedSpan[1] >= span[0] and predictedSpan[1] <= span[1]: - 
- #find longest span with match - if classInRefSpan == classinPredSpan: - - if longestMatchingSpanWithMatchingClassification["Predicted Span length"] <(predictedSpan[1] - predictedSpan[0]): - longestMatchingSpanWithMatchingClassification["Predicted Span length"] = (predictedSpan[1] - predictedSpan[0]) - longestMatchingSpanWithMatchingClassification["Predicted Concept"] = classinPredSpan - longestMatchingSpanWithMatchingClassification["Predicted Span"] = predictedSpan - - #if there is an overlapping concept match report true positive - if longestMatchingSpanWithMatchingClassification['Predicted Span'] != None: - accountedFor[span] = True - measuresForClasses[classInRefSpan]["True Positives"] += 1 - confusion[labels[classInRefSpan]][labels[longestMatchingSpanWithMatchingClassification["Predicted Concept"]]] += 1 - predictedSpans[line].pop(longestMatchingSpanWithMatchingClassification["Predicted Span"]) - - - #find the false negatives for inexact spans - for span in referenceSpans[line]: - #already accounted for - if accountedFor[span] == True: - continue - - classInRefSpan = referenceSpans[line][span] - - longestMatchingSpan = {"Predicted Span":None, "Predicted Span length":None, "Predicted Concept":None} - for predictedSpan in predictedSpans[line]: - classinPredSpan = predictedSpans[line][predictedSpan] - - #FIND LONGEST OVERLAP - if predictedSpan[0] >= span[0] and predictedSpan[0] <= span[1] \ - or predictedSpan[1] >= span[0] and predictedSpan[1] <= span[1]: - - if longestMatchingSpan["Predicted Span length"] <(predictedSpan[1] - predictedSpan[0]): - longestMatchingSpan["Predicted Span length"] = (predictedSpan[1] - predictedSpan[0]) - longestMatchingSpan["Predicted Concept"] = classinPredSpan - longestMatchingSpan["Predicted Span"] = predictedSpan - - if longestMatchingSpan['Predicted Span'] != None: - measuresForClasses[classInRefSpan]["False Negatives"] += 1 - confusion[labels[classInRefSpan]][labels[longestMatchingSpan["Predicted Concept"]]] += 1 - 
predictedSpans[line].pop(longestMatchingSpan["Predicted Span"]) - else: - measuresForClasses[classInRefSpan]["False Negatives"] += 1 - confusion[labels[classInRefSpan]][labels['none']] += 1 - #predictedSpans[line].pop(span) - - - #for all the spans that are in predicted that are left. these are false positives - #as they do not occur in the reference spans. - leftover = deepcopy(predictedSpans) - for line in predictedSpans: - for span in predictedSpans[line]: - - classInPredSpan = predictedSpans[line][span] - - if True: - measuresForClasses[classInPredSpan]["False Positives"] += 1 - confusion[labels['none']][labels[classInPredSpan]] += 1 - leftover[line].pop(span) - - - #if false then do not report concepts - if reportSeperately == False: - truePositive = 0 - falseNegative = 0 - falsePositive = 0 - for dictKey in measuresForClasses: - truePositive += measuresForClasses[dictKey]["True Positives"] - falseNegative += measuresForClasses[dictKey]["False Negatives"] - falsePositive += measuresForClasses[dictKey]["False Positives"] - - return {"True Positives":truePositive, "False Negatives":falseNegative, "False Positives":falsePositive} - else: - return confusion - -def displayMatrix(out, name, confusion): - # Display the confusion matrix - print >>out, "" - print >>out, "" - print >>out, "" - print >>out, "================" - print >>out, name + " RESULTS" - print >>out, "================" - print >>out, "" - print >>out, "Confusion Matrix" - pad = max(len(l) for l in labels) + 6 - print >>out, "%s %s" % (' ' * pad, "\t".join(labels.keys())) - for act, act_v in labels.items(): - print >>out, "%s %s" % (act.rjust(pad), "\t".join([str(confusion[act_v][pre_v]) for pre, pre_v in labels.items()])) - print >>out, "" - - - # Compute the analysis stuff - precision = [] - recall = [] - specificity = [] - f1 = [] - - tp = 0 - fp = 0 - fn = 0 - tn = 0 - - print >>out, "Analysis" - print >>out, " " * pad, "Precision\tRecall\tF1" - - - for lab, lab_v in labels.items(): - if lab == 
'none': continue - - tp = confusion[lab_v][lab_v] - fp = sum(confusion[v][lab_v] for k, v in labels.items() if v != lab_v) - fn = sum(confusion[lab_v][v] for k, v in labels.items() if v != lab_v) - - precision += [float(tp) / (tp + fp + 1e-100)] - recall += [float(tp) / (tp + fn + 1e-100)] - f1 += [float(2 * tp) / (2 * tp + fp + fn + 1e-100)] - print >>out, "%s %.4f\t%.4f\t%.4f" % (lab.rjust(pad), precision[-1], recall[-1], f1[-1]) - - print >>out, "--------" - - precision = sum(precision) / len(precision) - recall = sum(recall) / len(recall) - f1 = sum(f1) / len(f1) - - print >>out, "Average: %.4f\t%.4f\t%.4f" % (precision, recall, f1) - - - -def generateResultsForExactSpans(truePositive, falseNegative, falsePositive): - - #convert to float implicitly incase of any truncation - truePositive = float(truePositive) - flaseNegative = float(falseNegative) - falsePositive = float(falsePositive) - - recall = truePositive / (truePositive + falseNegative) - precision = truePositive / (truePositive + falsePositive) - fScore = (2*truePositive) / (2*truePositive + falseNegative + falsePositive) - - #convert to percent - return {"Recall":(recall * 100), "Precision":(precision * 100), "F Score":(fScore * 100)} - - -def main(): - - parser = argparse.ArgumentParser() - - parser.add_argument("-t", - help = "Text files that were used to generate predictions", - dest = "txt", - default = os.path.join(os.getenv('CLICON_DIR'), 'data/test_data/*') - ) - - parser.add_argument("-c", - help = "The directory that contains predicted concept files organized into subdirectories for svm, lin, srf", - dest = "con", - default = os.path.join(os.getenv('CLICON_DIR'), 'data/test_predictions/') - ) - - parser.add_argument("-r", - help = "The directory that contains reference gold standard concept files", - dest = "ref", - default = os.path.join(os.getenv('CLICON_DIR'), 'data/reference_standard_for_test_data/concepts/') - ) - - parser.add_argument("-f", - dest = "format", - help = "Data format ( " + 
' | '.join(Note.supportedFormats()) + " )", - default = 'i2b2' - ) - - parser.add_argument("-o", - help = "Write the evaluation to a file rather than STDOUT", - dest = "output", - default = None - ) - - # Parse command line arguments - args = parser.parse_args() - format = args.format - - - # Is output destination specified? - if args.output: - args.output = open(args.output, "w") - else: - args.output = sys.stdout - - - # Must specify output format - if format not in Note.supportedFormats(): - print >>sys.stderr, '\n\tError: Must specify output format' - print >>sys.stderr, '\tAvailable formats: ', ' | '.join(Note.supportedFormats()) - print >>sys.stderr, '' - exit(1) - - - # List of medical text - txt_files = glob.glob(args.txt) - txt_files_map = helper.map_files(txt_files) - wildcard = '*.' + Note.dictOfFormatToExtensions()[format] - - - # List of gold data - ref_files = glob.glob( os.path.join(args.ref, wildcard) ) - ref_files_map = helper.map_files(ref_files) - - - # List of predictions - pred_files = glob.glob( os.path.join(args.con, wildcard) ) - pred_files_map = helper.map_files(pred_files) - - - # Grouping of text, predictions, gold - files = [] - for k in txt_files_map: - if k in pred_files_map and k in ref_files_map: - files.append((txt_files_map[k], pred_files_map[k], ref_files_map[k])) - - - # txt <- medical text - # annotations <- predictions - # gold <- gold standard - - - truePositivesExactSpan = 0 - falseNegativesExactSpan = 0 - falsePositivesExactSpan = 0 - - truePositivesInexactSpan = 0 - falseNegativesInexactSpan = 0 - falsePositivesInexactSpan = 0 - - confusion = [[0] * len(labels) for e in labels] - - confusionMatrixExactSpan = deepcopy(confusion) - confusionMatrixInexactSpan = deepcopy(confusion) - - - - for txt, annotations, gold in files: - - # Read predictions and gols standard data - cnote = Note(format) - rnote = Note(format) - cnote.read(txt, annotations) - rnote.read(txt, gold) - - referenceSpans = getConceptSpans(rnote.getIOBLabels(), 
rnote.conlist()) - predictedSpans = getConceptSpans(cnote.getIOBLabels(), cnote.conlist()) - - #TO DO: i need to generate a cumulative total accross all of the files - #modify my functions slightly and have it return the number of true positive and etc... - #then call generate results - - exactResults = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=True, reportSeperately=False) - - inexactResults = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=False) - - - truePositivesExactSpan += exactResults["True Positives"] - falseNegativesExactSpan += exactResults["False Negatives"] - falsePositivesExactSpan += exactResults["False Positives"] - - - inexactResults = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=False) - - truePositivesInexactSpan += inexactResults["True Positives"] - falseNegativesInexactSpan += inexactResults["False Negatives"] - falsePositivesInexactSpan += inexactResults["False Positives"] - - MatrixInexactSpan = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True) - - for sublist1, sublist2 in zip(confusionMatrixInexactSpan, MatrixInexactSpan): - for i,int2 in enumerate(sublist2): - sublist1[i] += int2 - - MatrixExactSpan = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=True, reportSeperately=True) - - for sublist1, sublist2 in zip(confusionMatrixExactSpan, MatrixExactSpan): - for i,int2 in enumerate(sublist2): - sublist1[i] += int2 - - print "\nResults for exact span for concepts together.\n" - - print "True Positives: ", truePositivesExactSpan - print "False Negatives: ", falseNegativesExactSpan - print "False Positives: ", falsePositivesExactSpan - - exactSpan = generateResultsForExactSpans(truePositivesExactSpan, - falseNegativesExactSpan, - falsePositivesExactSpan) - - print "Recall: ", exactSpan["Recall"] - print "Precision: ", exactSpan["Precision"] - 
print "F Measure: ", exactSpan["F Score"] - - inexactSpan = generateResultsForExactSpans(truePositivesInexactSpan, - falseNegativesInexactSpan, - falsePositivesInexactSpan) - - print "\nResults for inexact span for concepts together.\n" - - print "True Positives: ", truePositivesInexactSpan - print "False Negatives: ", falseNegativesInexactSpan - print "False Positives: ", falsePositivesInexactSpan - - print "Recall: ", inexactSpan["Recall"] - print "Precision: ", inexactSpan["Precision"] - print "F Measure: ", inexactSpan["F Score"] - - #TO DO: ENSURE NUMBER OF FP,FN,TP is equal to number of predicted spans - #TO DO: number of FP, FN, TP is not same between exact and inexact. - - #LEFT OFF HERE. FIX DISPLAY FUNCTION - - displayMatrix(args.output, 'Exact' , confusionMatrixExactSpan) - displayMatrix(args.output, 'Inexact', confusionMatrixInexactSpan) - - - #print evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True) - - return - -if __name__ == '__main__': - main() - diff --git a/clicon/features_dir/features.py b/clicon/features_dir/features.py deleted file mode 100644 index a78834d..0000000 --- a/clicon/features_dir/features.py +++ /dev/null @@ -1,78 +0,0 @@ -###################################################################### -# CliCon - clicon_features.py # -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Isolate the model's features from model.py # -###################################################################### - - -__author__ = 'Willie Boag' -__date__ = 'Jan. 
27, 2014' - - - -import nltk -import re - -from wordshape import getWordShapes -from utilities import prose_sentence - -from sentence_features import SentenceFeatures - - - -class FeatureWrapper: - - # FIXME - Make three objects - one for each classifier - - - # Instantiate an FeatureWrapper object - def __init__(self, data=None): - - # Sentence-level features - self.feat_sent = SentenceFeatures(data) - - - - # IOB_features() - # - # input: A sentence - # output: A hash table of features - def extract_IOB_features(self, sentence): - - # Different features depending on whether sentence is 'prose' - isProse = prose_sentence(sentence) - - if isProse: - features_list = self.feat_sent.IOB_prose_features(sentence) - else: - features_list = self.feat_sent.IOB_nonprose_features(sentence) - - # Return features as well as indication of whether it is prose or not - return (isProse, features_list) - - - - # concept_features() - # - # input: A sentence/line from a medical text file (list of chunks) - # An list of indices into the sentence for each important chunk - # output: A list of hash tables of features - def concept_features(self, sentence, chunk_inds): - - # FIXME - move all of this work to SentenceFeatures object - - ''' - # VERY basic feature set for sanity check tests during development - features_list = [] - for i,ind in enumerate(chunk_inds): - features = {('phrase',sentence[ind]) : 1} - features_list.append(features) - return features_list - ''' - - # Create a list of feature sets (one per chunk) - features_list = self.feat_sent.concept_features_for_sentence(sentence,chunk_inds) - return features_list - diff --git a/clicon/features_dir/sentence_features.py b/clicon/features_dir/sentence_features.py deleted file mode 100644 index 78cc4bd..0000000 --- a/clicon/features_dir/sentence_features.py +++ /dev/null @@ -1,376 +0,0 @@ -###################################################################### -# CliCon - sentence_features.py # -# # -# Willie Boag wboag@cs.uml.edu 
# -# # -# Purpose: Isolate the model's sentence-level features # -###################################################################### - - -__author__ = 'Willie Boag' -__date__ = 'Apr. 27, 2014' - - - -import nltk -import re -from wordshape import getWordShapes - - - -# What modules are available -from read_config import enabled_modules - - - - -# Import feature modules -enabled = enabled_modules() -if enabled['GENIA']: - from genia_dir.genia_features import GeniaFeatures - -if enabled['UMLS']: - from umls_dir.umls_features import UMLSFeatures - -from word_features import WordFeatures - - - - -class SentenceFeatures: - - - # Feature Enabling - - enabled_concept_features = frozenset( ["UMLS"]) - - - - # Instantiate an Sentence object - def __init__(self, data): - - # Word-level features module - self.feat_word = WordFeatures() - - # Only run GENIA tagger if module is available - if data and enabled['GENIA']: - tagger = enabled['GENIA'] - self.feat_genia = GeniaFeatures(tagger,data) - - # Only create UMLS cache if module is available - if enabled['UMLS']: - self.feat_umls = UMLSFeatures() - - - self.enabled_IOB_nonprose_sentence_features = [] - #self.enabled_IOB_nonprose_sentence_features.append('pos') - #self.enabled_IOB_nonprose_sentence_features.append('pos_context') - self.enabled_IOB_nonprose_sentence_features.append('prev') - self.enabled_IOB_nonprose_sentence_features.append('next') - self.enabled_IOB_nonprose_sentence_features.append('unigram_context') - self.enabled_IOB_nonprose_sentence_features.append('UMLS') - - self.enabled_IOB_prose_sentence_features = [] - self.enabled_IOB_prose_sentence_features.append('unigram_context') - self.enabled_IOB_prose_sentence_features.append('pos') - self.enabled_IOB_prose_sentence_features.append('pos_context') - self.enabled_IOB_prose_sentence_features.append('prev') - self.enabled_IOB_prose_sentence_features.append('prev2') - self.enabled_IOB_prose_sentence_features.append('next') - 
self.enabled_IOB_prose_sentence_features.append('next2') - self.enabled_IOB_prose_sentence_features.append('GENIA') - self.enabled_IOB_prose_sentence_features.append('UMLS') - - - - - # IOB_prose_features() - # - # input: A sentence - # output: A list of hash tables of features - def IOB_prose_features(self, sentence): - - features_list = [] - - # Get a feature set for each word in the sentence - for i,word in enumerate(sentence): - features_list.append(self.feat_word.IOB_prose_features(sentence[i])) - - - # Feature: Bag of Words unigram conext (window=3) - if 'unigram_context' in self.enabled_IOB_prose_sentence_features: - window = 3 - n = len(sentence) - - # Previous unigrams - for i in range(n): - end = min(i, window) - unigrams = sentence[i-end:i] - for j,u in enumerate(unigrams): - features_list[i][('prev_unigrams-%d'%j,u)] = 1 - - # Next unigrams - for i in range(n): - end = min(i + window, n-1) - unigrams = sentence[i+1:end+1] - for j,u in enumerate(unigrams): - features_list[i][('next_unigrams-%d'%j,u)] = 1 - - - # Only POS tag once - if 'pos' in self.enabled_IOB_prose_sentence_features: - pos_tagged = nltk.pos_tag(sentence) - - - # Allow for particular features to be enabled - for feature in self.enabled_IOB_prose_sentence_features: - - - # Feature: Part of Speech - if feature == 'pos': - for (i,(_,pos)) in enumerate(pos_tagged): - features_list[i].update( { ('pos',pos) : 1} ) - - - # Feature: POS context - if 'pos_context' in self.enabled_IOB_prose_sentence_features: - window = 3 - n = len(sentence) - - # Previous POS - for i in range(n): - end = min(i, window) - for j,p in enumerate(pos_tagged[i-end:i]): - pos = p[1] - features_list[i][('prev_pos_context-%d'%j,pos)] = 1 - - # Next POS - for i in range(n): - end = min(i + window, n-1) - for j,p in enumerate(pos_tagged[i+1:i+end+1]): - pos = p[1] - features_list[i][('prev_pos_context-%d'%j,pos)] = 1 - - - # GENIA features - if (feature == 'GENIA') and enabled['GENIA']: - - # Get GENIA features - 
genia_feat_list = self.feat_genia.features(sentence) - - ''' - print '\t', sentence - - print '\n\n' - for gf in genia_feat_list: - print '\t', gf - print - print '\n\n' - ''' - - for i,feat_dict in enumerate(genia_feat_list): - features_list[i].update(feat_dict) - - - # Feature: UMLS Word Features (only use prose ones) - if (feature == "UMLS") and enabled['UMLS']: - umls_features = self.feat_umls.IOB_prose_features(sentence) - for i in range(len(sentence)): - features_list[i].update( umls_features[i] ) - - - # Used for 'prev' and 'next' features - ngram_features = [{} for i in range(len(features_list))] - if "prev" in self.enabled_IOB_prose_sentence_features: - prev = lambda f: {("prev_"+k[0], k[1]): v for k,v in f.items()} - prev_list = map(prev, features_list) - for i in range(len(features_list)): - if i == 0: - ngram_features[i][("prev", "*")] = 1 - else: - ngram_features[i].update(prev_list[i-1]) - - if "prev2" in self.enabled_IOB_prose_sentence_features: - prev2 = lambda f: {("prev2_"+k[0], k[1]): v/2.0 for k,v in f.items()} - prev_list = map(prev2, features_list) - for i in range(len(features_list)): - if i == 0: - ngram_features[i][("prev2", "*")] = 1 - elif i == 1: - ngram_features[i][("prev2", "*")] = 1 - else: - ngram_features[i].update(prev_list[i-2]) - - if "next" in self.enabled_IOB_prose_sentence_features: - next = lambda f: {("next_"+k[0], k[1]): v for k,v in f.items()} - next_list = map(next, features_list) - for i in range(len(features_list)): - if i < len(features_list) - 1: - ngram_features[i].update(next_list[i+1]) - else: - ngram_features[i][("next", "*")] = 1 - - if "next2" in self.enabled_IOB_prose_sentence_features: - next2 = lambda f: {("next2_"+k[0], k[1]): v/2.0 for k,v in f.items()} - next_list = map(next2, features_list) - for i in range(len(features_list)): - if i < len(features_list) - 2: - ngram_features[i].update(next_list[i+2]) - elif i == len(features_list) - 2: - ngram_features[i][("next2", "**")] = 1 - else: - 
ngram_features[i][("next2", "*")] = 1 - - merged = lambda d1, d2: dict(d1.items() + d2.items()) - features_list = [merged(features_list[i], ngram_features[i]) - for i in range(len(features_list))] - - - ''' - for f in features_list: - print sorted(f.items()) - print - print '\n\n\n' - ''' - - return features_list - - - - # IOB_nonprose_features() - # - # input: A sentence - # output: A hash table of features - def IOB_nonprose_features(self, sentence): - - - # Get a feature set for each word in the sentence - features_list = [] - for i,word in enumerate(sentence): - word_feats = self.feat_word.IOB_nonprose_features(sentence[i]) - features_list.append( word_feats ) - - - # Feature: Bag of Words unigram conext (window=3) - if 'unigram_context' in self.enabled_IOB_nonprose_sentence_features: - window = 3 - n = len(sentence) - - # Previous unigrams - for i in range(n): - end = min(i, window) - unigrams = sentence[i-end:i] - for j,u in enumerate(unigrams): - features_list[i][('prev_unigrams-%d'%j,u)] = 1 - - # Next unigrams - for i in range(n): - end = min(i + window, n-1) - unigrams = sentence[i+1:end+1] - for u in unigrams: - features_list[i][('next_unigrams-%d'%j,u)] = 1 - - - # Feature: UMLS Word Features (only use nonprose ones) - if enabled['UMLS'] and 'UMLS' in self.enabled_IOB_nonprose_sentence_features: - umls_features = self.feat_umls.IOB_nonprose_features(sentence) - for i in range(len(sentence)): - features_list[i].update( umls_features[i] ) - - - #return features_list - - if 'pos' in self.enabled_IOB_nonprose_sentence_features: - pos_tagged = nltk.pos_tag(sentence) - - - # Allow for particular features to be enabled - for feature in self.enabled_IOB_nonprose_sentence_features: - - # Feature: Part of Speech - if feature == 'pos': - for (i,(_,pos)) in enumerate(pos_tagged): - features_list[i][ ('pos',pos) ] = 1 - - - # Feature: POS context - if 'pos_context' in self.enabled_IOB_nonprose_sentence_features: - window = 3 - n = len(sentence) - - # Previous POS - 
for i in range(n): - end = min(i, window) - for j,p in enumerate(pos_tagged[i-end:i]): - pos = p[1] - features_list[i][('prev_pos_context-%d'%j,pos)] = 1 - - # Next POS - for i in range(n): - end = min(i + window, n-1) - for j,p in enumerate(pos_tagged[i+1:i+end+1]): - pos = p[1] - features_list[i][('prev_pos_context-%d'%j,pos)] = 1 - - - - ngram_features = [{} for _ in range(len(features_list))] - if "prev" in self.enabled_IOB_nonprose_sentence_features: - prev = lambda f: {("prev_"+k[0], k[1]): v for k,v in f.items()} - prev_list = map(prev, features_list) - for i in range(len(features_list)): - if i == 0: - ngram_features[i][("prev", "*")] = 1 - else: - ngram_features[i].update(prev_list[i-1]) - - if "next" in self.enabled_IOB_nonprose_sentence_features: - next = lambda f: {("next_"+k[0], k[1]): v for k,v in f.items()} - next_list = map(next, features_list) - for i in range(len(features_list)): - if i == len(features_list) - 1: - ngram_features[i][("next", "*")] = 1 - else: - ngram_features[i].update(next_list[i+1]) - - - merged = lambda d1, d2: dict(d1.items() + d2.items()) - features_list = [merged(features_list[i], ngram_features[i]) - for i in range(len(features_list))] - - - return features_list - - - - - def concept_features_for_sentence(self, sentence, chunk_inds): - - """ - concept_features() - - @param sentence. A sentence in list of chunk format - @param chunk_inds. 
A list of indices for non-None-labeled chunks - @return A list of feature dictionaries - """ - - - # Get a feature set for each word in the sentence - features_list = [] - for ind in chunk_inds: - features_list.append( self.feat_word.concept_features_for_chunk(sentence,ind) ) - - - # Allow for particular features to be enabled - for feature in self.enabled_concept_features: - - # Features: UMLS features - if (feature == "UMLS") and enabled['UMLS']: - umls_features = self.feat_umls.concept_features_for_chunks(sentence, chunk_inds) - for i in range(len(chunk_inds)): - features_list[i].update( umls_features[i] ) - - - return features_list - - diff --git a/clicon/features_dir/umls_dir/create_sqliteDB.py b/clicon/features_dir/umls_dir/create_sqliteDB.py deleted file mode 100644 index 014b91c..0000000 --- a/clicon/features_dir/umls_dir/create_sqliteDB.py +++ /dev/null @@ -1,112 +0,0 @@ -#database.py creates a .db file for performing umls searches. -import sqlite3 -import os -import sys -import os - -def create_db(): - - print "\ncreating umls.db" - #connect to the .db file we are creating. - db_path = os.path.join(os.environ['CLICON_DIR'],'umls_tables/umls.db') - conn = sqlite3.connect( db_path ) - conn.text_factory = str - - print "opening files" - #load data in files. 
- try: - mrsty_path = os.path.join(os.environ['CLICON_DIR'],'umls_tables/MRSTY') - MRSTY_TABLE = open( mrsty_path, "r" ) - except IOError: - print "\nNo file to use for creating MRSTY table\n" - conn.close() - sys.exit() - - try: - mrcon_path = os.path.join(os.environ['CLICON_DIR'],'umls_tables/MRCON') - MRCON_TABLE = open( mrcon_path , "r" ) - except IOError: - print "\nNo file to use for creating MRCON table\n" - conn.close() - sys.exit() - - try: - mrrel_path = os.path.join(os.environ['CLICON_DIR'],'umls_tables/MRREL') - MRREL_TABLE = open( mrrel_path , "r" ) - except IOError: - print "\nNo file to use for creating MRREL table\n" - conn.close() - sys.exit() - - print "reading files" - - MRSTY_TABLE = MRSTY_TABLE.read() - MRSTY_TABLE = MRSTY_TABLE.split('\n') - - MRCON_TABLE = MRCON_TABLE.read() - MRCON_TABLE = MRCON_TABLE.split( '\n' ) - - MRREL_TABLE = MRREL_TABLE.read() - MRREL_TABLE = MRREL_TABLE.split( '\n' ) - - #data that will be inserted into tables. - MRTSY_DATA = [] - MRCON_DATA = [] - MRREL_DATA = [] - - c = conn.cursor() - - print "parsing files" - - #parse and store the data from the files. - for line in MRSTY_TABLE: - MRTSY_DATA.append( tuple(line.split('|')) ) - for line in MRCON_TABLE: - MRCON_DATA.append( tuple(line.split('|')) ) - for line in MRREL_TABLE: - MRREL_DATA.append( tuple(line.split('|')) ) - - print "creating tables" - - #create tables. - c.execute( "CREATE TABLE MRCON( CUI, LAT, TS, LUI, STT, SUI, STR, LRL, EMPTY ) ;" ) - c.execute( "CREATE TABLE MRSTY( CUI, TUI, STY, EMPTY ) ;" ) - c.execute( "CREATE TABLE MRREL( CUI1, REL, CUI2, RELA, SAB, SL, MG, EMPTY ) ;" ) - - print "inserting data" - - #insert data onto database - for line in MRCON_DATA: - try: - c.execute( "INSERT INTO MRCON( CUI, LAT, TS, LUI, STT, SUI, STR, LRL, EMPTY ) values ( ?, ?, ? 
,?, ?,?,?,?,?);", line ) - except sqlite3.ProgrammingError: - continue - for line in MRTSY_DATA: - try: - c.execute( "INSERT INTO MRSTY( CUI, TUI, STY, EMPTY) values( ?, ?, ?, ?)" , line ) - except sqlite3.ProgrammingError: - continue - for line in MRREL_DATA: - try: - c.execute( "INSERT INTO MRREL( CUI1, REL, CUI2, RELA, SAB, SL, MG, EMPTY ) values( ?, ?, ?, ?,?, ? ,? ,? )" , line ) - except sqlite3.ProgrammingError: - continue - - print "creating indices" - - #create indices for faster queries - c.execute( "CREATE INDEX mrsty_cui_map ON MRSTY(CUI)") - c.execute( "CREATE INDEX mrcon_str_map ON MRCON(STR)") - c.execute( "CREATE INDEX mrcon_cui_map ON MRCON(CUI)") - c.execute( "CREATE INDEX mrrel_cui2_map ON MRREL( CUI2 )" ) - c.execute( "CREATE INDEX mrrel_cui1_map on MRREL( CUI1 ) " ) - c.execute( "CREATE INDEX mrrel_rel_map on MRREL( REL )" ) - - #save changes to .db - conn.commit() - - print "\nsqlite database created" - - #close connection - conn.close() - diff --git a/clicon/features_dir/umls_dir/create_trie.py b/clicon/features_dir/umls_dir/create_trie.py deleted file mode 100644 index 51b7ba3..0000000 --- a/clicon/features_dir/umls_dir/create_trie.py +++ /dev/null @@ -1,87 +0,0 @@ -#database.py creates a .db file for performing umls searches. -import marisa_trie -import sys -import os -import cPickle as pickle - - - - -def create_trie(): - - """ - create_trie() - - Purpose: Build a trie of concepts from MRREL - - @return A trie object - """ - - # Is trie already built & pickled? - prefix = os.environ['CLICON_DIR'] - filename = os.path.join( prefix, 'umls_tables/umls-concept.trie' ) - try: - t = pickle.load( open( filename , "rb" ) ) ; - return t - except IOError: - pass - - - print "\ncreating concept-trie" - - #load data in files. 
- print "opening file" - try: - mrcon_path = os.path.join(os.environ['CLICON_DIR'],'umls_tables/MRCON') - MRCON_TABLE = open( mrcon_path , "r" ) - except IOError: - print "\nNo file to use for creating MRCON table\n" - conn.close() - sys.exit() - - - print "reading file" - MRCON_TABLE = MRCON_TABLE.read() - MRCON_TABLE = MRCON_TABLE.split( '\n' ) - - #data that will be inserted into tables. - MRCON_DATA = [] - - print "parsing file" - - #parse and store the data from the files. - for line in MRCON_TABLE: - MRCON_DATA.append( tuple(line.split('|')) ) - - #insert data onto database - print "inserting data" - concepts = [] - for line in MRCON_DATA: - if len(line) < 6: continue - - concept = line[6] - - # Ignore non-ascii - try: - concept.decode('ascii') - except: - continue - - #print type(concept) - concepts.append(concept) - - - print "creating trie" - t = marisa_trie.Trie(concepts) - - print "concept-trie created" - - - # Pickle trie - pickle.dump( t, open( filename, "wb" ) ) - - return t - - -if __name__ == '__main__': - t = create_trie() diff --git a/clicon/features_dir/umls_dir/interface_umls.py b/clicon/features_dir/umls_dir/interface_umls.py deleted file mode 100644 index 30e000f..0000000 --- a/clicon/features_dir/umls_dir/interface_umls.py +++ /dev/null @@ -1,80 +0,0 @@ -# -# Interface to UMLS Databases and concept trie -# -# -# - - -import copy -import sqlite3 -import create_sqliteDB -import os - -import create_trie - - - - -############################################ -### Setups / Handshakes ### -############################################ - - -#connect to UMLS database -def SQLConnect(): - #try to connect to the sqlite database. - db_path = os.path.join( os.environ['CLICON_DIR'], "umls_tables/umls.db") - if( os.path.isfile( db_path ) ): - print "\ndb exists" - else: - # Database does not exit. Make one. 
- print "\ndb doesn't exist" - create_sqliteDB.create_db() - - db = sqlite3.connect( db_path ) - return db.cursor() - - - - -############################################ -### Global reource connections ### -############################################ - - -# Global database connection -c = SQLConnect() - -# Global trie -trie = create_trie.create_trie() - - - - -############################################ -### Query Operations ### -############################################ - - -def string_lookup( string ): - """ Get sty for a given string """ - try: - c.execute( "SELECT sty FROM MRCON a, MRSTY b WHERE a.cui = b.cui AND str = ?; " , (string,) ) - return c.fetchall() - except sqlite3.ProgrammingError, e: - return [] - - -def cui_lookup( string ): - """ get cui for a given string """ - try: - # Get cuis - c.execute( "SELECT cui FROM MRCON WHERE str = ?;" , (string,) ) - return c.fetchall() - except sqlite3.ProgrammingError, e: - return [] - - -def concept_exists(string): - """ Fast query for set membership in trie """ - return string in trie diff --git a/clicon/features_dir/umls_dir/interpret_umls.py b/clicon/features_dir/umls_dir/interpret_umls.py deleted file mode 100644 index e120849..0000000 --- a/clicon/features_dir/umls_dir/interpret_umls.py +++ /dev/null @@ -1,163 +0,0 @@ -import cPickle as pickle -import interface_umls - - -def umls_semantic_type_word( umls_string_cache , sentence ): - # Already cached? - if False and umls_string_cache.has_key( sentence ): - mapping = umls_string_cache.get_map( sentence ) - else: - concepts = interface_umls.string_lookup( sentence ) - concepts = [ singleton[0] for singleton in set(concepts) ] - umls_string_cache.add_map(sentence , concepts) - mapping = umls_string_cache.get_map(sentence) - - return mapping - - -def umls_semantic_context_of_words( umls_string_cache, sentence ): - - #Defines the largest string span for the sentence. 
- WINDOW_SIZE = 7 - - # span of the umls concept of the largest substring - umls_context_list = [] - - # keys: tuple of (start,end) index of a substring - concept_span_dict = {} - - # Each sublist functions as the mappings for each word. - for i in sentence: - umls_context_list.append( [] ) - - # finds the span for each substring of length 1 to currentWindowSize. - for currentWindowSize in range( 1 , WINDOW_SIZE ): - for ti in range( 0 , ( len(sentence) - currentWindowSize ) + 1 ): - rawstring = "" - for tj in range( ti , ti + currentWindowSize): - rawstring += ( sentence[tj] + " " ) - - #Each string is of length 1 to currentWindowSize. - rawstring = rawstring.strip() - - # Not in cache yet? - if not( umls_string_cache.has_key( rawstring ) ): - # returns a tuple if there is a result or None is there is not. - concept = interface_umls.string_lookup( rawstring ) - - if not concept: - umls_string_cache.add_map( rawstring, None ) - else: - umls_string_cache.add_map( rawstring, concept ) ; - - #Store the concept into concept_span_dict with its span as a key. 
- concept_span_dict[(ti,ti+currentWindowSize-1)] = umls_string_cache.get_map( rawstring ) - - # For each substring if there is a span, then - # assign the concept to every word that is within in the substring - if umls_string_cache.get_map(rawstring): - for i in range( ti , ti + currentWindowSize ): - if len( umls_context_list[i] ) == 0: - umls_context_list[i].append([ti,ti+currentWindowSize-1]) - - else: - updated = 0 - for j in umls_context_list[i]: - if j[0] >= ti and j[1] <= (ti+currentWindowSize-1): - j[0] = ti - j[1] = ( ti + currentWindowSize - 1 ) - updated += 1 - if not(updated): - val = [ti,ti+currentWindowSize-1] - if umls_context_list[i].count(val)== 0: - umls_context_list[i].append(val) - - - - #create a list of sublists - # each sublist represents the contexts for which the word appears - mappings = [] - for i in umls_context_list: - spans = i - if len(spans) == 0: - mappings.append( None ) - else: - sub_mappings = [] - for j in spans: - sub_mappings.append( concept_span_dict[tuple(j)]) - - # FIXME - Decided to concat rather than append (not sure why) - mappings += sub_mappings - - return mappings - - -def umls_semantic_type_sentence( cache , sentence ): - - #Defines the largest string span for the sentence. - WINDOW_SIZE = 7 - - longestSpanLength = 0 - longestSpans = [] # List of (start,end) tokens - - for i in range(len(sentence)): - maxVal = min(i+WINDOW_SIZE, len(sentence)) - for j in range(i,maxVal): - # Lookup key - span = sentence[i:j+1] - rawstring = unicode(' '.join(span)) - - # string does have an associated UMLS concept? - if interface_umls.concept_exists(rawstring): - if len(span) == longestSpanLength: - longestSpans.append( (i,j) ) - # new longest span size - elif len(span) > longestSpanLength: - longestSpans = [ (i,j) ] - longestSpanLength = len(span) - - # lookup UMLS concept for a given (start,end) span - def span2concept(span): - rawstring = ' '.join(sentence[span[0]:span[1]+1]) - - # Already cached? 
- if cache.has_key( rawstring ): - return cache.get_map( rawstring ) - - else: - concept = interface_umls.string_lookup( rawstring ) - - if concept: - cache.add_map( rawstring , concept ) - else: - cache.add_map( rawstring , [] ) - - return cache.get_map( rawstring ) - - mappings = [ span2concept(span) for span in longestSpans ] - return mappings - - - -# Get the semantic types for a given word -def get_cui( cache , word ): - - # If already in cache - if cache.has_key( word + '--cuis' ): - - cuis = cache.get_map( word + '--cuis' ) - - else: - - # Get cui - cuis = interface_umls.cui_lookup(word) - - # Eliminate duplicates - cuis = list(set(cuis)) - cuis = [c[0] for c in cuis] - - # Store result in cache - cache.add_map( word + '--cuis', cuis ) - - return cuis - diff --git a/clicon/features_dir/umls_dir/umls_cache.py b/clicon/features_dir/umls_dir/umls_cache.py deleted file mode 100644 index e84e253..0000000 --- a/clicon/features_dir/umls_dir/umls_cache.py +++ /dev/null @@ -1,23 +0,0 @@ -import cPickle as pickle -import os - -class UmlsCache: - def __init__(self): - try: - prefix = os.environ['CLICON_DIR'] - self.filename = os.path.join( prefix, 'umls_tables/umls_cache' ) - self.cache = pickle.load( open( self.filename , "rb" ) ) ; - except IOError: - self.cache = {} - - def has_key( self , string ): - return self.cache.has_key( string ) - - def add_map( self , string, mapping ): - self.cache[string] = mapping - - def get_map( self , string ): - return self.cache[string] - - def __del__(self): - pickle.dump( self.cache, open( self.filename, "wb" ) ) diff --git a/clicon/features_dir/umls_dir/umls_features.py b/clicon/features_dir/umls_dir/umls_features.py deleted file mode 100644 index 8166338..0000000 --- a/clicon/features_dir/umls_dir/umls_features.py +++ /dev/null @@ -1,184 +0,0 @@ -###################################################################### -# CliCon - umls_features.py # -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Independent UMLS module # 
-###################################################################### - - - -from umls_cache import UmlsCache -import interpret_umls - - - -class UMLSFeatures: - - - def __init__(self): - - """ - UMLSFeatures::Constructor - """ - - # cache for the mappings of all umls lookups made - self.umls_lookup_cache = UmlsCache() - - - - def IOB_prose_features(self, sentence): - - """ - UMLSFeatures::IOB_prose_features() - - @ param sentence. A list of words - @return dictionary of features - """ - - features_list = [] - - for word in sentence: - features_list.append( self.features_for_word(word) ) - - return features_list - - - - def IOB_nonprose_features(self, sentence): - - """ - UMLSFeatures::IOB_nonprose_features() - - @ param sentence. A list of words - @return dictionary of features - """ - - features_list = [] - - for word in sentence: - features_list.append( self.features_for_word(word) ) - - - # TODO - Add umls.umls_semantic_type_sentence() to first pass feature set - ''' - # Feature: UMLS semantic type for the sentence - # a list of the uml semantic of the largest substring(s). - sentence_mapping = umls.umls_semantic_type_sentence( self.umls_lookup_cache, sentence ) - - # if there are no mappings - if not sentence_mapping: - features[('umls_semantic_type_sentence', None ) ] = 1 - # assign the umls definitions to the vector for each word - else: - for concept in sentence_mapping: - if concept: - for mapping in concept: - features[('umls_semantic_type_sentence' , mapping[0] ) ] = 1 - ''' - - - return features_list - - - - - def features_for_word(self, word): - - """ - UMLSFeatures::features_for_word() - - @ param word. 
word to lookup in UMLS database - @return dictionary of word-level features - """ - - - # Return value is a list of dictionaries (of features) - features = {} - - #print '\n' - #print word - - # Feature: UMLS Semantic Types - cuis = interpret_umls.get_cui(self.umls_lookup_cache , word) - - # Add each CUI - if cuis: - for cui in cuis: - features[('umls_cui',cui)] = 1 - #print '\tcui: ', cui - #print - - - # Feature: UMLS Semantic Type (for each word) - mapping = interpret_umls.umls_semantic_type_word(self.umls_lookup_cache , word ) - - # Add each semantic type - if mapping: - for concept in mapping: - features[('umls_semantic_type_word', concept )] = 1 - #print '\t', 'semantic_type_word: ', concept - #print - - return features - - - - def concept_features_for_chunk(self, sentence, ind): - - """ - UMLSFeatures::concept_features_for_sentence() - - @ param sentence. list of words from line (after flattening chunks) - @return dictionary of chunk-level features - """ - - #print '\n\n\n' - #print 'concept_features_for_chunk' - #print sentence - #print ind - - # Return value is a list of dictionaries (of features) - features = {} - - # UMLS features for each words - for word in sentence[ind].split(): - word_feats = self.features_for_word(word) - features.update(word_feats) - - - # Feature: UMLS semantic type for the sentence - # a list of the uml semantic of the largest substring(s). 
- sentence_mapping = interpret_umls.umls_semantic_type_sentence( self.umls_lookup_cache, sentence ) - - # if there are no mappings - if not sentence_mapping: - features[('umls_semantic_type_sentence', None ) ] = 1 - # assign the umls definitions to the vector for each word - else: - for concept in sentence_mapping: - for mapping in concept: - features[('umls_semantic_type_sentence' , mapping[0] ) ] = 1 - - - # Feature: UMLS semantic context - - # the umls definition of the largest string the word is in - umls_semantic_context_mappings = interpret_umls.umls_semantic_context_of_words( self.umls_lookup_cache , sentence ) - - # there could be multiple contexts, iterate through the sublist - for mapping in umls_semantic_context_mappings: - if not mapping: continue - for concept in mapping: - features[('umls_semantic_context',concept)] = 1 - - return features - - - - def concept_features_for_chunks(self, sentence, inds): - retVal = [] - for ind in inds: - retVal.append( self.concept_features_for_chunk(sentence, ind) ) - return retVal - diff --git a/clicon/features_dir/utilities.py b/clicon/features_dir/utilities.py deleted file mode 100644 index 2041f71..0000000 --- a/clicon/features_dir/utilities.py +++ /dev/null @@ -1,72 +0,0 @@ -###################################################################### -# CliCon - utilities.py # -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Miscellaneous tools for handling data. # -###################################################################### - - -import re - - -# prose_sentence() -# -# input: A sentence -# output: Boolean yes/no -def prose_sentence(sentence): - - # Empty sentence is not prose - if not sentence: - return False - - if sentence[-1] == '.' 
or sentence[-1] == '?': - return True - elif sentence[-1] == ':': - return False - elif len(sentence) <= 5: - return False - elif at_least_half_nonprose(sentence): - return True - else: - return False - - - -# at_least_half_nonprose() -# -# input: A sentence -# output: A bollean yes/no -def at_least_half_nonprose(sentence): - - count = len( [ w for w in sentence if prose_word(w) ] ) - - if count >= len(sentence)/2: - return True - else: - return False - - - -# prose_word() -# -# input: A word -# output: Boolean yes/no -def prose_word(word): - - # Punctuation - for punc in ".?,!:\"'": - if punc in word: - return False - - # Digit - if re.match('\d', word): - return False - - # All uppercase - if word == word.upper(): - return False - - # Else - return True - diff --git a/clicon/features_dir/word_features.py b/clicon/features_dir/word_features.py deleted file mode 100644 index 6b2440e..0000000 --- a/clicon/features_dir/word_features.py +++ /dev/null @@ -1,411 +0,0 @@ -###################################################################### -# CliCon - word_features.py # -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Isolate all word-level features into a single file # -###################################################################### - - - - -__author__ = 'Willie Boag' -__date__ = 'Apr 27, 2014' - - - -import nltk -import re -from wordshape import getWordShapes - - - - -class WordFeatures: - - enabled_IOB_prose_word_features = frozenset( ['Generic#', 'last_two_letters', 'word', 'length', 'mitre', 'stem_porter', 'stem_lancaster', 'word_shape', 'metric_unit' ] ) - - enabled_IOB_nonprose_word_features = frozenset( ['word', 'word_shape', 'mitre', 'QANN' ] ) - - #enabled_concept_features = frozenset( ['word', 'prefix', 'stem_porter', 'stem_lancaster', 'previous_word_stem', 'next_word_stem'] ) - enabled_concept_features = frozenset( ['word', 'prefix', 'stem_porter', 'stem_lancaster', 'previous_word_stem', 'next_word_stem', 'word_shape', 'metric_unit', 'mitre', 
'directive', 'date'] ) - - - def __init__(self): - pass - - - # IOB_prose_features_for_word() - # - # input: A single word - # output: A dictionary of features - def IOB_prose_features(self, word): - - # Feature: - features = {('dummy', None): 1} # always have >0 dimensions - - # Allow for particular features to be enabled - for feature in self.enabled_IOB_prose_word_features: - - if feature == "word": - features[(feature, word.lower())] = 1 - - if feature == "stem_lancaster": - st = nltk.stem.LancasterStemmer() - features[ (feature, st.stem(word.lower())) ] = 1 - - # Feature: Generic# stemmed word - if feature == 'Generic#': - generic = re.sub('[0-9]','0',word) - features[ ('Generic#',generic) ] = 1 - - # Feature: Last two leters of word - if feature == 'last_two_letters': - features[ ('last_two_letters',word[-2:]) ] = 1 - - - if feature == "length": - features[(feature, None)] = len(word) - - if feature == "stem_porter": - st = nltk.stem.PorterStemmer() - features[(feature, st.stem(word))] = 1 - - - if feature == "mitre": - for f in self.mitre_features: - if re.search(self.mitre_features[f], word): - features[(feature, f)] = 1 - - if feature == "word_shape": - wordShapes = getWordShapes(word) - for shape in wordShapes: - features[(feature, shape)] = 1 - - - return features - - - - - - # IOB_nonprose_features_for_word() - # - # input: A single word - # output: A dictionary of features - def IOB_nonprose_features(self, word): - - features = {} - - # Feature: The word, itself - features[('word', word.lower())] = 1 - - # Allow for particular features to be enabled - for feature in self.enabled_IOB_nonprose_word_features: - - # Feature: Mitre - if feature == "mitre": - for f in self.mitre_features: - if re.search(self.mitre_features[f], word): - features[('mitre', f)] = 1 - - # Feature: Word Shape - if feature == "word_shape": - wordShapes = getWordShapes(word) - for shape in wordShapes: - features[('word_shape', shape)] = 1 - - # Feature: QANN features - if feature 
== 'QANN': - qann_feats = self.QANN_features(word) - features.update(qann_feats) - - return features - - - - - def concept_features_for_word(self, word): - - """ - concept_features_for_word() - - @param word. A word to generate features for - @return A dictionary of features - """ - - features = {} - - # Allow for particular features to be enabled - for feature in self.enabled_concept_features: - - # Feature: Uncased Word - if feature == "word": - features[ ("word",word.lower()) ] = 1 - - - ''' - # Feature: Porter Stem - if feature == "stem_porter": - st = nltk.stem.PorterStemmer() - features[ ("stem_poter", st.stem(word)) ] = 1 - - # Feature: Lancaster Stem - if feature == "stem_lancaster": - st = nltk.stem.LancasterStemmer() - features[ ("stem_lancaster", st.stem(word)) ] = 1 - ''' - - ''' - # Feature: First Four Letters - if feature == "prefix": - prefix = word[:4].lower() - features[ ("prefix",prefix) ] = 1 - ''' - - ''' - # Use: None - # Feature: Length - if feature == "length": - features[ ("length",None) ] = len(word) - ''' - - # Feature: Metric Unit - if feature == "metric_unit": - unit = None - if self.is_weight(word): - unit = 'weight' - elif self.is_size(word): - unit = 'size' - elif self.is_volume(word): - unit = 'volume' - features[('metric_unit',unit)] = 1 - - ''' - # Feature: Date - if feature == 'date': - if self.is_date(word): - features[('date',None)] = 1 - - # Feature: Directive - if feature == 'directive': - if self.is_directive(word): - features[('directive',None)] = 1 - - # Feature: Mitre - if feature == "mitre": - for f in self.mitre_features: - if re.search(self.mitre_features[f], word): - features[('mitre', f)] = 1 - - # Feature: Word Shape - if feature == "word_shape": - wordShapes = getWordShapes(word) - for shape in wordShapes: - features[('word_shape', shape)] = 1 - ''' - - - - return features - - - - - def concept_features_for_chunk(self, sentence, ind): - - """ - concept_features_for_chunk() - - @param word. 
A chunk from the sentence - @return A dictionary of features - """ - - features = {'dummy':1} - - # Word-level features for each word of the chunk - for w in sentence[ind].split(): - word_features = self.concept_features_for_word(w) - features.update(word_features) - - return features - - # Stemmer - st = nltk.stem.PorterStemmer() - - - # Context windows - for feature in self.enabled_concept_features: - - # Feature: Previous word - if feature == "previous_word_stem": - if ind != 0: - prev_ind = ind - 1 - prev_chunk = sentence[prev_ind].split() - prev_word = st.stem( prev_chunk[-1] ) - features[ ('prev_word_stem',prev_word) ] = 1 - else: - features[ ('prev_word_stem','') ] = 1 - - # Feature: Previous word - if feature == "next_word_stem": - if ind != len(sentence)-1: - next_ind = ind + 1 - next_chunk = sentence[next_ind].split() - next_word = st.stem( next_chunk[0] ) - features[ ('next_word_stem',next_word) ] = 1 - else: - features[ ('next_word_stem','') ] = 1 - - - return features - - - - - mitre_features = { - "INITCAP": r"^[A-Z].*$", - "ALLCAPS": r"^[A-Z]+$", - "CAPSMIX": r"^[A-Za-z]+$", - "HASDIGIT": r"^.*[0-9].*$", - "SINGLEDIGIT": r"^[0-9]$", - "DOUBLEDIGIT": r"^[0-9][0-9]$", - "FOURDIGITS": r"^[0-9][0-9][0-9][0-9]$", - "NATURALNUM": r"^[0-9]+$", - "REALNUM": r"^[0-9]+.[0-9]+$", - "ALPHANUM": r"^[0-9A-Za-z]+$", - "HASDASH": r"^.*-.*$", - "PUNCTUATION": r"^[^A-Za-z0-9]+$", - "PHONE1": r"^[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]$", - "PHONE2": r"^[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]$", - "FIVEDIGIT": r"^[0-9][0-9][0-9][0-9][0-9]", - "NOVOWELS": r"^[^AaEeIiOoUu]+$", - "HASDASHNUMALPHA": r"^.*[A-z].*-.*[0-9].*$ | *.[0-9].*-.*[0-9].*$", - "DATESEPERATOR": r"^[-/]$", - } - - # Try to get QANN features - def QANN_features(self, word): - features = {} - - # Feature: test result - if self.is_test_result(word): features[('test_result',None)] = 1 - - # Feature: measurements - if self.is_measurement(word): features[('measurement',None)] = 1 - - # Feature: 
directive - if self.is_directive(word): features[('directive', None)] = 1 - - # Feature: date - if self.is_date(word): features[('date', None)] = 1 - - # Feature: volume - if self.is_volume(word): features[('volume', None)] = 1 - - # Feature: weight - if self.is_weight(word): features[('weight', None)] = 1 - - # Feature: size - if self.is_size(word): features[('size', None)] = 1 - - # Feature: prognosis location - if self.is_prognosis_location: features[('prog_location', None)] = 1 - - # Feature: problem form - if self.has_problem_form(word): features[('problem_form', None)] = 1 - - # Feature: concept class - if self.is_weight(word): features[('weight', None)] = 1 - - return features - - - def is_test_result(self, context): - # note: make spaces optional? - regex = r"^[A-Za-z]+( )*(-|--|:|was|of|\*|>|<|more than|less than)( )*[0-9]+(%)*" - if not re.search(regex, context): - return re.search(r"^[A-Za-z]+ was (positive|negative)", context) - return True - - def is_measurement(self, word): - regex = r"^[0-9]*(unit(s)|cc|L|mL|dL)$" - return re.search(regex, word) - - def is_directive(self, word): - regex = r"^(q\..*|q..|PRM|bid|prm|p\..*)$" - return re.search(regex, word) - - def is_date(self, word): - regex= r'^(\d\d\d\d-\d\d-\d|\d\d?-\d\d?-\d\d\d\d?|\d\d\d\d-\d\d?-\d\d?)$' - return re.search(regex,word) - - def is_volume(self, word): - regex = r"^[0-9]*(ml|mL|dL)$" - return re.search(regex, word) - - def is_weight(self, word): - regex = r"^[0-9]*(mg|g|mcg|milligrams|grams)$" - return re.search(regex, word) - - def is_size(self, word): - regex = r"^[0-9]*(mm|cm|millimeters|centimeters)$" - return re.search(regex, word) - - def is_prognosis_location(self, word): - regex = r"^(c|C)[0-9]+(-(c|C)[0-9]+)*$" - return re.search(regex, word) - - def has_problem_form(self, word): - regex = r".*(ic|is)$" - return re.search(regex, word) - - # checks for a definitive classification at the word level - def get_def_class(self, word): - test_terms = { - "eval", "evaluation", 
"evaluations", - "sat", "sats", "saturation", - "exam", "exams", - "rate", "rates", - "test", "tests", - "xray", "xrays", - "screen", "screens", - "level", "levels", - "tox" - } - problem_terms = { - "swelling", - "wound", "wounds", - "symptom", "symptoms", - "shifts", "failure", - "insufficiency", "insufficiencies", - "mass", "masses", - "aneurysm", "aneurysms", - "ulcer", "ulcers", - "trama", "cancer", - "disease", "diseased", - "bacterial", "viral", - "syndrome", "syndromes", - "pain", "pains" - "burns", "burned", - "broken", "fractured" - } - treatment_terms = { - "therapy", - "replacement", - "anesthesia", - "supplement", "supplemental", - "vaccine", "vaccines" - "dose", "doses", - "shot", "shots", - "medication", "medicine", - "treament", "treatments" - } - if word.lower() in test_terms: - return 1 - elif word.lower() in problem_terms: - return 2 - elif word.lower() in treatment_terms: - return 3 - return 0 - diff --git a/clicon/helper.py b/clicon/helper.py deleted file mode 100644 index a9edb02..0000000 --- a/clicon/helper.py +++ /dev/null @@ -1,24 +0,0 @@ -"""Utility methods.""" -import os -import os.path -import errno - - -def map_files(files): - """Maps a list of files to basename -> path.""" - output = {} - for f in files: #pylint: disable=invalid-name - basename = os.path.splitext(os.path.basename(f))[0] - output[basename] = f - return output - - -def mkpath(path): - """Alias for mkdir -p.""" - try: - os.makedirs(path) - except OSError as exc: - if exc.errno == errno.EEXIST and os.path.isdir(path): - pass - else: - raise diff --git a/clicon/is_installed.py b/clicon/is_installed.py deleted file mode 100644 index 2133e1a..0000000 --- a/clicon/is_installed.py +++ /dev/null @@ -1,50 +0,0 @@ -###################################################################### -# CliCon - is_installed.py # -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Determine if a given python module is available. 
# -###################################################################### - - -import sys - - - -def main(): - - # Ensure proper usage - if len(sys.argv) != 2: - print '\n\tusage: %s module_name\n' % sys.argv[0] - exit(2) - - - # Modules used by clicon (package name -> module name) - module2import = { 'scipy' : 'import scipy' , - 'numpy' : 'import numpy' , - 'nose' : 'import nose' , - 'nltk' : 'import nltk' , - 'python-crfsuite' : 'import pycrfsuite' , - 'scikit-learn' : 'from sklearn.feature_extraction import DictVectorizer' } - - - # Attempt to import module - try: - # Get import statement from module name - if sys.argv[1] not in module2import: exit(3) - attempt = module2import[ sys.argv[1] ] - - # Execute import - exec(attempt) - error = 0 - except ImportError: - error = 1 - - - # Return error code back to shell - exit(error) - - - -if __name__ == '__main__': - main() diff --git a/clicon/machine_learning/.gitignore b/clicon/machine_learning/.gitignore deleted file mode 100644 index 72723e5..0000000 --- a/clicon/machine_learning/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*pyc diff --git a/clicon/machine_learning/sci.py b/clicon/machine_learning/sci.py deleted file mode 100644 index 13949b9..0000000 --- a/clicon/machine_learning/sci.py +++ /dev/null @@ -1,54 +0,0 @@ -import numpy as np -from sklearn.svm import SVC -from sklearn.svm import LinearSVC -from sklearn.grid_search import GridSearchCV -from multiprocessing import cpu_count -from sklearn.metrics import f1_score - - -# Solution for trying to train with all instances having single label -class TrivialClassifier: - - def __init__(self, label): - self.label = label - - def predict(self, X): - return [ self.label for x in X ] - - - -def train(X, Y, do_grid): - - # scikit-learn requires you train data with more than one label - if len(Y) and all( [ (y==Y[0]) for y in Y ] ): - return TrivialClassifier(Y[0]) - - # Search space - C_range = 10.0 ** np.arange( -5, 9 ) - gamma_range = 10.0 ** np.arange( -5 , 9 ) - - # 
Grid search? - if do_grid: - print '\t\tperforming grid search' - - estimates = LinearSVC() - parameters = [ {'C':C_range } ] - - # Find best classifier - clf = GridSearchCV(estimates, parameters, score_func = f1_score, - n_jobs = cpu_count() ) - clf.fit(X, Y) - - else: - clf = LinearSVC() - clf.fit(X, Y) - - # Return chosen classifier - return clf - - - -def predict(clf, X): - # Predict - retVal = list(clf.predict(X)) - return retVal diff --git a/clicon/model.py b/clicon/model.py deleted file mode 100644 index 2149f5a..0000000 --- a/clicon/model.py +++ /dev/null @@ -1,462 +0,0 @@ -from __future__ import with_statement - -import os -import cPickle as pickle -import helper -import sys - -from sklearn.feature_extraction import DictVectorizer - -from machine_learning import sci -from machine_learning import crf -from features_dir import features, utilities - -from notes.note import concept_labels, reverse_concept_labels, IOB_labels, reverse_IOB_labels - - - - -class Model: - - @staticmethod - def load(filename='awesome.model'): - with open(filename, 'rb') as model: - model = pickle.load(model) - model.filename = filename - return model - - - def __init__(self, is_crf=True): - - # Use python-crfsuite - self.crf_enabled = is_crf - - # DictVectorizers - self.first_prose_vec = DictVectorizer() - self.first_nonprose_vec = DictVectorizer() - self.second_vec = DictVectorizer() - - # Classifiers - self.first_prose_clf = None - self.first_nonprose_clf = None - self.second_clf = None - - - - def train(self, notes, do_grid=False): - - """ - Model::train() - - Purpose: Train a ML model on annotated data - - @param notes. 
A list of Note objects (containing text and annotations) - @return None - """ - - - ############## - # First pass # - ############## - - # Get the data and annotations from the Note objects - text = [ note.getTokenizedSentences() for note in notes ] - ioblist = [ note.getIOBLabels() for note in notes ] - - data1 = reduce( concat, text ) - Y1 = reduce( concat, ioblist ) - - - # Train classifier (side effect - saved as object's member variable) - print 'first pass' - self.first_train(data1, Y1, do_grid) - - - - ############### - # Second pass # - ############### - - # Get the data and annotations from the Note objects - chunks = [ note.getChunkedText() for note in notes ] - indices = [ note.getConceptIndices() for note in notes ] - conlist = [ note.getConceptLabels() for note in notes ] - - data2 = reduce( concat, chunks ) - inds = reduce( concat, indices ) - Y2 = reduce( concat, conlist ) - - - # Train classifier (side effect - saved as object's member variable) - print 'second pass' - self.second_train(data2, inds, Y2, do_grid) - - - - - def first_train(self, data, Y, do_grid=False): - - """ - Model::first_train() - - Purpose: Train the first pass classifiers (for IOB chunking) - - @param data A list of split sentences (1 sent = 1 line from file) - @param Y A list of list of IOB labels (1:1 mapping with data) - @param do_grid A boolean indicating whether to perform a grid search - - @return None - """ - - print '\textracting features (pass one)' - - - # Create object that is a wrapper for the features - feat_obj = features.FeatureWrapper(data) - - - # Parition into prose v. 
nonprose - prose = [] - nonprose = [] - pchunks = [] - nchunks = [] - for line,labels in zip(data,Y): - isProse,feats = feat_obj.extract_IOB_features(line) - if isProse: - prose.append(feats) - pchunks += labels - else: - nonprose.append(feats) - nchunks += labels - - - # Classify both prose & nonprose - flabels = ['prose' , 'nonprose' ] - fsets = [prose , nonprose ] - chunksets = [pchunks , nchunks ] - dvects = [self.first_prose_vec, self.first_nonprose_vec] - clfs = [self.first_prose_clf, self.first_nonprose_clf] - - vectorizers = [] - classifiers = [] - - for flabel,fset,chunks,dvect,clf in zip(flabels, fsets, chunksets, dvects, clfs): - - if len(fset) == 0: - raise Exception('Training data must have %s training examples' % flabel) - - print '\tvectorizing features (pass one) ' + flabel - - # Vectorize IOB labels - Y = [ IOB_labels[y] for y in chunks ] - - # Save list structure to reconstruct after vectorization - offsets = [ len(sublist) for sublist in fset ] - for i in range(1, len(offsets)): - offsets[i] += offsets[i-1] - - # Vectorize features - flattened = [item for sublist in fset for item in sublist] - X = dvect.fit_transform(flattened) - vectorizers.append(dvect) - - - print '\ttraining classifiers (pass one) ' + flabel - - # CRF needs reconstructed lists - if self.crf_enabled: - X = list(X) - X = [ X[i:j] for i, j in zip([0] + offsets, offsets)] - Y = [ Y[i:j] for i, j in zip([0] + offsets, offsets)] - lib = crf - else: - lib = sci - - # Train classifiers - clf = lib.train(X, Y, do_grid) - classifiers.append(clf) - - - # Save vectorizers - self.first_prose_vec = vectorizers[0] - self.first_nonprose_vec = vectorizers[1] - - # Save classifiers - self.first_prose_clf = classifiers[0] - self.first_nonprose_clf = classifiers[1] - - - - - # Model::second_train() - # - # - def second_train(self, data, inds_list, Y, do_grid=False): - - """ - Model::second_train() - - Purpose: Train the first pass classifiers (for IOB chunking) - - @param data A list of list of 
strings. - - A string is a chunked phrase - - An inner list corresponds to one line from the file - @param inds_list A list of list of integer indices - - assertion: len(data) == len(inds_list) - - one line of 'inds_list' contains a list of indices - into the corresponding line for 'data' - @param Y A list of concept labels - - assertion: there are sum(len(inds_list)) labels - AKA each index from inds_list maps to a label - @param do_grid A boolean indicating whether to perform a grid search - - @return None - """ - - print '\textracting features (pass two)' - - # Create object that is a wrapper for the features - feat_o = features.FeatureWrapper() - - # Extract features - X = [ feat_o.concept_features(s,inds) for s,inds in zip(data,inds_list) ] - X = reduce(concat, X) - - - print '\tvectorizing features (pass two)' - - # Vectorize labels - Y = [ concept_labels[y] for y in Y ] - - # Vectorize features - X = self.second_vec.fit_transform(X) - - - print '\ttraining classifier (pass two)' - - - # Train the model - self.second_clf = sci.train(X, Y, do_grid) - - - - - # Model::predict() - # - # @param note. A Note object that contains the data - def predict(self, note): - - - ############## - # First pass # - ############## - - - print 'first pass' - - # Get the data and annotations from the Note objects - data = note.getTokenizedSentences() - - # Predict IOB labels - iobs,_,__ = self.first_predict(data) - note.setIOBLabels(iobs) - - - - ############### - # Second pass # - ############### - - - print 'second pass' - - # Get the data and annotations from the Note objects - chunks = note.getChunkedText() - inds = note.getConceptIndices() - - # Predict concept labels - retVal = self.second_predict(chunks,inds) - - - return retVal - - - - - def first_predict(self, data): - - """ - Model::first_predict() - - Purpose: Predict IOB chunks on data - - @param data. 
A list of split sentences (1 sent = 1 line from file) - @return A list of list of IOB labels (1:1 mapping with data) - """ - - print '\textracting features (pass one)' - - - # Create object that is a wrapper for the features - feat_obj = features.FeatureWrapper(data) - - - # separate prose and nonprose data - prose = [] - nonprose = [] - plinenos = [] - nlinenos = [] - for i,line in enumerate(data): - isProse,feats = feat_obj.extract_IOB_features(line) - if isProse: - prose.append(feats) - plinenos.append(i) - else: - nonprose.append(feats) - nlinenos.append(i) - - - # Classify both prose & nonprose - flabels = ['prose' , 'nonprose' ] - fsets = [prose , nonprose ] - dvects = [self.first_prose_vec, self.first_nonprose_vec] - clfs = [self.first_prose_clf, self.first_nonprose_clf] - preds = [] - - for flabel,fset,dvect,clf in zip(flabels, fsets, dvects, clfs): - - # If nothing to predict, skip actual prediction - if len(fset) == 0: - preds.append([]) - continue - - - print '\tvectorizing features (pass one) ' + flabel - - # Save list structure to reconstruct after vectorization - offsets = [ len(sublist) for sublist in fset ] - for i in range(1, len(offsets)): - offsets[i] += offsets[i-1] - - # Vectorize features - flattened = [item for sublist in fset for item in sublist] - X = dvect.transform(flattened) - - - print '\tpredicting labels (pass one) ' + flabel - - # CRF requires reconstruct lists - if self.crf_enabled: - X = list(X) - X = [ X[i:j] for i, j in zip([0] + offsets, offsets)] - lib = crf - else: - lib = sci - - # Predict IOB labels - out = lib.predict(clf, X) - - # Format labels from output - pred = [out[i:j] for i, j in zip([0] + offsets, offsets)] - preds.append(pred) - - - # Recover predictions - plist = preds[0] - nlist = preds[1] - - - # Stitch prose and nonprose data back together - # translate IOB labels into a readable format - prose_iobs = [] - nonprose_iobs = [] - iobs = [] - trans = lambda l: reverse_IOB_labels[int(l)] - for sentence in data: - 
if utilities.prose_sentence(sentence): - prose_iobs.append( plist.pop(0) ) - prose_iobs[-1] = map(trans, prose_iobs[-1]) - iobs.append( prose_iobs[-1] ) - else: - nonprose_iobs.append( nlist.pop(0) ) - nonprose_iobs[-1] = map(trans, nonprose_iobs[-1]) - iobs.append( nonprose_iobs[-1] ) - - - # list of list of IOB labels - return iobs, prose_iobs, nonprose_iobs - - - - - def second_predict(self, data, inds_list): - - # If first pass predicted no concepts, then skip - # NOTE: Special case because SVM cannot have empty input - if sum([ len(inds) for inds in inds_list ]) == 0: - return [] - - - # Create object that is a wrapper for the features - feat_o = features.FeatureWrapper() - - - print '\textracting features (pass two)' - - - # Extract features - X = [ feat_o.concept_features(s,inds) for s,inds in zip(data,inds_list) ] - X = reduce(concat, X) - - - print '\tvectorizing features (pass two)' - - - # Vectorize features - X = self.second_vec.transform(X) - - - print '\tpredicting labels (pass two)' - - - # Predict concept labels - out = sci.predict(self.second_clf, X) - - - # Line-by-line processing - o = list(out) - classifications = [] - for lineno,inds in enumerate(inds_list): - - # Skip empty line - if not inds: continue - - # For each concept - for ind in inds: - - # Get next concept - concept = reverse_concept_labels[o.pop(0)] - - # Get start position (ex. 
7th word of line) - start = 0 - for i in range(ind): - start += len( data[lineno][i].split() ) - - # Length of chunk - length = len(data[lineno][ind].split()) - - # Classification token - classifications.append( (concept,lineno+1,start,start+length-1) ) - - # Return classifications - return classifications - - - - - -def concat(a,b): - """ - list concatenation function (for reduce() purpose) - """ - return a+b diff --git a/clicon/notes/.gitignore b/clicon/notes/.gitignore deleted file mode 100644 index 72723e5..0000000 --- a/clicon/notes/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*pyc diff --git a/clicon/notes/abstract_note.py b/clicon/notes/abstract_note.py deleted file mode 100644 index 95ccb2b..0000000 --- a/clicon/notes/abstract_note.py +++ /dev/null @@ -1,93 +0,0 @@ -from __future__ import with_statement - - -###################################################################### -# CliNER - abstract_note.py # -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Abstract Note representation. # -###################################################################### - - -__author__ = 'Willie Boag' -__date__ = 'Nov. 6, 2014' - - - -import re -import string -from copy import copy -import nltk.data -import os.path - - - - -# Abstract Note (to be inherited from) -class AbstractNote: - - def __init__(self): - raise Exception('Cannot instantiate AbstractNote') - - - ################################################################ - #### Must support this interface ##### - ################################################################ - - def getExtension(self): - """ - Purpose: Get the file extension for a particular format (ex. 
i2b2->con) - """ - raise Exception('Must define getExtension() for derived class') - - - def read(self, txt_file, con_file=None): - """ - Purpose: Abstract method for reading data from file - """ - raise Exception('Must define read() for derived class') - - - def write(self, labels=None): - """ - Purpose: Abstract method for writing data to file - @param labels. A list of (concept,lineo,start_tok,end_tok) tuples - """ - raise Exception('Must define write() for derived class') - - - def getText(self): - """ - Purpose: Return the verabitm string of text from text file - """ - raise Exception('Must define selector for derived class') - - - def getTokenizedSentences(self): - """ - Purpose: Return a list of list of tokens (list of list of strings) - """ - raise Exception('Must define selector for derived class') - - - def getClassificationTuples(self): - """ - Purpose: Return a list of (concept,line_number,start-ind,end-ind) tuples - """ - raise Exception('Must define selector for derived class') - - - def getLineIndices(self): - """ - Purpose: Return a list of (start,end) char index pairs (one per line) - """ - raise Exception('Must define selector for derived class') - - - def read_standard(self, txt, con=None): - """ - Purpose: Every note must be able to read from standardized forat - """ - raise Exception('Must define read_standard() for derived class') - diff --git a/clicon/notes/note.py b/clicon/notes/note.py deleted file mode 100644 index 38d3077..0000000 --- a/clicon/notes/note.py +++ /dev/null @@ -1,376 +0,0 @@ -from __future__ import with_statement - -###################################################################### -# CliNER - note.py # -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Internal representation of data for CliNER. # -###################################################################### - - -__author__ = 'Willie Boag' -__date__ = 'Nov. 
6, 2014' - - - -import re -import string -from copy import copy -import nltk.data -import os.path - - -from utilities_for_notes import lineno_and_tokspan - - -# Master Class -class Note: - - # Memoize results from static method calls - supported_formats = [] - dict_of_format_to_extensions = [] - - # Constructor - def __init__(self, format): - - # Error-check input - if format not in Note.supportedFormats(): - raise Exception('Cannot create Note object for format %s' % format) - - # Instantiate the given format derived class - cmd = 'from note_%s import Note_%s as DerivedNote' % (format,format) - exec(cmd) - self.derived_note = DerivedNote() - - # Helpful for debugging - self.format = format - - # Memoizations of selectors - self.data = [] - self.concepts = [] - self.iob_labels = [] - self.text_chunks = [] - - - - @staticmethod - def supportedFormats(): - """ returns a list of data formats supported by CliNER """ - - # Memoized? - if Note.supported_formats: return Note.supported_formats - - # Note files - cands = os.listdir(os.path.join(os.getenv('CLICON_DIR'),'clicon/notes')) - notes = filter(lambda f:f.startswith('note_'), cands) - notes = filter(lambda f: f.endswith('.py' ), notes) - - # Extract format name from all files like 'note_i2b2.py' - formats = [] - for filename in notes: - f = re.search('note_(.*)\\.py', filename).groups(1)[0] - formats.append(f) - - return formats - - - @staticmethod - def supportedFormatExtensions(): - return Note.dictOfFormatToExtensions().values() - - - - @staticmethod - def dictOfFormatToExtensions(): - # Memoized? 
- if Note.dict_of_format_to_extensions: - return Note.dict_of_format_to_extensions - - # Get each format's extension - extensions = {} - for format in Note.supportedFormats(): - # Import next note format - cmd1 = 'from note_%s import Note_%s' % (format,format) - exec(cmd1) - - # Get extension for note - cmd2 = 'extensions[format] = Note_%s().getExtension()' % format - exec(cmd2) - - Note.dict_of_format_to_extensions = extensions - return extensions - - - - ################################################################## - #### Pass right on to derived format note #### - #### (does not change as new formats are introduced) #### - ################################################################## - - def getExtension(self): - """ - Purpose: returns the filename extension for a given data format - """ - return self.derived_note.getExtension() - - def read(self, txt_file, con_file=None): - """ - Purpose: Call derived object's reader - """ - retVal = self.derived_note.read(txt_file, con_file) - self.getIOBLabels() - return retVal - - def write(self, con_file=None): - """ - Purpose: Call derived object's writer - """ - return self.derived_note.write(con_file) - - def getTokenizedSentences(self): - """ - Purpose: Return list of list of tokens from text file. 
- """ - if not self.data: - self.data = self.derived_note.getTokenizedSentences() - return self.data - - def read_standard(self, txt, con=None): - """ - Purpose: Every note must be able to read from standard forat - """ - self.derived_note.read_standard(txt,con) - self.getIOBLabels() - - - - ################################################################## - #### Internal Logic #### - #### (does not change as new formats are introduced) #### - ################################################################## - - - def getConceptLabels(self): - """ - Purpose: return a list of concept labels for second pass training - """ - classifications = self.derived_note.getClassificationTuples() - return [ c[0] for c in classifications ] - - - def getIOBLabels(self): - """ - Purpose: return a list of list of IOB labels - """ - - # Only comput if not already memoized - if self.iob_labels: return self.iob_labels - - # Build list of proper dimensions (1:1 with self.data) - self.getTokenizedSentences() - iobs = [ ['O' for tok in sent] for sent in self.data ] - - line_inds = self.derived_note.getLineIndices() - data = self.derived_note.data - text = self.derived_note.text - - # Add 'B's and 'I's from concept spans - for classification in self.derived_note.getClassificationTuples(): - concept,char_spans = classification - - # Each span (could be noncontiguous span) - for span in char_spans: - lineno,tokspan = lineno_and_tokspan(line_inds, data, text, span) - start,end = tokspan - - # Update concept tokens to 'B's and 'I's - iobs[lineno][start] = 'B' - for i in range(start+1,end+1): - iobs[lineno][i] = 'I' - - # Memoize for next call - self.iob_labels = iobs - return iobs - - - - def setIOBLabels(self, iobs): - """ - Purpose: Set the IOB labels for the derived note - """ - # Must be proper form - for iob in iobs: - for label in iob: - assert (label == 'O') or (label == 'B') or (label == 'I'), \ - "All labels must be I, O, or B. 
Given: " + label - - self.iob_labels = iobs - - - - def getChunkedText(self): - """ - Purpose: List of list of tokens, except combine all 'I's into 'B' chunks - """ - - # Memoized? - if self.text_chunks: return self.text_chunks() - - # Line-by-line chunking - text = self.getTokenizedSentences() - for sent,iobs in zip(text,self.iob_labels): - - # One line of chunked phrases - line = [] - - # Chunk phrase (or single word if 'O' iob tag) - phrase = '' - - # Word-by-word grouping - for word,iob in zip(sent,iobs): - - if iob == 'O': - if phrase: line.append(phrase) - phrase = word - - if iob == 'B': - if phrase: line.append(phrase) - phrase = word - - if iob == 'I': - phrase += ' ' + word - - # Add last phrase - if phrase: line.append(phrase) - - # Add line from file - self.text_chunks.append(line) - - return self.text_chunks - - - - def getConceptIndices(self): - - # Return value - inds_list = [] - - # Line-by-line chunking - for iobs in self.iob_labels: - - # One line of chunked phrases - line = [] - seen_chunks = 0 - - # Word-by-word grouping - for iob in iobs: - - if iob == 'O': - seen_chunks += 1 - - if iob == 'B': - line.append(seen_chunks) - seen_chunks += 1 - - # Add line from file - inds_list.append(line) - - - return inds_list - - - - def write_standard(self, labels=None): - """ - Note::write_standard() - - Purpose: Every note must be able to read from standardized format - - @param labels. A list of classifications - @return A string of starndardized formatted data - """ - - # Standard will have: - # 1. concept type - # 2. 
concept span inds in character offsets - - # return value - retStr = '' - - # Get data - classifications = self.derived_note.getClassificationTuples() - - # Output classifications into standardized format - for concept,span_inds in classifications: - retStr += concept - for span in span_inds: - retStr += '||%d||%d' % span - retStr += '\n' - - return retStr.strip('\n') - - - - - ################################################################## - #### Only used during developmnt and testing #### - ################################################################## - - - def conlist(self): - """ - Useful during evaluation - """ - - # Cached for later calls - if self.concepts: return self.concepts - - # For each word, store a corresponding concept label - # Initially, all labels will be stored as 'none' - for line in self.data: - tmp = [] - for word in line: - tmp.append('none') - self.concepts.append(tmp) - - # Use the classifications to correct all mislabled 'none's - for classification in self.derived_note.getClassificationTuples(): - concept = classification[0] - char_spans = classification[1] - - # Assumption - assumes no clustering third pass - line_inds = self.derived_note.getLineIndices() - data = self.derived_note.getTokenizedSentences() - text = self.derived_note.getText() - for span in char_spans: - lineno,tokspan = lineno_and_tokspan(line_inds, data, text, span) - start,end = tokspan - - self.concepts[lineno][start] = concept - for i in range(start, end): - self.concepts[lineno][i+1] = concept - - return self.concepts - - - - -# Concept labels -concept_labels = { - "none":0, - "treatment":1, - "problem":2, - "test":3 -} -reverse_concept_labels = {v:k for k, v in concept_labels.items()} - - -# IOB labels -IOB_labels = { - 'O':0, - 'B':1, - 'I':2 -} -reverse_IOB_labels = {v:k for k,v in IOB_labels.items()} - diff --git a/clicon/notes/note_i2b2.py b/clicon/notes/note_i2b2.py deleted file mode 100644 index 0148ec8..0000000 --- a/clicon/notes/note_i2b2.py +++ 
/dev/null @@ -1,314 +0,0 @@ -from __future__ import with_statement - - -###################################################################### -# CliNER - note_i2b2.py # -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Derived note object for reading i2b2 formatted data. # -###################################################################### - - -__author__ = 'Willie Boag' -__date__ = 'Nov. 6, 2014' - - - -import re -import string -from copy import copy -import nltk.data -import os.path - -from abstract_note import AbstractNote -from utilities_for_notes import classification_cmp, lineno_and_tokspan - - -class Note_i2b2(AbstractNote): - - def __init__(self): - # Internal representation natural for i2b2 format - self.data = [] # list of list of tokens - self.classifications = [] # list of concept tuples - self.line_inds = [] # list of (start,end) indices for every line - - - def getExtension(self): - return 'con' - - - def getText(self): - return self.text - - - def getTokenizedSentences(self): - return map(lambda s: (' '.join(s)).split(), self.data) - - - def getClassificationTuples(self): - - # return value - retVal = [] - - # Build list of standardized classification tuples - for classification in self.classifications: - concept,lineno,tok_start,tok_end = classification - - # character offset of beginning of line - begin = self.line_inds[lineno-1][0] - - # Sweep through line to get character offsets from line start - start = 0 - for word in self.data[lineno-1][:tok_start]: - start += len(word) + 1 - - # Length of concept span - end = start - for word in self.data[lineno-1][tok_start:tok_end+1]: - end += len(word) + 1 - end -= 1 - - #print begin - #print begin+start, begin+end - #print '~~' + self.text[begin+start:begin+end] + '~~' - - retVal.append( (concept,[(begin+start,begin+end)]) ) - - return retVal - - - def getLineIndices(self): - return self.line_inds - - - def read_standard(self, txt, con=None): - - start = 0 - end = 0 - - with open(txt) as f: - 
- # Get entire file - text = f.read().strip('\n') - self.text = text - - # Split into lines - self.data = map(lambda s: s.split(), text.split('\n')) - - # Tokenize each sentence into words (and save line number indices) - toks = [] - gold = [] # Actual lines - - for sent in self.data: - - gold.append(sent) - - # Keep track of which indices each line has - for word in sent: - end += len(word) + 1 - - self.line_inds.append( (start,end-1) ) - start = end - - # Skip ahead to next non-whitespace - while (start < len(text)) and text[start].isspace(): start += 1 - - - # If an accompanying concept file was specified, read it - if con: - classifications = [] - with open(con) as f: - for line in f: - - # Empty line - if line == '\n': continue - - # Parse concept file line - fields = line.strip().split('||') - #print fields - concept = fields[0] - span_inds = [] - for i in range(1,len(fields),2): - span = int(fields[i]), int(fields[i+1]) - span_inds.append( span ) - - # FIXME - For now, treat non-contiguous spans as separate - for span in span_inds: - # Add the classification to the Note object - l,(start,end) = lineno_and_tokspan(span) - #print 'span: ', span - #print 'lineno: ', l - #print 'start: ', start - #print 'end: ', end - #print '\n' - classifications.append((concept,l+1,start,end)) - - # Safe guard against concept file having duplicate entries - classifications = list(set(classifications)) - - # Concept file does not guarantee ordering by line number - self.classifications = sorted(classifications,cmp=classification_cmp) - - - - def read(self, txt, con=None): - """ - Note_i2b2::read() - - @param txt. A file path for the tokenized medical record - @param con. 
A file path for the i2b2 annotated concepts for txt - """ - - # Character indices of each line - start = 0 - end = 0 - - # Read in the medical text - with open(txt) as f: - - # Original text file - self.text = f.read().strip('\n') - - i = 0 - for line in self.text.split('\n'): - end += len(line) + 1 - self.line_inds.append( (start,end-1) ) - start = end - - # Strip away non-printable characters - line = filter(lambda x: x in string.printable, line) - - # Add sentence to the data list - self.data.append(line.split(' ')) - - # TEST - is line_inds correct? - #print self.line_inds - #i = 0 - #for line,span in zip(self.data,self.line_inds): - # start,end = span - # print '' + self.text[start:end] + '' - # print '' + ' '.join(line) + '' - # print - # i += 1 - # if i == 13: exit() - - # If an accompanying concept file was specified, read it - if con: - classifications = [] - with open(con) as f: - for line in f: - - # Empty line - if line == '\n': continue - - # concept - prefix, suffix = line.split('||') - text = prefix.split() - conc = suffix[3:-2] - - start = text[-2].split(':') - end = text[-1].split(':') - - assert "concept spans one line", start[0] == end[0] - - # lineno - l = int(start[0]) - - # starttok - # endtok - start = int(start[1]) - end = int( end[1]) - - # Add the classification to the Note object - classifications.append( (conc,l,start,end) ) - - #print "txt: ", txt - #print "l: ", l - #print "start: ", start - #print "end: ", end - #print "line: ", self.data[l-1] - - #print "\n" + "-" * 80 - - # Safe guard against concept file having duplicate entries - classifications = list(set(classifications)) - - # Concept file does not guarantee ordering by line number - self.classifications = sorted(classifications, - cmp=classification_cmp) - - - def write(self, labels=None): - - """ - Note_i2b2::write() - - Purpose: Return the given concept label predictions in i2b2 format - - @param labels. 
A list of classifications - @return A string of i2b2-concept-file-formatted data - """ - - # Return value - retStr = '' - - # List of list of words (line-by-line) - tlist = self.data - - - # If given labels to write, use them. Default to self.classifications - if labels != None: - classifications = labels - elif self.classifications != None: - classifications = self.classifications - else: - raise Exception('Cannot write concept file: must specify labels') - - - # For each classification - for classification in classifications: - - # Ensure 'none' classifications are skipped - if classification[0] == 'none': - raise('Classification label "none" should never happen') - - concept = classification[0] - lineno = classification[1] - start = classification[2] - end = classification[3] - - # A list of words (corresponding line from the text file) - text = tlist[lineno-1] - - #print "\n" + "-" * 80 - #print "classification: ", classification - #print "lineno: ", lineno - #print "start: ", start - #print "end ", end - #print "text: ", text - #print "text[start]: ", text[start] - #print "concept: ", concept - - # The text string of words that has been classified - datum = text[start] - for j in range(start, end): - datum += " " + text[j+1] - - # Line:TokenNumber of where the concept starts and ends - idx1 = "%d:%d" % (lineno, start) - idx2 = "%d:%d" % (lineno, end ) - - # Classification - label = concept - - # Fixing issue involving i2b2 format (remove capitalization) - lowercased = [w.lower() for w in datum.split()] - datum = ' '.join(lowercased) - - # Print format - retStr += "c=\"%s\" %s %s||t=\"%s\"\n" % (datum, idx1, idx2, label) - - # return formatted data - return retStr - diff --git a/clicon/notes/note_semeval.py b/clicon/notes/note_semeval.py deleted file mode 100644 index 0c42ce5..0000000 --- a/clicon/notes/note_semeval.py +++ /dev/null @@ -1,286 +0,0 @@ -from __future__ import with_statement - - 
-###################################################################### -# CliNER - note_semeval.py # -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Derived note object for reading semeval formatted data. # -###################################################################### - - -__author__ = 'Willie Boag' -__date__ = 'Nov. 6, 2014' - - - -import re -import string -from copy import copy -import os.path - - -from utilities_for_notes import concept_cmp, SentenceTokenizer, WordTokenizer, lno_and_tokspan__to__char_span -from abstract_note import AbstractNote - - - -class Note_semeval(AbstractNote): - - def __init__(self): - # For parsing text file - self.sent_tokenizer = SentenceTokenizer() - self.word_tokenizer = WordTokenizer() - - # Internal representation natural for i2b2 format - self.text = '' - self.data = [] # list of list of tokens - self.line_inds = [] - self.classifications = [] - self.fileName = 'no-file' - - - def getExtension(self): - return 'pipe' - - - def getText(self): - return self.text - - - def getTokenizedSentences(self): - return self.data - - - def getClassificationTuples(self): - return self.classifications - - - def getLineIndices(self): - return self.line_inds - - def read_standard(self, txt, con=None): - - start = 0 - end = 0 - - with open(txt) as f: - - # Get entire file - text = f.read() - self.text = text - - # Sentence splitter - sents = self.sent_tokenizer.tokenize(txt) - - # Tokenize each sentence into words (and save line number indices) - toks = [] - gold = [] # Actual lines - - for s in sents: - gold.append(s) - - # Store data - toks = self.word_tokenizer.tokenize(s) - self.data.append(toks) - - # Keep track of which indices each line has - end = start + len(s) - - self.line_inds.append( (start,end) ) - start = end + 1 - - # Skip ahead to next non-whitespace - while (start < len(text)) and text[start].isspace(): start += 1 - - - # If an accompanying concept file was specified, read it - if con: - classifications = [] - 
with open(con) as f: - for line in f: - - # Empty line - if line == '\n': continue - - # Parse concept file line - fields = line.strip().split('||') - #print fields - concept = fields[0] - span_inds = [] - for i in range(1,len(fields),2): - span = int(fields[i]), int(fields[i+1]) - span_inds.append( span ) - - #print '\t', concept - #print '\t', span_inds - - classifications.append( (concept, span_inds) ) - - # Concept file does not guarantee ordering by line number - self.classifications = sorted(classifications, cmp=concept_cmp) - - - - - def read(self, txt, con=None): - - # Filename - self.filename = os.path.split(txt)[1] - - start = 0 - end = 0 - with open(txt) as f: - - # Get entire file - text = f.read() - #print "\nTEXT:------------------" - #print text - - self.text = text - - # Sentence splitter - sents = self.sent_tokenizer.tokenize(txt) - - #print "\nSENTS:-----------------------------" - #print sents - - # Tokenize each sentence into words (and save line number indices) - toks = [] - gold = [] # Actual lines - - for s in sents: - - gold.append(s) - - #print "\nsentence:-------------------------------" - #print s - - #print s - - # Store data - toks = self.word_tokenizer.tokenize(s) - - #print "\ntokenized sentence:---------------------------------" - #print toks - - self.data.append(toks) - - # Keep track of which indices each line has - end = start + len(s) - - #print "\nindices:--------------------------------------------" - #print (start, end) - - #print "\nusing index on entire txt----------------------------" - #print text[start:end] - - #print "\nEQUAL?" - #print text[start:end] == s - - self.line_inds.append( (start,end) ) - start = end + 1 - - # Skip ahead to next non-whitespace - while (start < len(text)) and text[start].isspace(): start += 1 - - ''' - for line,inds in zip(gold,self.line_inds): - print '!!!' + line + '!!!' - print '\t', 'xx'*10 - print inds - print '\t', 'xx'*10 - print '!!!' + text[inds[0]: inds[1]] + '!!!' 
- print '---' - print '\n' - print 'Xx' * 20 - ''' - - #lno,span = lineno_and_tokspan((2329, 2351)) - #lno,span = lineno_and_tokspan((1327, 1344)) - #print self.data[lno][span[0]:span[1]+1] - - - # If an accompanying concept file was specified, read it - if con: - offset_classifications = [] - classifications = [] - with open(con) as f: - for line in f: - - # Empty line - if line == '\n': continue - - # Parse concept file line - fields = line.strip().split('||') - #print fields - concept = fields[1] - cui = fields[2] - span_inds = [] - for i in range(3,len(fields),2): - span = int(fields[i]), int(fields[i+1]) - span_inds.append( span ) - - #print '\t', concept - #print '\t', span_inds - - # Everything is a Disease_Disorder - concept = 'problem' - - # FIXME - For now, treat non-contiguous spans as separate - for span in span_inds: - #l,(start,end) = lineno_and_tokspan(span) - # Add the classification to the Note object - offset_classifications.append((concept,span[0],span[1])) - classifications.append( (concept, span_inds) ) - - # Safe guard against concept file having duplicate entries - #classifications = list(set(classifications)) - - # Concept file does not guarantee ordering by line number - self.classifications = sorted(classifications, cmp=concept_cmp) - - - - - def write(self, labels): - - # If given labels to write, use them. 
Default to self.classifications - if labels != None: - # Translate token-level annotations to character offsets - classifications = [] - for classification in labels: - inds = self.line_inds - data = self.data - text = self.text - - # FIXME - Assumes that token-level does not have noncontig - concept = classification[0] - lno = classification[1] - 1 - start = classification[2] - end = classification[3] - tokspan = start,end - - # Get character offset span - span = lno_and_tokspan__to__char_span(inds,data,text,lno,tokspan) - classifications.append( (concept,span) ) - - elif self.classifications != None: - classifications = self.classifications - else: - raise Exception('Cannot write concept file: must specify labels') - - exit() - - # return value - retStr = '' - - for concept,span_inds in classifications: - retStr += self.fileName + '.text||%s||CUI-less' % concept - for span in span_inds: - retStr += '||' + str(span[0]) + "||" + str(span[1]) - retStr += '\n' - - return retStr - diff --git a/clicon/notes/note_xml.py b/clicon/notes/note_xml.py deleted file mode 100644 index 13dd08b..0000000 --- a/clicon/notes/note_xml.py +++ /dev/null @@ -1,296 +0,0 @@ -from __future__ import with_statement - - -###################################################################### -# CliNER - note_xml.py # -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Derived note object for reading xml formatted data. # -###################################################################### - - -__author__ = 'Willie Boag' -__date__ = 'Nov. 
6, 2014' - - - -import re -import string -from copy import copy -import nltk.data -import os.path - - -from abstract_note import AbstractNote -from utilities_for_notes import classification_cmp, lineno_and_tokspan - - -class Note_xml(AbstractNote): - - def __init__(self): - # Internal representation natural for i2b2 format - self.data = [] # list of list of tokens - self.classifications = [] # list of concept tuples - self.line_inds = [] # list of (start,end) line character offsets - - - def getExtension(self): - return 'xml' - - - def getText(self): - return self.text - - - def getTokenizedSentences(self): - return self.data - - - def getClassificationTuples(self): - # return value - retVal = [] - - # Indices of each line - line_inds = [] - start = 0 - end = 0 - for sent in self.data: - for word in sent: - end += len(word) + 1 - line_inds.append( (start,end-1) ) - start = end - - # Build list of standardized classification tuples - for classification in self.classifications: - concept,lineno,tok_start,tok_end = classification - - # character offset of beginning of line - begin = line_inds[lineno-1][0] - - # Sweep through line to get character offsets from line start - start = 0 - for word in self.data[lineno-1][:tok_start]: - start += len(word) + 1 - - # Length of concept span - end = start - for word in self.data[lineno-1][tok_start:tok_end+1]: - end += len(word) + 1 - end -= 1 - - #print begin - #print begin+start, begin+end - #print '~~' + self.text[begin+start:begin+end] + '~~' - - retVal.append( (concept,[(begin+start,begin+end)]) ) - - return retVal - - - - def read(self, txt, con=None): - - """ - Note_xml::read() - - @param txt. A file path for the tokenized medical record - @param con. 
A file path for the xml annotated concepts for txt - """ - - start = 0 - end = 0 - - # Read in the medical text - with open(txt) as f: - for line in f: - - # Keep track of line's character offsets - for word in line.split(): - end += len(word) + 1 - self.line_inds.append( (start,end-1) ) - start = end - - # Strip away non-printable characters - line = filter(lambda x: x in string.printable, line) - - # Add sentence to the data list - self.data.append(line.split()) - - - # Read in the medical text - if con: - with open(con, 'r') as f: - - for lineno,line in enumerate(f.readlines()): - - # Stored data for self.classifications - concept = 'N/A' - start_ind = -1 - i = 0 - - for word in line.split(): - - # Search for xml tag - match = re.search('<(.*)>', word) - if match: - - con = match.group(1) - - # begin tag - if con[0] != '/': - # store data - concept = con - start_ind = i - - # end tag - else: - # store data - tup = (concept,lineno+1,start_ind,i-1) - self.classifications.append(tup) - - # non-tag text - else: - - # Next token - i += 1 - - - - def write(self, labels=None): - - """ - Note_xml::write() - - Purpose: Write the concept predictions in xml format - - @param labels. A list of predictions of labels for the given text. - @return A string for the xml-annotated file - """ - - - # If given labels to write, use them. 
Default to self.classifications - if labels != None: - classifications = labels - elif self.classifications: - classifications = self.classifications - else: - raise Exception('Cannot write concept file: must specify labels') - - - # Intermediate copy - toks = copy(self.data) - - # Order classification tuples so they are accessed right to left - # Assumption: sorted() is a stable sort - tags = sorted(classifications, key=lambda x:x[2], reverse=True) - tags = sorted(tags , key=lambda x:x[1] ) - - #print toks - #print '' - #print '' - - # Insert each xml tag into its proper location - for tag in tags: - - # Decode classification tuple - con = tag[0] - line = tag[1] - 1 - start = tag[2] - end = tag[3] - - #print tag - #print 'line: ', toks[line] - #print 'phrase: ', toks[line][start:end+1] - - # Insert tags - toks[line].insert(end+1, '') - toks[line].insert(start, '<' + con + '>') - - #print 'line: ', toks[line] - #print '' - - # Stitch text back together - toks = [ ' '.join(s) for s in toks ] - output = '\n'.join(toks) - - return output - - - - def read_standard(self, txt, con=None): - - """ - Note_xml::read_standard() - - @param txt. A file path for the tokenized medical record - @param con. 
A file path for the standardized annotated concepts for txt - """ - - start = 0 - end = 0 - - with open(txt) as f: - - # Get entire file - text = f.read() - self.text = text - - # Split into lines - self.data = map(lambda s: s.split(), text.split('\n')) - - # Tokenize each sentence into words (and save line number indices) - toks = [] - gold = [] # Actual lines - - for sent in self.data: - - gold.append(sent) - - # Keep track of which indices each line has - for word in sent: - end += len(word) + 1 - - self.line_inds.append( (start,end-1) ) - start = end - - # Skip ahead to next non-whitespace - while (start < len(text)) and text[start].isspace(): start += 1 - - - # If an accompanying concept file was specified, read it - if con: - classifications = [] - with open(con) as f: - for line in f: - - # Empty line - if line == '\n': continue - - # Parse concept file line - fields = line.strip().split('||') - #print fields - concept = fields[0] - span_inds = [] - for i in range(1,len(fields),2): - span = int(fields[i]), int(fields[i+1]) - span_inds.append( span ) - - # FIXME - For now, treat non-contiguous spans as separate - for span in span_inds: - # Add the classification to the Note object - l,(start,end) = lineno_and_tokspan(self.line_inds, self.data, self.text, span) - #print 'span: ', span - #print 'lineno: ', l - #print 'start: ', start - #print 'end: ', end - #print '\n' - classifications.append((concept,l+1,start,end)) - - # Safe guard against concept file having duplicate entries - classifications = list(set(classifications)) - - # Concept file does not guarantee ordering by line number - self.classifications = sorted(classifications,cmp=classification_cmp) - - diff --git a/clicon/notes/utilities_for_notes.py b/clicon/notes/utilities_for_notes.py deleted file mode 100644 index 986246d..0000000 --- a/clicon/notes/utilities_for_notes.py +++ /dev/null @@ -1,194 +0,0 @@ -###################################################################### -# CliNER - utilities.py 
# -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Helper tools for Note objects # -###################################################################### - - -import nltk.data -import re - - -def classification_cmp(a,b): - """ - concept_cmp() - - Purpose: Compare concept classification tokens - """ - a = (int(a[1]), int(a[2])) - b = (int(b[1]), int(b[2])) - - # Sort by line number - if a[0] < b[0]: - return -1 - if a[0] > b[0]: - return 1 - else: - # Resolve lineno ties with indices - if a[1] < b[1]: - return -1 - if a[1] > b[1]: - return 1 - else: - return 0 - - - -def concept_cmp(a,b): - """ - concept_cmp() - - Purpose: Compare concept classification tokens - """ - return a[1][0] < b[1][0] - - - -# Helper function -def lineno_and_tokspan(line_inds, data, text, char_span): - """ File character offsets => line number and index into line """ - for i,span in enumerate(line_inds): - if char_span[1] <= span[1]: - - #print - #print "span: ", span - - # start and end of span relative to sentence - start = char_span[0] - span[0] - end = char_span[1] - span[0] - - #print "START: ", start - #print "END: ", end - - #print "USING span on text: ~" + text[span[0]:span[1]] + '~' - #print "USING start and end: ~" + text[span[0]:span[1]][start:end]+'~' - - #print "data", data[i] - tok_span = [0,len(data[i])-1] - char_count = 0 - - dataWithEmptyChars = re.split(" |\n|\t", text[span[0]:span[1] + 1]) - - index = 0 - for j,tok in enumerate(dataWithEmptyChars): - if char_count > end: - tok_span[1] = index -1 - break - elif char_count == start: - tok_span[0] = index - char_count += len(tok) + 1 - if len(tok) > 0: - index += 1 - #print '\t',j, '\t', tok, '(', char_count, ')' - - #print start, end - #print tok_span - #print text[span[0]:span[1]] - #print data[i][tok_span[0]:tok_span[1]] - #print - - # return line number AND token span - #print "LINE: ", i - #print "TOK SPAN: ", tok_span - #print data[i] - #print tok_span - - #print "USING char_span on text: ", 
text[char_span[0]:char_span[1]] - #print "USING tok_span on data[i]", data[i][tok_span[0]], data[i][tok_span[1]] - #print "USING char_span on text: ", text[char_span[0]], text[char_span[1]] - - return (i, tuple(tok_span)) - - return None - - - - -# Helper function -def lno_and_tokspan__to__char_span(line_inds, data, text, lineno, tokspan): - """ File character offsets => line number and index into line """ - - start,end = line_inds[lineno] - - dataWithEmpty= text[start:end].replace('\n',' ').replace('\t',' ').split(' ') - - print 'start: ', start - print 'end: ', end - print 'dataWith: ', dataWithEmpty - print - print 'data: ', data[lineno] - print '\n\n\n' - - tokPosRelToSent = [] - count = 0 - for string in dataWithEmptyChars: - if string != '': - tokPosRelToSent.append((count, count + len(string)-1)) - count += len(string) + 1 - else: # empty string - count += 1 - - #print tokPosRelToSent - #print tokPosRelToSent[startTok:endTok+1] - - startOfTokRelToText = tokPosRelToSent[startTok][0] + start - endOfTokRelToText = tokPosRelToSent[ endTok][1] + start - - #print '---' + self.text[endOfTokRelToText-3:endOfTokRelToText+4] + '---' - - #print startOfTokRelToText, ' ', endOfTokRelToText - - # Heuristc / Hack for determining when to include extra space - if ( self.text[endOfTokRelToText ].isalpha()) and \ - (not self.text[endOfTokRelToText+1].isalpha()) : - endOfTokRelToText += 1 - - #print startOfTokRelToText, ' ', endOfTokRelToText - #print '\n' - - if line not in spans: - spans[line] = (self.fileName + ".text||Disease_Disorder||CUI-less||" + str(startOfTokRelToText) + "||" + str(endOfTokRelToText)) - else: - spans[line] += ("\n" + self.fileName + ".text||Disease_Disorder||CUI-less||" + str(startOfTokRelToText) + "||" + str(endOfTokRelToText)) - - - print lineno - print tokspan - - line = data[lineno] - - print line - - print - - return 0,0 - - - - -# Break file into sentences. 
-class SentenceTokenizer: - - def __init__(self): - self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') - - def tokenize(self, text_file): - """ Split the document into sentences """ - text = open(text_file, 'r').read() - return self.sent_tokenizer.tokenize(text) - - - -# Break sentence into words -class WordTokenizer: - - # TODO - PunktWordTokenizer (http://www.nltk.org/api/nltk.tokenize.html) - def __init__(self): - pass - - def tokenize(self, sent): - """ Split the sentence into tokens """ - return sent.split() - diff --git a/clicon/predict.py b/clicon/predict.py deleted file mode 100644 index 9e64fca..0000000 --- a/clicon/predict.py +++ /dev/null @@ -1,131 +0,0 @@ -###################################################################### -# CliNER - predict.py # -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Use trained model to predict concept labels for data. # -###################################################################### - - -__author__ = 'Willie Boag' -__date__ = 'Oct. 
5, 2014' - - -import os -import sys -import glob -import argparse -import helper - -from model import Model -from notes.note import Note - - -def main(): - - parser = argparse.ArgumentParser() - - parser.add_argument("-i", - dest = "input", - help = "The input files to predict", - default = os.path.join(os.getenv('CLICON_DIR'), 'data/test_data/*') - ) - - parser.add_argument("-o", - dest = "output", - help = "The directory to write the output", - default = os.path.join(os.getenv('CLICON_DIR'), 'data/test_predictions') - ) - - parser.add_argument("-m", - dest = "model", - help = "The model to use for prediction", - default = os.path.join(os.getenv('CLICON_DIR'), 'models/run.model') - ) - - parser.add_argument("-f", - dest = "format", - help = "Data format ( " + ' | '.join(Note.supportedFormats()) + " )", - default = 'i2b2' - ) - - parser.add_argument("-crf", - dest = "with_crf", - help = "Specify where to find crfsuite", - - default = None - ) - - args = parser.parse_args() - - - # Parse arguments - files = glob.glob(args.input) - helper.mkpath(args.output) - format = args.format - - - # Predict - predict(files, args.model, args.output, format=format) - - - -def predict(files, model_path, output_dir, format): - - # Must specify output format - if format not in Note.supportedFormats(): - print >>sys.stderr, '\n\tError: Must specify output format' - print >>sys.stderr, '\tAvailable formats: ', ' | '.join(Note.supportedFormats()) - print >>sys.stderr, '' - exit(1) - - - - # Load model - model = Model.load(model_path) - - - # Tell user if not predicting - if not files: - print >>sys.stderr, "\n\tNote: You did not supply any input files\n" - exit() - - - # For each file, predict concept labels - n = len(files) - for i,txt in enumerate(sorted(files)): - - # Read the data into a Note object - note = Note(format) - note.read(txt) - - - print '-' * 30 - print '\n\t%d of %d' % (i+1,n) - print '\t', txt, '\n' - - - # Predict concept labels - labels = model.predict(note) - - # 
Get predictions in proper format - extension = note.getExtension() - output = note.write(labels) - - #print output - - # Output file - fname = os.path.splitext(os.path.basename(txt))[0] + '.' + extension - out_path = os.path.join(output_dir, fname) - - # Output the concept predictions - print '\n\nwriting to: ', out_path - with open(out_path, 'w') as f: - print >>f, output - print - - - - -if __name__ == '__main__': - main() diff --git a/clicon/train.py b/clicon/train.py deleted file mode 100644 index 7e9d30c..0000000 --- a/clicon/train.py +++ /dev/null @@ -1,139 +0,0 @@ -###################################################################### -# CliNER - train.py # -# # -# Willie Boag wboag@cs.uml.edu # -# # -# Purpose: Build model for given training data. # -###################################################################### - - -__author__ = 'Willie Boag' -__date__ = 'Oct. 5, 2014' - - -import os -import os.path -import glob -import argparse -import cPickle as pickle - -import helper -from sets import Set -from model import Model -from notes.note import Note - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument("-t", - dest = "txt", - help = "The files that contain the training examples", - default = os.path.join(os.getenv('CLICON_DIR'), 'data/train/txt/*') - ) - - parser.add_argument("-c", - dest = "con", - help = "The files that contain the labels for the training examples", - default = os.path.join(os.getenv('CLICON_DIR'), 'data/train/con/*') - ) - - parser.add_argument("-m", - dest = "model", - help = "Path to the model that should be generated", - default = os.path.join(os.getenv('CLICON_DIR'), 'models/run.model') - ) - - parser.add_argument("-f", - dest = "format", - help = "Data format ( " + ' | '.join(Note.supportedFormats()) + " )", - default = 'i2b2' - ) - - parser.add_argument("-g", - dest = "grid", - help = "A flag indicating whether to perform a grid search", - action = "store_true" - ) - - parser.add_argument("-no-crf", - 
dest = "nocrf", - help = "A flag indicating whether to use crfsuite for pass one.", - action = "store_true" - ) - - # Parse the command line arguments - args = parser.parse_args() - is_crf = not args.nocrf - - - # A list of text file paths - # A list of concept file paths - txt_files = glob.glob(args.txt) - con_files = glob.glob(args.con) - - - # data format - format = args.format - - - # Must specify output format - if format not in Note.supportedFormats(): - print >>sys.stderr, '\n\tError: Must specify output format' - print >>sys.stderr, '\tAvailable formats: ', ' | '.join(Note.supportedFormats()) - print >>sys.stderr, '' - exit(1) - - - # Collect training data file paths - txt_files_map = helper.map_files(txt_files) # ex. {'record-13': 'record-13.con'} - con_files_map = helper.map_files(con_files) - - training_list = [] # ex. training_list = [ ('record-13.txt', 'record-13.con') ] - for k in txt_files_map: - if k in con_files_map: - training_list.append((txt_files_map[k], con_files_map[k])) - - - # display file names (for user to see data was properly located) - print '\n', training_list, '\n' - - - # Train the model - train(training_list, args.model, format, is_crf=is_crf, grid=args.grid) - - - -def train(training_list, model_path, format, is_crf=True, grid=False): - - # Read the data into a Note object - notes = [] - for txt, con in training_list: - note_tmp = Note(format) # Create Note - note_tmp.read(txt, con) # Read data into Note - notes.append(note_tmp) # Add the Note to the list - - - # file names - if not notes: - print 'Error: Cannot train on 0 files. Terminating train.' 
- return 1 - - - # Create a Machine Learning model - model = Model(is_crf=is_crf) - - - # Train the model using the Note's data - model.train(notes, grid) - - - # Pickle dump - print 'pickle dump' - with open(model_path, "wb") as m_file: - pickle.dump(model, m_file) - - - -if __name__ == '__main__': - main() diff --git a/cliner b/cliner new file mode 100755 index 0000000..c9d90ec --- /dev/null +++ b/cliner @@ -0,0 +1,62 @@ +#!/usr/bin/env python + +###################################################################### +# CliNER - cliner # +# # +# Willie Boag wboag@cs.uml.edu # +# # +# Purpose: Command Line Interface for working with cliner. # +###################################################################### + + +import sys +import os + + +def main(): + + commands = ['train', 'predict', 'evaluate'] + + help_msg = \ + ''' + Usage: cliner [OPTIONS] COMMAND [ARGS]... + + Options: + --help Show this message and exit. + + Commands: + %s + ''' % '\n '.join(commands) + + # Is argument correct? 
+ if len(sys.argv)<2 or sys.argv[1] not in commands or sys.argv[1] == '--help': + sys.stderr.write('%s\n\n'%(help_msg)) + exit(1) + + # select appropriate sub-command + subcmd = sys.argv[1] + del sys.argv[1] + + # Where to import code from + homedir = os.path.dirname(os.path.abspath(__file__)) + codedir = os.path.join(homedir, 'code') + if codedir not in sys.path: + sys.path.append(codedir) + + # Call appropriate sub-command + if subcmd == 'train': + import train + train.main() + elif subcmd == 'predict': + import predict + predict.main() + elif subcmd == 'evaluate': + import evaluate + evaluate.main() + + + +if __name__ == '__main__': + main() + + diff --git a/code/.gitignore b/code/.gitignore new file mode 100644 index 0000000..fca57bc --- /dev/null +++ b/code/.gitignore @@ -0,0 +1,3 @@ +*.pyc + +__evaluate.py diff --git a/code/DatasetCliner_experimental.py b/code/DatasetCliner_experimental.py new file mode 100644 index 0000000..d938ccb --- /dev/null +++ b/code/DatasetCliner_experimental.py @@ -0,0 +1,441 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Oct 27 12:55:40 2017 + +@author: elena +""" + +import sklearn.preprocessing +import collections +import codecs +#import utils_nlp +import re +import time +#import token +import os +import pickle +import random +import numpy as np +import helper_dataset as hd + + + + +def lists_to_dataset_structure(sentences_tokens,sentence_tags,total_token_counter,token_count,label_count,character_count): + labels=[] + tokens=[] + new_label_sequence=[] + new_token_sequence=[] + + features="" + feature_file_name="" + feature_vector_size=0 + + for idx,sentence in enumerate(sentences_tokens): + for token_idx,token_i in enumerate(sentence): + new_token_sequence.append(token_i) + new_label_sequence.append(sentence_tags[idx][token_idx]) + + token_count[token_i] += 1 + label_count[sentence_tags[idx][token_idx]] += 1 + + if token_idx == len(sentence) - 1: + labels.append(new_label_sequence) + 
tokens.append(new_token_sequence) + new_token_sequence = [] + new_label_sequence = [] + # FEATURES ARE NOT SUPPORTED: Can be done if we are getting a third list that looks like [[f1,f2,f3],[f1,f2,f3]... for each token] + token_features=[] + features_as_array=np.array(token_features,dtype=np.dtype('int32')) + features_as_array=features_as_array.reshape((features_as_array.shape[0],1)) + features_as_array=np.transpose(features_as_array) + + features="" + feature_file_name="" + feature_vector_size=0 + + + total_token_counter+=1 + for character in token_i: + character_count[character] += 1 + + return labels, tokens, token_count, label_count, character_count,features,feature_file_name,feature_vector_size + + + + + + +class Dataset(object): + """A class for handling data sets.""" + + def __init__(self, name='', verbose=False, debug=False): + self.name = name + self.verbose = verbose + self.debug = debug + + + def _parse_dataset(self, dataset_filepath, dataset_type, sentences_list=[],tags_list=[], Not_here=False): + + token_count = collections.defaultdict(lambda: 0) #initialized by a function + label_count = collections.defaultdict(lambda: 0) + character_count = collections.defaultdict(lambda: 0) + longest_sentence=0 + + # Currently Not supported, features + #feature_file_name=os.getcwd()+os.sep+"test_cliner"+dataset_type+".hdf5" + # size_of_features=0 + + + # Currentlt Not supported - features + # f = h5py.File(feature_file_name, "w") + # dset = f.create_dataset("word-features", (0, size_of_features), maxshape=(None, size_of_features),dtype=np.dtype('int32'), chunks=True) #44 + #dt = h5py.special_dtype(vlen=np.dtype('int32')) + #sentence_words=f.create_dataset("sentences-words",(0,),dtype=dt,chunks=True,maxshape=(None,)) + + line_count =-1 + sent_count=-1 + total_token_counter=0 + token_counter_offset_sent=0 + + sentence_counter=0 + + tokens=[] + labels=[] + features=[] + + characters=[] # NOT USED (?) 
+ + #extract token features for agumentation + token_features=[] + + token_lengths=[] + new_token_sequence=[] + new_label_sequence = [] + #new_token_features_sequence=[] + + + + + #labels, tokens, token_count, label_count, character_count,features,feature_file_name,feature_vector_size + if Not_here==False: + labels, tokens, token_count, label_count, character_count,features,feature_file_name,feature_vector_size=lists_to_dataset_structure(sentences_list,tags_list,total_token_counter,token_count,label_count,character_count) + + + + return labels, tokens, token_count, label_count, character_count,features,feature_file_name,feature_vector_size + + + + + def _convert_to_indices(self, dataset_types): + # Frank and Jennies Function + + tokens = self.tokens + labels = self.labels + token_to_index = self.token_to_index + character_to_index = self.character_to_index + label_to_index = self.label_to_index + index_to_label = self.index_to_label + + # Map tokens and labels to their indices + token_indices = {} + label_indices = {} + characters = {} + token_lengths = {} + character_indices = {} + character_indices_padded = {} + for dataset_type in dataset_types: + print (dataset_type) + token_indices[dataset_type] = [] + characters[dataset_type] = [] + character_indices[dataset_type] = [] + token_lengths[dataset_type] = [] + character_indices_padded[dataset_type] = [] + + for token_sequence in tokens[dataset_type]: + token_indices[dataset_type].append([token_to_index.get(token, self.UNK_TOKEN_INDEX) for token in token_sequence]) + characters[dataset_type].append([list(token) for token in token_sequence]) + character_indices[dataset_type].append([[character_to_index.get(character, random.randint(1, max(self.index_to_character.keys()))) for character in token] for token in token_sequence]) + token_lengths[dataset_type].append([len(token) for token in token_sequence]) + longest_token_length_in_sequence = max(token_lengths[dataset_type][-1]) + 
character_indices_padded[dataset_type].append([hd.pad_list(temp_token_indices, longest_token_length_in_sequence, self.PADDING_CHARACTER_INDEX) for temp_token_indices in character_indices[dataset_type][-1]]) + + label_indices[dataset_type] = [] + for label_sequence in labels[dataset_type]: + label_indices[dataset_type].append([label_to_index[label] for label in label_sequence]) + + label_binarizer = sklearn.preprocessing.LabelBinarizer() + label_binarizer.fit(range(max(index_to_label.keys()) + 1)) + label_vector_indices = {} + for dataset_type in dataset_types: + label_vector_indices[dataset_type] = [] + for label_indices_sequence in label_indices[dataset_type]: + label_vector_indices[dataset_type].append(label_binarizer.transform(label_indices_sequence)) + + return token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices + + def update_dataset(self, dataset_filepaths, dataset_types, Datasets_tokens, Datasets_labels): + + ''' + dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy' + Overwrites the data of type specified in dataset_types using the existing token_to_index, character_to_index, and label_to_index mappings. 
+ ''' + + + # def _parse_dataset(self, dataset_filepath, dataset_type, sentences_list=[],tags_list=[], Not_here=False): + for dataset_type in dataset_types: + print (dataset_type) + self.labels[dataset_type], self.tokens[dataset_type], _, _, _,_,_,_= self._parse_dataset("",dataset_type, Datasets_tokens[dataset_type],Datasets_labels[dataset_type]) + + token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices(dataset_types) + + self.token_indices.update(token_indices) + self.label_indices.update(label_indices) + self.character_indices_padded.update(character_indices_padded) + self.character_indices.update(character_indices) + self.token_lengths.update(token_lengths) + self.characters.update(characters) + self.label_vector_indices.update(label_vector_indices) + + def load_dataset(self,avaliable_datasets_sent,avaliable_datasets_labels, dataset_filepaths, parameters, token_to_vector=None,pretrained_dataset=None): + ''' + dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy' + ''' + start_time = time.time() + print('Load dataset... 
\n') + if parameters['token_pretrained_embedding_filepath'] != '': + if token_to_vector==None: + token_to_vector = hd.load_pretrained_token_embeddings(parameters) + else: + token_to_vector = {} + + all_tokens_in_pretraining_dataset = [] + all_characters_in_pretraining_dataset = [] + + if parameters['use_pretrained_model']: + + + + #temp_pretrained_dataset_adress="./models/NN_models/1235-4/dataset.pickle" #"./models/NN_models/1234-5/dataset.pickle" + if pretrained_dataset==None: + temp_pretrained_dataset_adress=parameters['model_folder']+os.sep+"dataset.pickle" + pretraining_dataset = pickle.load(open(temp_pretrained_dataset_adress, "rb")) + print ("Pre-loading Pre-trained dataset objects") + else: + pretraining_dataset=pretrained_dataset + print ("Pretrained dataset was pre-loaded") + + all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values() + all_characters_in_pretraining_dataset = pretraining_dataset.index_to_character.values() + + + remap_to_unk_count_threshold = 1 + self.UNK_TOKEN_INDEX = 0 + self.PADDING_CHARACTER_INDEX = 0 + self.tokens_mapped_to_unk = [] + self.UNK = 'UNK' + self.unique_labels = [] + labels = {} + tokens = {} + label_count = {} + token_count = {} + character_count = {} + + + features={} + features_file_names={} + feature_vector_size={} + #deploy + + for dataset_type in ['train', 'valid', 'test','deploy']: + Not_here=False + + if dataset_type not in avaliable_datasets_sent: + Not_here=True + #_parse_dataset(self, dataset_filepath,dataset_type,sentences_list="",tags_list="") + if Not_here==False: + labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type], features[dataset_type], \ + features_file_names[dataset_type],feature_vector_size[dataset_type] \ + = self._parse_dataset("", dataset_type, sentences_list=avaliable_datasets_sent[dataset_type], tags_list=avaliable_datasets_labels[dataset_type]) + + + if Not_here==True: + labels[dataset_type], 
tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type], features[dataset_type], \ + features_file_names[dataset_type],feature_vector_size[dataset_type] \ + = self._parse_dataset("", dataset_type, sentences_list=[], tags_list=[]) # + + + token_count['all'] = {} + for token in list(token_count['train'].keys()) + list(token_count['valid'].keys()) + list(token_count['test'].keys()) + list(token_count['deploy'].keys()): + token_count['all'][token] = token_count['train'][token] + token_count['valid'][token] + token_count['test'][token] + token_count['deploy'][token] + + + + if parameters['load_all_pretrained_token_embeddings']: + for token in token_to_vector: + if token not in token_count['all']: + token_count['all'][token] = -1 + token_count['train'][token] = -1 + for token in all_tokens_in_pretraining_dataset: + if token not in token_count['all']: + token_count['all'][token] = -1 + token_count['train'][token] = -1 + + character_count['all'] = {} + for character in list(character_count['train'].keys()) + list(character_count['valid'].keys()) + list(character_count['test'].keys()) + list(character_count['deploy'].keys()): + character_count['all'][character] = character_count['train'][character] + character_count['valid'][character] + character_count['test'][character] + character_count['deploy'][character] + + for character in all_characters_in_pretraining_dataset: + if character not in character_count['all']: + character_count['all'][character] = -1 + character_count['train'][character] = -1 + + + label_count['all'] = {} + for character in list(label_count['train'].keys()) + list(label_count['valid'].keys()) + list(label_count['test'].keys()) + list(label_count['deploy'].keys()): + label_count['all'][character] = label_count['train'][character] + label_count['valid'][character] + label_count['test'][character] + label_count['deploy'][character] + + token_count['all'] = hd.order_dictionary(token_count['all'], 
'value_key', reverse = True) + label_count['all'] = hd.order_dictionary(label_count['all'], 'key', reverse = False) + character_count['all'] = hd.order_dictionary(character_count['all'], 'value', reverse = True) + if self.verbose: print('character_count[\'all\']: {0}'.format(character_count['all'])) + + + + token_to_index = {} + token_to_index[self.UNK] = self.UNK_TOKEN_INDEX + iteration_number = 0 + number_of_unknown_tokens = 0 + if self.verbose: print("parameters['remap_unknown_tokens_to_unk']: {0}".format(parameters['remap_unknown_tokens_to_unk'])) + if self.verbose: print("len(token_count['train'].keys()): {0}".format(len(token_count['train'].keys()))) + for token, count in token_count['all'].items(): + if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1 + + if parameters['remap_unknown_tokens_to_unk'] == 1 and \ + (token_count['train'][token] == 0 or \ + parameters['load_only_pretrained_token_embeddings']) and \ + not hd.is_token_in_pretrained_embeddings(token, token_to_vector, parameters) and \ + token not in all_tokens_in_pretraining_dataset: + token_to_index[token] = self.UNK_TOKEN_INDEX + number_of_unknown_tokens += 1 + self.tokens_mapped_to_unk.append(token) + else: + token_to_index[token] = iteration_number + iteration_number += 1 + + infrequent_token_indices = [] + for token, count in token_count['train'].items(): + if 0 < count <= remap_to_unk_count_threshold: + infrequent_token_indices.append(token_to_index[token]) + #if self.verbose: print("len(token_count['train']): {0}".format(len(token_count['train']))) + # if self.verbose: print("len(infrequent_token_indices): {0}".format(len(infrequent_token_indices))) + + # Ensure that both B- and I- versions exist for each label + labels_without_bio = set() + for label in label_count['all'].keys(): + new_label = hd.remove_bio_from_label_name(label) + labels_without_bio.add(new_label) + for label in labels_without_bio: + if label == 'O': + continue + if parameters['tagging_format'] == 'bioes': + 
prefixes = ['B-', 'I-', 'E-', 'S-'] + else: + prefixes = ['B-', 'I-'] + for prefix in prefixes: + l = prefix + label + if l not in label_count['all']: + label_count['all'][l] = 0 + label_count['all'] = hd.order_dictionary(label_count['all'], 'key', reverse = False) + + if parameters['use_pretrained_model']: + + print ("USE_PRETRAINED_MODEL ACTIVE") + self.unique_labels = sorted(list(pretraining_dataset.label_to_index.keys())) + # Make sure labels are compatible with the pretraining dataset. + for label in label_count['all']: + if label not in pretraining_dataset.label_to_index: + raise AssertionError("The label {0} does not exist in the pretraining dataset. ".format(label) + + "Please ensure that only the following labels exist in the dataset: {0}".format(', '.join(self.unique_labels))) + label_to_index = pretraining_dataset.label_to_index.copy() + else: + label_to_index = {} + iteration_number = 0 + for label, count in label_count['all'].items(): + label_to_index[label] = iteration_number + iteration_number += 1 + self.unique_labels.append(label) + + + character_to_index = {} + iteration_number = 0 + for character, count in character_count['all'].items(): + if iteration_number == self.PADDING_CHARACTER_INDEX: iteration_number += 1 + character_to_index[character] = iteration_number + iteration_number += 1 + + + token_to_index = hd.order_dictionary(token_to_index, 'value', reverse = False) + if self.verbose: print('token_to_index: {0}'.format(token_to_index)) + index_to_token = hd.reverse_dictionary(token_to_index) + if parameters['remap_unknown_tokens_to_unk'] == 1: index_to_token[self.UNK_TOKEN_INDEX] = self.UNK + if self.verbose: print('index_to_token: {0}'.format(index_to_token)) + + label_to_index = hd.order_dictionary(label_to_index, 'value', reverse = False) + index_to_label = hd.reverse_dictionary(label_to_index) + + character_to_index = hd.order_dictionary(character_to_index, 'value', reverse = False) + index_to_character = 
hd.reverse_dictionary(character_to_index) + + self.token_to_index = token_to_index + self.index_to_token = index_to_token + self.index_to_character = index_to_character + self.character_to_index = character_to_index + self.index_to_label = index_to_label + self.label_to_index = label_to_index + + + self.tokens = tokens + self.labels = labels + + + + dataset_types=['train','test','valid','deploy'] + token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices(dataset_types) + + self.token_indices = token_indices + self.label_indices = label_indices + self.character_indices_padded = character_indices_padded + self.character_indices = character_indices + self.token_lengths = token_lengths + self.characters = characters + self.label_vector_indices = label_vector_indices + + self.number_of_classes = max(self.index_to_label.keys()) + 1 + self.vocabulary_size = max(self.index_to_token.keys()) + 1 + self.alphabet_size = max(self.index_to_character.keys()) + 1 + + + # unique_labels_of_interest is used to compute F1-scores. 
+ self.unique_labels_of_interest = list(self.unique_labels) + self.unique_labels_of_interest.remove('O') + + self.unique_label_indices_of_interest = [] + for lab in self.unique_labels_of_interest: + self.unique_label_indices_of_interest.append(label_to_index[lab]) + + self.infrequent_token_indices = infrequent_token_indices + + + elapsed_time = time.time() - start_time + print('done ({0:.2f} seconds)'.format(elapsed_time)) + + + + self.feature_vector_size=0 + + + return token_to_vector + diff --git a/clicon/features_dir/__init__.py b/code/LSTM/__init__.py similarity index 100% rename from clicon/features_dir/__init__.py rename to code/LSTM/__init__.py diff --git a/clicon/__init__.py b/code/__init__.py similarity index 100% rename from clicon/__init__.py rename to code/__init__.py diff --git a/code/conlleval b/code/conlleval new file mode 100644 index 0000000..70e4ad2 --- /dev/null +++ b/code/conlleval @@ -0,0 +1,315 @@ +#!/usr/bin/perl -w +# conlleval: evaluate result of processing CoNLL-2000 shared task +# usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file +# README: http://cnts.uia.ac.be/conll2000/chunking/output.html +# options: l: generate LaTeX output for tables like in +# http://cnts.uia.ac.be/conll2003/ner/example.tex +# r: accept raw result tags (without B- and I- prefix; +# assumes one word per chunk) +# d: alternative delimiter tag (default is single space) +# o: alternative outside tag (default is O) +# note: the file should contain lines with items separated +# by $delimiter characters (default space). The final +# two items should contain the correct tag and the +# guessed tag in that order. Sentences should be +# separated from each other by empty lines or lines +# with $boundary fields (default -X-). 
+# url: http://lcg-www.uia.ac.be/conll2000/chunking/ +# started: 1998-09-25 +# version: 2004-01-26 +# author: Erik Tjong Kim Sang + +use strict; + +my $false = 0; +my $true = 42; + +my $boundary = "-X-"; # sentence boundary +my $correct; # current corpus chunk tag (I,O,B) +my $correctChunk = 0; # number of correctly identified chunks +my $correctTags = 0; # number of correct chunk tags +my $correctType; # type of current corpus chunk tag (NP,VP,etc.) +my $delimiter = " "; # field delimiter +my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) +my $firstItem; # first feature (for sentence boundary checks) +my $foundCorrect = 0; # number of chunks in corpus +my $foundGuessed = 0; # number of identified chunks +my $guessed; # current guessed chunk tag +my $guessedType; # type of current guessed chunk tag +my $i; # miscellaneous counter +my $inCorrect = $false; # currently processed chunk is correct until now +my $lastCorrect = "O"; # previous chunk tag in corpus +my $latex = 0; # generate LaTeX formatted output +my $lastCorrectType = ""; # type of previously identified chunk tag +my $lastGuessed = "O"; # previously identified chunk tag +my $lastGuessedType = ""; # type of previous chunk tag in corpus +my $lastType; # temporary storage for detecting duplicates +my $line; # line +my $nbrOfFeatures = -1; # number of features per line +my $precision = 0.0; # precision score +my $oTag = "O"; # outside tag, default O +my $raw = 0; # raw input: add B to every token +my $recall = 0.0; # recall score +my $tokenCounter = 0; # token counter (ignores sentence breaks) + +my %correctChunk = (); # number of correctly identified chunks per type +my %foundCorrect = (); # number of chunks in corpus per type +my %foundGuessed = (); # number of identified chunks per type + +my @features; # features on line +my @sortedTypes; # sorted list of chunk type names + +# sanity check +while (@ARGV and $ARGV[0] =~ /^-/) { + if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } + elsif ($ARGV[0] eq "-r") 
{ $raw = 1; shift(@ARGV); } + elsif ($ARGV[0] eq "-d") { + shift(@ARGV); + if (not defined $ARGV[0]) { + die "conlleval: -d requires delimiter character"; + } + $delimiter = shift(@ARGV); + } elsif ($ARGV[0] eq "-o") { + shift(@ARGV); + if (not defined $ARGV[0]) { + die "conlleval: -o requires delimiter character"; + } + $oTag = shift(@ARGV); + } else { die "conlleval: unknown argument $ARGV[0]\n"; } +} +if (@ARGV) { die "conlleval: unexpected command line argument\n"; } +# process input +while () { + chomp($line = $_); + @features = split(/$delimiter/,$line); + if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } + elsif ($nbrOfFeatures != $#features and @features != 0) { + printf STDERR "unexpected number of features: %d (%d)\n", + $#features+1,$nbrOfFeatures+1; + exit(1); + } + if (@features == 0 or + $features[0] eq $boundary) { @features = ($boundary,"O","O"); } + if (@features < 2) { + die "conlleval: unexpected number of features in line $line\n"; + } + if ($raw) { + if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } + if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } + if ($features[$#features] ne "O") { + $features[$#features] = "B-$features[$#features]"; + } + if ($features[$#features-1] ne "O") { + $features[$#features-1] = "B-$features[$#features-1]"; + } + } + # 20040126 ET code which allows hyphens in the types + if ($features[$#features] =~ /^([^-]*)-(.*)$/) { + $guessed = $1; + $guessedType = $2; + } else { + $guessed = $features[$#features]; + $guessedType = ""; + } + pop(@features); + if ($features[$#features] =~ /^([^-]*)-(.*)$/) { + $correct = $1; + $correctType = $2; + } else { + $correct = $features[$#features]; + $correctType = ""; + } + pop(@features); +# ($guessed,$guessedType) = split(/-/,pop(@features)); +# ($correct,$correctType) = split(/-/,pop(@features)); + $guessedType = $guessedType ? $guessedType : ""; + $correctType = $correctType ? 
$correctType : ""; + $firstItem = shift(@features); + + # 1999-06-26 sentence breaks should always be counted as out of chunk + if ( $firstItem eq $boundary ) { $guessed = "O"; } + + if ($inCorrect) { + if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and + &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and + $lastGuessedType eq $lastCorrectType) { + $inCorrect=$false; + $correctChunk++; + $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? + $correctChunk{$lastCorrectType}+1 : 1; + } elsif ( + &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != + &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or + $guessedType ne $correctType ) { + $inCorrect=$false; + } + } + + if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and + &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and + $guessedType eq $correctType) { $inCorrect = $true; } + + if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { + $foundCorrect++; + $foundCorrect{$correctType} = $foundCorrect{$correctType} ? + $foundCorrect{$correctType}+1 : 1; + } + if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { + $foundGuessed++; + $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? + $foundGuessed{$guessedType}+1 : 1; + } + if ( $firstItem ne $boundary ) { + if ( $correct eq $guessed and $guessedType eq $correctType ) { + $correctTags++; + } + $tokenCounter++; + } + + $lastGuessed = $guessed; + $lastCorrect = $correct; + $lastGuessedType = $guessedType; + $lastCorrectType = $correctType; +} +if ($inCorrect) { + $correctChunk++; + $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 
+ $correctChunk{$lastCorrectType}+1 : 1; +} + +if (not $latex) { + # compute overall precision, recall and FB1 (default values are 0.0) + $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); + $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); + $FB1 = 2*$precision*$recall/($precision+$recall) + if ($precision+$recall > 0); + + # print overall performance + printf "processed $tokenCounter tokens with $foundCorrect phrases; "; + printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; + if ($tokenCounter>0) { + printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; + printf "precision: %6.2f%%; ",$precision; + printf "recall: %6.2f%%; ",$recall; + printf "FB1: %6.2f\n",$FB1; + } +} + +# sort chunk type names +undef($lastType); +@sortedTypes = (); +foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { + if (not($lastType) or $lastType ne $i) { + push(@sortedTypes,($i)); + } + $lastType = $i; +} +# print performance per chunk type +if (not $latex) { + for $i (@sortedTypes) { + $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; + if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } + else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } + if (not($foundCorrect{$i})) { $recall = 0.0; } + else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } + if ($precision+$recall == 0.0) { $FB1 = 0.0; } + else { $FB1 = 2*$precision*$recall/($precision+$recall); } + printf "%17s: ",$i; + printf "precision: %6.2f%%; ",$precision; + printf "recall: %6.2f%%; ",$recall; + printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; + } +} else { + print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; + for $i (@sortedTypes) { + $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; + if (not($foundGuessed{$i})) { $precision = 0.0; } + else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } + if (not($foundCorrect{$i})) { $recall = 0.0; } + else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } + if ($precision+$recall == 0.0) { $FB1 = 0.0; } + else { $FB1 = 2*$precision*$recall/($precision+$recall); } + printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", + $i,$precision,$recall,$FB1; + } + print "\\hline\n"; + $precision = 0.0; + $recall = 0; + $FB1 = 0.0; + $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); + $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); + $FB1 = 2*$precision*$recall/($precision+$recall) + if ($precision+$recall > 0); + printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", + $precision,$recall,$FB1; +} + +exit 0; + +# endOfChunk: checks if a chunk ended between the previous and current word +# arguments: previous and current chunk tags, previous and current types +# note: this code is capable of handling other chunk representations +# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong +# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 + +sub endOfChunk { + my $prevTag = shift(@_); + my $tag = shift(@_); + my $prevType = shift(@_); + my $type = shift(@_); + my $chunkEnd = $false; + + if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } + if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } + if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } + if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } + + if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } + if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } + if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } + if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } + + if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { + $chunkEnd = $true; + } + + # corrected 1998-12-22: these chunks are assumed to have length 1 + if ( $prevTag eq "]" ) { $chunkEnd = $true; } + if ( $prevTag eq "[" ) { $chunkEnd = $true; } + + return($chunkEnd); +} + +# startOfChunk: checks if a chunk started between the previous and current word +# arguments: previous and current chunk tags, previous and current types +# note: this code is capable of handling other chunk representations +# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong +# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 + +sub startOfChunk { + my $prevTag = shift(@_); + my $tag = shift(@_); + my $prevType = shift(@_); + my $type = shift(@_); + my $chunkStart = $false; + + if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } + if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } + if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } + if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } + + if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } + if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } + if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } + if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } + + if ($tag ne "O" and $tag ne "." 
and $prevType ne $type) { + $chunkStart = $true; + } + + # corrected 1998-12-22: these chunks are assumed to have length 1 + if ( $tag eq "[" ) { $chunkStart = $true; } + if ( $tag eq "]" ) { $chunkStart = $true; } + + return($chunkStart); +} diff --git a/code/entity_lstm.py b/code/entity_lstm.py new file mode 100644 index 0000000..c9013f7 --- /dev/null +++ b/code/entity_lstm.py @@ -0,0 +1,491 @@ +import tensorflow as tf +import numpy as np +import codecs +import re +import time +#import utils_tf +#import utils_nlp +import helper_dataset as hd +import tensorflow.contrib.layers as layers +import os +import pickle +import utils_tf + +# TO DO: ADD CNN LAYER + +def bidirectional_GRU(input,hidden_state_dimension,initializer,sequence_length=None, output_sequence=True): + print ("Biderectional GRU") + with tf.variable_scope("biderectional_GRU"): + if sequence_length==None: + batch_size=1 # ONE WORD(char) + sequence_length = tf.shape(input)[1] + sequence_length = tf.expand_dims(sequence_length, axis=0, name='sequence_length') #NOT SURE IF IT EVER HAPPENS + else: + batch_size= tf.shape(sequence_length)[0] + + + gru_cell={} + initial_state={} + for direction in ["forward","backward"]: + gru_cell[direction] = tf.contrib.rnn.GRUCell(hidden_state_dimension) + initial_state[direction]=gru_cell[direction].zero_state(batch_size, tf.float32) + outputs,final_states = tf.nn.bidirectional_dynamic_rnn(gru_cell["forward"],gru_cell["backward"],input, sequence_length=sequence_length,initial_state_fw=initial_state["forward"],initial_state_bw=initial_state["backward"]) + + + if output_sequence==True: + outputs_forward, outputs_backward = outputs + output = tf.concat([outputs_forward, outputs_backward], axis=2, name='output_sequence') + + else: + final_states_forward, final_states_backward = final_states + + output = tf.concat([final_states_forward, final_states_backward], axis=1, name='output') #111 + + return output + + + + +def bidirectional_LSTM(input, hidden_state_dimension, 
initializer, sequence_length=None, output_sequence=True): + + print ("Biderectional LSTM") + with tf.variable_scope("bidirectional_LSTM"): + if sequence_length == None: + batch_size = 1 + sequence_length = tf.shape(input)[1] + sequence_length = tf.expand_dims(sequence_length, axis=0, name='sequence_length') + else: + batch_size = tf.shape(sequence_length)[0] + + lstm_cell = {} + initial_state = {} + for direction in ["forward", "backward"]: + with tf.variable_scope(direction): + # LSTM cell + lstm_cell[direction] = tf.contrib.rnn.CoupledInputForgetGateLSTMCell(hidden_state_dimension, use_peepholes=False, forget_bias=1.0, initializer=initializer, state_is_tuple=True, activation=tf.tanh) # tf.tanh (default to RELU) + # lstm_cell[direction] = tf.contrib.rnn_cell.GRUCell(hidden_state_dimension,activation=tf.tanh,) + + + # initial state: http://stackoverflow.com/questions/38441589/tensorflow-rnn-initial-state + initial_cell_state = tf.get_variable("initial_cell_state", shape=[1, hidden_state_dimension], dtype=tf.float32, initializer=initializer) + initial_output_state = tf.get_variable("initial_output_state", shape=[1, hidden_state_dimension], dtype=tf.float32, initializer=initializer) + c_states = tf.tile(initial_cell_state, tf.stack([batch_size, 1])) + h_states = tf.tile(initial_output_state, tf.stack([batch_size, 1])) + initial_state[direction] = tf.contrib.rnn.LSTMStateTuple(c_states, h_states) + + # sequence_length must be provided for tf.nn.bidirectional_dynamic_rnn due to internal bug + outputs, final_states = tf.nn.bidirectional_dynamic_rnn(lstm_cell["forward"], + lstm_cell["backward"], + input, + dtype=tf.float32, + sequence_length=sequence_length, + initial_state_fw=initial_state["forward"], + initial_state_bw=initial_state["backward"]) + if output_sequence == True: + outputs_forward, outputs_backward = outputs + output = tf.concat([outputs_forward, outputs_backward], axis=2, name='output_sequence') + else: + # max pooling +# outputs_forward, outputs_backward 
= outputs +# output = tf.concat([outputs_forward, outputs_backward], axis=2, name='output_sequence') +# output = tf.reduce_max(output, axis=1, name='output') + # last pooling + final_states_forward, final_states_backward = final_states + output = tf.concat([final_states_forward[1], final_states_backward[1]], axis=1, name='output') + + return output + + + + +class EntityLSTM(object): + """ + An LSTM architecture for named entity recognition. + Uses a character embedding layer followed by an LSTM to generate vector representation from characters for each token. + Then the character vector is concatenated with token embedding vector, which is input to another LSTM followed by a CRF layer. + """ + def __init__(self, dataset, parameters): + + self.verbose = False + self.feature_vector_length=parameters['Feature_vector_length'] + + # Placeholders for input, output and dropout + self.input_token_indices = tf.placeholder(tf.int32, [None], name="input_token_indices") + self.input_label_indices_vector = tf.placeholder(tf.float32, [None, dataset.number_of_classes], name="input_label_indices_vector") + self.input_label_indices_flat = tf.placeholder(tf.int32, [None], name="input_label_indices_flat") + self.input_token_character_indices = tf.placeholder(tf.int32, [None, None], name="input_token_indices") + self.input_token_lengths = tf.placeholder(tf.int32, [None], name="input_token_lengths") + self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") + + self.input_features=tf.placeholder(tf.float32, [None,self.feature_vector_length], name="features") + + + self.vocabulary_size=dataset.vocabulary_size + + # Internal parameters + initializer = tf.contrib.layers.xavier_initializer() + + if parameters['use_character_lstm']: + with tf.variable_scope("character_embedding"): + self.character_embedding_weights = tf.get_variable( + "character_embedding_weights", + shape=[dataset.alphabet_size, parameters['character_embedding_dimension']], + initializer=initializer) 
+ embedded_characters = tf.nn.embedding_lookup(self.character_embedding_weights, self.input_token_character_indices, name='embedded_characters') + if self.verbose: print("embedded_characters: {0}".format(embedded_characters)) + # utils_tf.variable_summaries(self.character_embedding_weights) + + # Character LSTM layer + with tf.variable_scope('character_lstm') as vs: + if parameters['Use_LSTM']==True: + character_lstm_output = bidirectional_LSTM(embedded_characters, parameters['character_lstm_hidden_state_dimension'], initializer, + sequence_length=self.input_token_lengths, output_sequence=False) + self.character_lstm_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) + else: + character_lstm_output = bidirectional_GRU(embedded_characters, parameters['character_lstm_hidden_state_dimension'], initializer, + sequence_length=self.input_token_lengths, output_sequence=False) + # Attention, not implemented + + # with tf.variable_scope('attention') as scope: + # word_level_output = task_specific_attention(character_lstm_output,dataset.token_lengths,scope=scope) + # print (w) + + # sentence_inputs = tf.reshape(word_level_output, [self.document_size, self.sentence_size, self.word_output_size]) + + + + + + # Token embedding layer + with tf.variable_scope("token_embedding"): + self.token_embedding_weights = tf.get_variable( + "token_embedding_weights", + shape=[dataset.vocabulary_size, parameters['token_embedding_dimension']], + initializer=initializer, + trainable=not parameters['freeze_token_embeddings']) + embedded_tokens = tf.nn.embedding_lookup(self.token_embedding_weights, self.input_token_indices) + # utils_tf.variable_summaries(self.token_embedding_weights) + + # Concatenate character LSTM outputs and token embeddings + if parameters['use_character_lstm']: + with tf.variable_scope("concatenate_token_and_character_vectors"): + if self.verbose: print('embedded_tokens: {0}'.format(embedded_tokens)) + token_lstm_input = 
tf.concat([character_lstm_output, embedded_tokens], axis=1, name='token_lstm_input') + if self.verbose: print("token_lstm_input: {0}".format(token_lstm_input)) + else: + token_lstm_input = embedded_tokens + + if parameters['use_features_before_final_lstm']: + with tf.variable_scope("features_argumentation_pre_LSTM"): + token_lstm_input=tf.concat([token_lstm_input, self.input_features], 1) + print (token_lstm_input) + + + # Add dropout + with tf.variable_scope("dropout"): + token_lstm_input_drop = tf.nn.dropout(token_lstm_input, self.dropout_keep_prob, name='token_lstm_input_drop') + if self.verbose: print("token_lstm_input_drop: {0}".format(token_lstm_input_drop)) + # https://www.tensorflow.org/api_guides/python/contrib.rnn + # Prepare data shape to match `rnn` function requirements + # Current data input shape: (batch_size, n_steps, n_input) + # Required shape: 'n_steps' tensors list of shape (batch_size, n_input) + token_lstm_input_drop_expanded = tf.expand_dims(token_lstm_input_drop, axis=0, name='token_lstm_input_drop_expanded') + if self.verbose: print("token_lstm_input_drop_expanded: {0}".format(token_lstm_input_drop_expanded)) + + #if parameters['use_features_before_final_lstm']: + # with tf.variable_scope("features_argumentation_pre_LSTM"): + # token_lstm_input_drop_expanded=tf.concat([token_lstm_input_drop_expanded, self.input_features], 1) + # print (token_lstm_input_drop_expanded) + + # Token LSTM layer + with tf.variable_scope('token_lstm') as vs: + if parameters['Use_LSTM']==True: token_lstm_output = bidirectional_LSTM(token_lstm_input_drop_expanded, parameters['token_lstm_hidden_state_dimension'], initializer, output_sequence=True) + else: token_lstm_output = bidirectional_GRU(token_lstm_input_drop_expanded, parameters['token_lstm_hidden_state_dimension'], initializer, output_sequence=True) + token_lstm_output_squeezed = tf.squeeze(token_lstm_output, axis=0) + self.token_lstm_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 
scope=vs.name) + + # Needed only if Bidirectional LSTM is used for token level + with tf.variable_scope("feedforward_after_lstm") as vs: + W = tf.get_variable( + "W", + shape=[2 * parameters['token_lstm_hidden_state_dimension'], parameters['token_lstm_hidden_state_dimension']], + initializer=initializer) + b = tf.Variable(tf.constant(0.0, shape=[parameters['token_lstm_hidden_state_dimension']]), name="bias") + outputs = tf.nn.xw_plus_b(token_lstm_output_squeezed, W, b, name="output_before_tanh") + outputs = tf.nn.tanh(outputs, name="output_after_tanh") + self.token_lstm_variables += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) + + with tf.variable_scope("feedforward_before_crf") as vs: + W = tf.get_variable( + "W", + shape=[parameters['token_lstm_hidden_state_dimension'], dataset.number_of_classes], + initializer=initializer) + b = tf.Variable(tf.constant(0.0, shape=[dataset.number_of_classes]), name="bias") + scores = tf.nn.xw_plus_b(outputs, W, b, name="scores") + self.unary_scores = scores + self.predictions = tf.argmax(self.unary_scores, 1, name="predictions") + #utils_tf.variable_summaries(W) + # utils_tf.variable_summaries(b) + self.feedforward_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) + + # CRF layer + if parameters['use_crf']: + print ("CRF IS IN USE") + with tf.variable_scope("crf") as vs: + # Add start and end tokens + small_score = -1000.0 + large_score = 0.0 + sequence_length = tf.shape(self.unary_scores)[0] + unary_scores_with_start_and_end = tf.concat([self.unary_scores, tf.tile( tf.constant(small_score, shape=[1, 2]) , [sequence_length, 1])], 1) + start_unary_scores = [[small_score] * dataset.number_of_classes + [large_score, small_score]] + end_unary_scores = [[small_score] * dataset.number_of_classes + [small_score, large_score]] + self.unary_scores = tf.concat([start_unary_scores, unary_scores_with_start_and_end, end_unary_scores], 0) + start_index = dataset.number_of_classes + end_index = 
dataset.number_of_classes + 1 + input_label_indices_flat_with_start_and_end = tf.concat([ tf.constant(start_index, shape=[1]), self.input_label_indices_flat, tf.constant(end_index, shape=[1]) ], 0) + + # Apply CRF layer + sequence_length = tf.shape(self.unary_scores)[0] + sequence_lengths = tf.expand_dims(sequence_length, axis=0, name='sequence_lengths') + unary_scores_expanded = tf.expand_dims(self.unary_scores, axis=0, name='unary_scores_expanded') + input_label_indices_flat_batch = tf.expand_dims(input_label_indices_flat_with_start_and_end, axis=0, name='input_label_indices_flat_batch') + if self.verbose: print('unary_scores_expanded: {0}'.format(unary_scores_expanded)) + if self.verbose: print('input_label_indices_flat_batch: {0}'.format(input_label_indices_flat_batch)) + if self.verbose: print("sequence_lengths: {0}".format(sequence_lengths)) + # https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/crf + # Compute the log-likelihood of the gold sequences and keep the transition params for inference at test time. 
+ self.transition_parameters=tf.get_variable( + "transitions", + shape=[dataset.number_of_classes+2, dataset.number_of_classes+2], + initializer=initializer) + #utils_tf.variable_summaries(self.transition_parameters) + log_likelihood, _ = tf.contrib.crf.crf_log_likelihood( + unary_scores_expanded, input_label_indices_flat_batch, sequence_lengths, transition_params=self.transition_parameters) + self.loss = tf.reduce_mean(-log_likelihood, name='cross_entropy_mean_loss') + self.accuracy = tf.constant(1) + + self.crf_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) # LATER FOR RESTORE + + # Do not use CRF layer + else: + with tf.variable_scope("crf") as vs: + self.transition_parameters = tf.get_variable( + "transitions", + shape=[dataset.number_of_classes+2, dataset.number_of_classes+2], + initializer=initializer) + # utils_tf.variable_summaries(self.transition_parameters) + self.crf_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) + + # Calculate mean cross-entropy loss + with tf.variable_scope("loss"): + losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.unary_scores, labels=self.input_label_indices_vector, name='softmax') + self.loss = tf.reduce_mean(losses, name='cross_entropy_mean_loss') + with tf.variable_scope("accuracy"): + correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_label_indices_vector, 1)) + self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, 'float'), name='accuracy') + + self.define_training_procedure(parameters) + self.summary_op = tf.summary.merge_all() + self.saver = tf.train.Saver(max_to_keep=100) + + + + def define_training_procedure(self, parameters): + # Define training procedure + self.global_step = tf.Variable(0, name="global_step", trainable=False) + if parameters['optimizer'] == 'adam': + self.optimizer = tf.train.AdamOptimizer(parameters['learning_rate']) + elif parameters['optimizer'] == 'sgd': + self.optimizer = 
tf.train.GradientDescentOptimizer(parameters['learning_rate']) + elif parameters['optimizer'] == 'adadelta': + self.optimizer = tf.train.AdadeltaOptimizer(parameters['learning_rate']) + else: + raise ValueError('The lr_method parameter must be either adadelta, adam or sgd.') + + grads_and_vars = self.optimizer.compute_gradients(self.loss) + #MODIFY: + if parameters['gradient_clipping_value']: + def ClipIfNotNone(grad): + if grad is None: + return grad + return tf.clip_by_value(grad, -5.0, 5.0) + grads_and_vars = [(ClipIfNotNone(grad), var) for grad, var in grads_and_vars] + + self.train_op = self.optimizer.apply_gradients(grads_and_vars, global_step=self.global_step) + + + def load_pretrained_token_embeddings(self, sess, dataset, parameters, token_to_vector=None): + if parameters['token_pretrained_embedding_filepath'] == '': + return + # Load embeddings + start_time = time.time() + print('Load token embeddings... ', end='', flush=True) + if token_to_vector == None: + token_to_vector = hd.load_pretrained_token_embeddings(parameters) + + initial_weights = sess.run(self.token_embedding_weights.read_value()) + number_of_loaded_word_vectors = 0 + number_of_token_original_case_found = 0 + number_of_token_lowercase_found = 0 + number_of_token_digits_replaced_with_zeros_found = 0 + number_of_token_lowercase_and_digits_replaced_with_zeros_found = 0 + for token in dataset.token_to_index.keys(): + if token in token_to_vector.keys(): + initial_weights[dataset.token_to_index[token]] = token_to_vector[token] + number_of_token_original_case_found += 1 + elif parameters['check_for_lowercase'] and token.lower() in token_to_vector.keys(): + initial_weights[dataset.token_to_index[token]] = token_to_vector[token.lower()] + number_of_token_lowercase_found += 1 + elif parameters['check_for_digits_replaced_with_zeros'] and re.sub('\d', '0', token) in token_to_vector.keys(): + initial_weights[dataset.token_to_index[token]] = token_to_vector[re.sub('\d', '0', token)] + 
number_of_token_digits_replaced_with_zeros_found += 1 + elif parameters['check_for_lowercase'] and parameters['check_for_digits_replaced_with_zeros'] and re.sub('\d', '0', token.lower()) in token_to_vector.keys(): + initial_weights[dataset.token_to_index[token]] = token_to_vector[re.sub('\d', '0', token.lower())] + number_of_token_lowercase_and_digits_replaced_with_zeros_found += 1 + else: + continue + number_of_loaded_word_vectors += 1 + elapsed_time = time.time() - start_time + print('done ({0:.2f} seconds)'.format(elapsed_time)) + print("number_of_token_original_case_found: {0}".format(number_of_token_original_case_found)) + print("number_of_token_lowercase_found: {0}".format(number_of_token_lowercase_found)) + print("number_of_token_digits_replaced_with_zeros_found: {0}".format(number_of_token_digits_replaced_with_zeros_found)) + print("number_of_token_lowercase_and_digits_replaced_with_zeros_found: {0}".format(number_of_token_lowercase_and_digits_replaced_with_zeros_found)) + print('number_of_loaded_word_vectors: {0}'.format(number_of_loaded_word_vectors)) + print("dataset.vocabulary_size: {0}".format(dataset.vocabulary_size)) + sess.run(self.token_embedding_weights.assign(initial_weights)) + + + def load_embeddings_from_pretrained_model(self, sess, dataset, pretraining_dataset, pretrained_embedding_weights, embedding_type='token'): + if embedding_type == 'token': + embedding_weights = self.token_embedding_weights + index_to_string = dataset.index_to_token + pretraining_string_to_index = pretraining_dataset.token_to_index + elif embedding_type == 'character': + embedding_weights = self.character_embedding_weights + index_to_string = dataset.index_to_character + pretraining_string_to_index = pretraining_dataset.character_to_index + # Load embeddings + start_time = time.time() + print('Load {0} embeddings from pretrained model... 
'.format(embedding_type), end='', flush=True) + initial_weights = sess.run(embedding_weights.read_value()) + + if embedding_type == 'token': + initial_weights[dataset.UNK_TOKEN_INDEX] = pretrained_embedding_weights[pretraining_dataset.UNK_TOKEN_INDEX] + elif embedding_type == 'character': + initial_weights[dataset.PADDING_CHARACTER_INDEX] = pretrained_embedding_weights[pretraining_dataset.PADDING_CHARACTER_INDEX] + + number_of_loaded_vectors = 1 + for index, string in index_to_string.items(): + if index == dataset.UNK_TOKEN_INDEX: + continue + if string in pretraining_string_to_index.keys(): + initial_weights[index] = pretrained_embedding_weights[pretraining_string_to_index[string]] + number_of_loaded_vectors += 1 + elapsed_time = time.time() - start_time + print('done ({0:.2f} seconds)'.format(elapsed_time)) + print("number_of_loaded_vectors: {0}".format(number_of_loaded_vectors)) + if embedding_type == 'token': + print("dataset.vocabulary_size: {0}".format(dataset.vocabulary_size)) + elif embedding_type == 'character': + print("dataset.alphabet_size: {0}".format(dataset.alphabet_size)) + sess.run(embedding_weights.assign(initial_weights)) + + + def resize_without_redoing_model(self, parameters,new_dataset_vocab_size,sess): + "" + utils_tf.resize_tensor_variable(sess, self.token_embedding_weights, [new_dataset_vocab_size, parameters['token_embedding_dimension']]) + + + + def restore_from_pretrained_model(self, parameters, dataset, sess, token_to_vector=None,pretrained_dataset=None): + + temp_pretrained_dataset_adress=parameters['model_folder']+os.sep+"dataset.pickle" + temp_pretrained_model_adress=parameters['model_folder']+os.sep+parameters['model_name'] + + print (temp_pretrained_model_adress) + + if pretrained_dataset==None: + pretraining_dataset = pickle.load(open(temp_pretrained_dataset_adress, 'rb')) + else: + print ("PRETRAINING HERE") + pretraining_dataset=pretrained_dataset + + + pretrained_model_checkpoint_filepath = temp_pretrained_model_adress + + 
assert pretraining_dataset.index_to_label == dataset.index_to_label # DEBUG fron F&J + + # If the token and character mappings are exactly the same + if pretraining_dataset.index_to_token == dataset.index_to_token and pretraining_dataset.index_to_character == dataset.index_to_character: + + # Restore the pretrained model + self.saver.restore(sess, pretrained_model_checkpoint_filepath) # Works only when the dimensions of tensor variables are matched. + del pretraining_dataset + + # If the token and character mappings are different between the pretrained model and the current model + else: + print ("INDEX TO TOKEN DO NOT MATCH") + + # Resize the token and character embedding weights to match them with the pretrained model (required in order to restore the pretrained model) + utils_tf.resize_tensor_variable(sess, self.character_embedding_weights, [pretraining_dataset.alphabet_size, parameters['character_embedding_dimension']]) + utils_tf.resize_tensor_variable(sess, self.token_embedding_weights, [pretraining_dataset.vocabulary_size, parameters['token_embedding_dimension']]) + + # Restore the pretrained model + self.saver.restore(sess, pretrained_model_checkpoint_filepath) # Works only when the dimensions of tensor variables are matched. 
+ + # Get pretrained embeddings + character_embedding_weights, token_embedding_weights = sess.run([self.character_embedding_weights, self.token_embedding_weights]) + + # Restore the sizes of token and character embedding weights + utils_tf.resize_tensor_variable(sess, self.character_embedding_weights, [dataset.alphabet_size, parameters['character_embedding_dimension']]) + utils_tf.resize_tensor_variable(sess, self.token_embedding_weights, [dataset.vocabulary_size, parameters['token_embedding_dimension']]) + + # Re-initialize the token and character embedding weights + sess.run(tf.variables_initializer([self.character_embedding_weights, self.token_embedding_weights])) + + # Load embedding weights from pretrained token embeddings first + self.load_pretrained_token_embeddings(sess, dataset, parameters, token_to_vector=token_to_vector) + self.load_embeddings_from_pretrained_model(sess, dataset, pretraining_dataset, token_embedding_weights, embedding_type='token') + self.load_embeddings_from_pretrained_model(sess, dataset, pretraining_dataset, character_embedding_weights, embedding_type='character') + + del pretraining_dataset + del character_embedding_weights + del token_embedding_weights + + # Get transition parameters + transition_params_trained = sess.run(self.transition_parameters) + + parameters={'reload_character_embeddings': True, 'reload_character_lstm':True, 'reload_token_embeddings':True, 'reload_token_lstm':True, 'reload_feedforward':True, 'reload_crf':True} + if not parameters['reload_character_embeddings']: + sess.run(tf.variables_initializer([self.character_embedding_weights])) + if not parameters['reload_character_lstm']: + sess.run(tf.variables_initializer(self.character_lstm_variables)) + if not parameters['reload_token_embeddings']: + sess.run(tf.variables_initializer([self.token_embedding_weights])) + if not parameters['reload_token_lstm']: + sess.run(tf.variables_initializer(self.token_lstm_variables)) + if not parameters['reload_feedforward']: + 
def main():
    """
    Command-line entry point: score predicted concept files against gold
    annotations using the i2b2 evaluation jar.

    Requires --predictions and --gold directories plus --format (only 'i2b2'
    is supported); exits with status 1 on missing/invalid arguments.
    """
    import tempfile

    # Parse command line arguments
    parser = argparse.ArgumentParser(prog='cliner evaluate')
    parser.add_argument("--predictions",
                        dest="pred",
                        help="Directory where predictions are stored.")
    parser.add_argument("--gold",
                        dest="gold",
                        help="Directory where gold standard is stored.")
    parser.add_argument("--format",
                        dest="format",
                        help="Data format ( con ) ")
    args = parser.parse_args()

    if not args.pred:
        sys.stderr.write('\n\tERROR: must provide --pred argument\n\n')
        parser.print_help(sys.stderr)
        sys.stderr.write('\n')
        exit(1)

    if not args.gold:
        sys.stderr.write('\n\tERROR: must provide --gold argument\n\n')
        parser.print_help(sys.stderr)
        sys.stderr.write('\n')
        exit(1)

    if args.format:
        data_format = args.format  # renamed: 'format' shadows the builtin
    else:
        sys.stderr.write('\n\tERROR: must provide --format argument\n\n')
        parser.print_help(sys.stderr)
        sys.stderr.write('\n')
        exit(1)

    # Must specify output format
    if data_format not in ['i2b2']:
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats: i2b2\n')
        sys.stderr.write('\n')
        parser.print_help(sys.stderr)
        sys.stderr.write('\n')
        exit(1)

    # Pair up gold/predicted files that refer to the same document.
    ref_files = [os.path.join(args.gold, f) for f in os.listdir(args.gold)]
    pred_files = [os.path.join(args.pred, f) for f in os.listdir(args.pred)]

    ref_files_map = tools.map_files(ref_files)
    pred_files_map = tools.map_files(pred_files)

    files = [(pred_files_map[k], ref_files_map[k])
             for k in ref_files_map if k in pred_files_map]

    # BUGFIX: the tuples above are (pred, gold); the original unpacked them as
    # `gold_list, pred_list = zip(*files)`, silently swapping system and
    # reference when invoking the evaluation jar below.
    pred_list, gold_list = zip(*files)

    # Create a unique temporary directory for the jar's inputs.
    # BUGFIX: the original named it with random.randint(0, 256), which collides
    # (and crashes os.mkdir) when a stale run left a directory behind.
    cliner_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    tmp_dir = os.path.join(cliner_dir, 'data', 'tmp')
    tempdir_name = tempfile.mkdtemp(prefix='cliner_eval_', dir=tmp_dir)

    pred_dir = os.path.join(tempdir_name, 'pred/')
    gold_dir = os.path.join(tempdir_name, 'gold/')

    try:
        os.mkdir(pred_dir)
        os.mkdir(gold_dir)

        # Copy the paired files where the jar expects them.
        for pred_file in pred_list:
            shutil.copy(pred_file, pred_dir)
        for gold_file in gold_list:
            shutil.copy(gold_file, gold_dir)

        # Run the official i2b2 evaluation jar.
        eval_dir = os.path.join(cliner_dir, 'tools',)
        eval_jar = os.path.join(eval_dir, 'i2b2va-eval.jar')
        cmd = 'java -jar %s -rcp %s -scp %s -ft con -ex all' % (eval_jar, gold_dir, pred_dir)
        status = subprocess.call(cmd, shell=True, stdout=sys.stdout)
    finally:
        # Always clean up the temporary directory, even if the jar fails.
        shutil.rmtree(tempdir_name)
def assess_model(y_pred, y_true, labels, target_names, labels_with_o, target_names_with_o, evaluation_mode='bio', verbose=False):
    """
    Score predictions against gold labels.

    Returns a dict with keys 'classification_report', 'f1_score' (containing
    'weighted'/'micro'/'macro' averages plus a 'per_label' list, all scaled
    to 0-100) and 'accuracy_score'. `labels_with_o`, `target_names_with_o`,
    `evaluation_mode` and `verbose` are accepted for interface compatibility
    but not used here.
    """
    assert len(y_true) == len(y_pred)

    report = sklearn.metrics.classification_report(
        y_true, y_pred, labels=labels, target_names=target_names,
        sample_weight=None, digits=4)

    f1_scores = {
        style: sklearn.metrics.f1_score(y_true, y_pred, average=style, labels=labels) * 100
        for style in ('weighted', 'micro', 'macro')
    }
    per_label_f1 = sklearn.metrics.precision_recall_fscore_support(
        y_true, y_pred, average=None, labels=labels)[2]
    f1_scores['per_label'] = [score * 100 for score in per_label_f1.tolist()]

    results = {
        'classification_report': report,
        'f1_score': f1_scores,
        'accuracy_score': sklearn.metrics.accuracy_score(y_true, y_pred) * 100,
    }

    print(results['classification_report'])
    print(results['f1_score']['per_label'])

    return results
def remap_labels(y_pred, y_true, dataset, evaluation_mode='bio'):
    '''
    y_pred: list of predicted label indices
    y_true: list of gold label indices
    evaluation_mode: only 'bio' is supported at this point

    Re-indexes both label sequences onto a canonical ordering: entity labels
    sorted by (entity type, full BIO name), with 'O' placed last. Returns
    (new_y_pred, new_y_true, label_indices, label_names,
    label_indices_with_o, label_names_with_o), where the first
    indices/names pair excludes 'O' and the second includes it.
    '''
    if evaluation_mode != 'bio':
        raise ValueError("At this point only 'bio' is accepted")

    # Canonical label order: entities sorted by type then name, 'O' last.
    new_label_names = [lbl for lbl in dataset.unique_labels if lbl != 'O']
    new_label_names.sort(key=lambda name: (hd.remove_bio_from_label_name(name), name))
    new_label_names.append('O')

    new_label_indices = list(range(len(new_label_names)))
    new_label_to_index = dict(zip(new_label_names, new_label_indices))

    # Map the dataset's original label indices onto the new ordering.
    remap_index = {dataset.label_to_index[name]: i
                   for i, name in enumerate(new_label_names)}

    new_y_pred = [remap_index[label_index] for label_index in y_pred]
    new_y_true = [remap_index[label_index] for label_index in y_true]

    # Keep copies that still contain 'O', then drop 'O' from the main lists.
    new_label_indices_with_o = new_label_indices[:]
    new_label_names_with_o = new_label_names[:]
    new_label_names.remove('O')
    new_label_indices.remove(new_label_to_index['O'])

    return (new_y_pred, new_y_true, new_label_indices, new_label_names,
            new_label_indices_with_o, new_label_names_with_o)
def extract_features(tok_sents):
    """
    extract_features()

    Compute per-token feature dictionaries for every sentence.

    @param tok_sents  A list of tokenized sentences (1 sent = 1 line from file)
    @return           A list (one entry per sentence) of lists of feature dicts
    """
    # GENIA tagging (if enabled) is done once, up front, for the whole document.
    sentence_features_preprocess(tok_sents)
    return [extract_features_sentence(sent) for sent in tok_sents]


def sentence_features_preprocess(data):
    """Run the optional GENIA tagger over the document and cache its output."""
    global feat_genia
    tagger = enabled['GENIA']
    # Only run the GENIA tagger when the module is configured.
    if tagger:
        feat_genia = GeniaFeatures(tagger, data)
def extract_features_sentence(sentence):
    """
    extract_features_sentence

    Compute a dict-based feature representation for each token of a sentence.

    @param sentence  A list of tokens.
    @return          A list of feature dictionaries, one per token; each key is
                     a (feature-name, value) tuple mapped to a numeric weight.
    """
    features_list = []

    # Base word-level features for every token.
    for i, word in enumerate(sentence):
        features_list.append(feat_word.IOB_prose_features(sentence[i]))

    # Feature: Bag of Words unigram context (window=3)
    if 'unigram_context' in enabled_IOB_prose_sentence_features:
        window = 3
        n = len(sentence)

        # Previous unigrams
        for i in range(n):
            end = min(i, window)
            for j, u in enumerate(sentence[i-end:i]):
                features_list[i][('prev_unigrams-%d' % j, u)] = 1

        # Next unigrams
        for i in range(n):
            end = min(i + window, n-1)
            for j, u in enumerate(sentence[i+1:end+1]):
                features_list[i][('next_unigrams-%d' % j, u)] = 1

    # POS-tag once; needed by both 'pos' and 'pos_context'.
    # BUGFIX: previously this ran only when 'pos' was enabled, so enabling
    # 'pos_context' alone raised a NameError below.
    if ('pos' in enabled_IOB_prose_sentence_features
            or 'pos_context' in enabled_IOB_prose_sentence_features):
        pos_tagged = nltk_tagger.tag(sentence)

    # Allow for particular features to be enabled
    for feature in enabled_IOB_prose_sentence_features:

        # Feature: Part of Speech
        if feature == 'pos':
            for (i, (_, pos)) in enumerate(pos_tagged):
                features_list[i].update({('pos', pos): 1})

        # Feature: POS context
        # BUGFIX: this was guarded by `'pos_context' in enabled...`, so it
        # re-ran once per enabled feature (idempotent but wasteful).
        if feature == 'pos_context':
            window = 3
            n = len(sentence)

            # Previous POS
            for i in range(n):
                end = min(i, window)
                for j, p in enumerate(pos_tagged[i-end:i]):
                    features_list[i][('prev_pos_context-%d' % j, p[1])] = 1

            # Next POS
            for i in range(n):
                end = min(i + window, n-1)
                for j, p in enumerate(pos_tagged[i+1:i+end+1]):
                    # BUGFIX: this key was 'prev_pos_context-%d', colliding
                    # with the previous-POS features above and erasing the
                    # prev/next distinction. (Note: models trained on the old
                    # feature names must be retrained.)
                    features_list[i][('next_pos_context-%d' % j, p[1])] = 1

        # GENIA features
        if (feature == 'GENIA') and enabled['GENIA']:
            genia_feat_list = feat_genia.features(sentence)
            for i, feat_dict in enumerate(genia_feat_list):
                features_list[i].update(feat_dict)

        # Feature: UMLS Word Features (only use prose ones)
        if (feature == "UMLS") and enabled['UMLS']:
            umls_features = feat_umls.extract_umls_features(sentence)
            for i in range(len(sentence)):
                features_list[i].update(umls_features[i])

    # Copies of neighboring tokens' features ('prev', 'prev2', 'next', 'next2'),
    # derived from the base features computed above.
    ngram_features = [{} for _ in range(len(features_list))]
    if "prev" in enabled_IOB_prose_sentence_features:
        prev = lambda f: {("prev_" + k[0], k[1]): v for k, v in f.items()}
        prev_list = list(map(prev, features_list))
        for i in range(len(features_list)):
            if i == 0:
                ngram_features[i][("prev", "*")] = 1  # sentence-start sentinel
            else:
                ngram_features[i].update(prev_list[i-1])

    if "prev2" in enabled_IOB_prose_sentence_features:
        prev2 = lambda f: {("prev2_" + k[0], k[1]): v / 2.0 for k, v in f.items()}
        prev_list = list(map(prev2, features_list))
        for i in range(len(features_list)):
            if i == 0 or i == 1:
                ngram_features[i][("prev2", "*")] = 1
            else:
                ngram_features[i].update(prev_list[i-2])

    if "next" in enabled_IOB_prose_sentence_features:
        nxt = lambda f: {("next_" + k[0], k[1]): v for k, v in f.items()}
        next_list = list(map(nxt, features_list))
        for i in range(len(features_list)):
            if i < len(features_list) - 1:
                ngram_features[i].update(next_list[i+1])
            else:
                ngram_features[i][("next", "*")] = 1  # sentence-end sentinel

    if "next2" in enabled_IOB_prose_sentence_features:
        next2 = lambda f: {("next2_" + k[0], k[1]): v / 2.0 for k, v in f.items()}
        next_list = list(map(next2, features_list))
        for i in range(len(features_list)):
            if i < len(features_list) - 2:
                ngram_features[i].update(next_list[i+2])
            elif i == len(features_list) - 2:
                ngram_features[i][("next2", "**")] = 1
            else:
                ngram_features[i][("next2", "*")] = 1

    # Merge the base and ngram feature dicts per token.
    features_list = [dict(list(features_list[i].items()) + list(ngram_features[i].items()))
                     for i in range(len(features_list))]

    return features_list
def display_enabled_modules():
    """Print the ENABLED/DISABLED status of each optional module (GENIA, UMLS)."""
    print()
    for module, status in enabled.items():
        if status:
            print('\t', module, '\t', ' ENABLED')
        else:
            print('\t', module, '\t', 'DISABLED')
    print()


# ---------------------------------------------------------------------------
# code/feature_extraction/func_cache.py
# ---------------------------------------------------------------------------
"""
 File Name    : func_cache.py
 Creation Date: 19-09-2015
 Created By   : Renan Campos

 Purpose : Class wrapper for third-party lru cache.
           Adds method used for reporting hits/misses.
"""
# BUGFIX: the original module docstring contained unresolved merge-conflict
# markers (<<<<<<< HEAD ... >>>>>>>); both sides said the same thing, so the
# docstring was collapsed to a single copy.

from repoze.lru import lru_cache


class func_cache(lru_cache):
    """LRU-cache decorator (capacity 500) that can report hit/miss statistics."""

    def __init__(self, verbose=False):
        super(func_cache, self).__init__(500)
        self.verbose = verbose

    def ShowInfo(self):
        # This function is only implicitly called if verbose flag is set.
        # BUGFIX: these were Python 2 print statements — a SyntaxError in the
        # Python 3 codebase this file now lives in.
        print("Cache results for:", self.FuncName)
        print(" hits:", self.cache.hits)
        print(" misses:", self.cache.misses)
        print(" lookups:", self.cache.lookups, "\n")

    def __call__(self, f):
        # Decorate f with the underlying lru_cache, then attach the reporter.
        lru_cached = super(func_cache, self).__call__(f)
        lru_cached.ShowInfo = self.ShowInfo
        self.FuncName = f.__name__
        return lru_cached

    def __del__(self):
        if self.verbose:
            self.ShowInfo()


# Test functionality
if __name__ == '__main__':
    @func_cache()
    def rec(n):
        if not n:
            return n
        return rec(n-1)

    rec.ShowInfo()
    print()
    rec(3)
    rec.ShowInfo()
    print()
    rec(3)
    rec.ShowInfo()
import utils + class GeniaCache: def __init__(self): try: prefix = os.path.dirname(__file__) self.filename = os.path.join( prefix, 'genia_cache' ) - self.cache = pickle.load( open( self.filename , "rb" ) ) ; + self.cache = utils.load_pickled_obj(self.filename) except IOError: self.cache = {} def has_key(self, key): - return self.cache.has_key( str(key) ) + return str(key) in self.cache def add_map(self, key, value): self.cache[ str(key) ] = value @@ -21,4 +29,3 @@ def get_map(self, key): def __del__(self): pickle.dump( self.cache, open( self.filename, "wb" ) ) - diff --git a/clicon/features_dir/genia_dir/genia_features.py b/code/feature_extraction/genia_dir/genia_features.py similarity index 52% rename from clicon/features_dir/genia_dir/genia_features.py rename to code/feature_extraction/genia_dir/genia_features.py index b9274aa..4d96adc 100644 --- a/clicon/features_dir/genia_dir/genia_features.py +++ b/code/feature_extraction/genia_dir/genia_features.py @@ -8,62 +8,89 @@ -import interface_genia -from features_dir import utilities +from . import interface_genia +from .. import utils class GeniaFeatures: def __init__(self, tagger, data): - """ Constructor. @param data. A list of split sentences """ + data = [ [w for w in sent if w!=''] for sent in data] # Filter out nonprose sentences - prose = [ sent for sent in data if utilities.prose_sentence(sent) ] + prose = [ sent for sent in data if utils.is_prose_sentence(sent) ] # Process prose sentences with GENIA tagger - self.GENIA_features = iter(interface_genia.genia(tagger, prose)) + #self.GENIA_features = iter(interface_genia.genia(tagger, prose)) + self.gfeatures = {} + gf = interface_genia.genia(tagger, prose) + for sent,feats in zip(prose, gf): + key = '%'.join(sent) + self.gfeatures[key] = feats + #self.GENIA_features = iter(interface_genia.genia(tagger, prose)) - def features(self, sentence, is_prose=True): + def features(self, sentence): """ features() @param sentence. 
A list of words to bind features to - @param is_prose. Mechanism for skipping nonprose (for alignment) @return list of dictionaries (of features) Note: All data is tagged upon instantiation of GeniaFeatures object. This function MUST take each line of the file (in order) as input """ - # Return value is a list of dictionaries (of features) - features_list = [ {} for _ in sentence ] - + sentence = [w for w in sentence if w!=''] # Mechanism to allow for skipping nonprose - if not is_prose: return [] + if not utils.is_prose_sentence(sentence): return [] + # Return value is a list of dictionaries (of features) + features_list = [ {} for _ in sentence ] - # Get the GENIA features of the current sentence - genia_feats = next( self.GENIA_features ) + #print 'sentence: ', sentence + #print 'len(sentence): ', len(sentence) + # Get the GENIA features of the current sentence + #genia_feats = next( self.GENIA_features ) + key = '%'.join(sentence) + genia_feats = self.gfeatures[key] + + ''' + print [ c['GENIA-word'] for c in genia_feats] + print sentence + print + ''' + + #print('\n\n\n') + #print(len(sentence), len(genia_feats)) + for i in range(len(sentence)): + #print(i) + #print(sentence[i]) + #print(genia_feats[i]) + #print() + assert len(sentence[i]) == len(genia_feats[i]['GENIA-word']) + #print 'genia_feats: ', [ f['GENIA-word'] for f in genia_feats ] + #print 'len(genia_feats): ', len(genia_feats) + assert len(sentence) == len(genia_feats) # Feature: Current word's GENIA features for i,curr in enumerate(genia_feats): + assert curr['GENIA-word'] == sentence[i] keys = ['GENIA-stem','GENIA-POS','GENIA-chunktag'] #keys = ['GENIA-stem','GENIA-POS','GENIA-chunktag', 'GENIA-NEtag'] output = dict( ((k, curr[k]), 1) for k in keys if k in curr ) features_list[i].update(output) - return features_list diff --git a/clicon/features_dir/genia_dir/interface_genia.py b/code/feature_extraction/genia_dir/interface_genia.py similarity index 58% rename from 
clicon/features_dir/genia_dir/interface_genia.py rename to code/feature_extraction/genia_dir/interface_genia.py index bf402a8..4d29907 100644 --- a/clicon/features_dir/genia_dir/interface_genia.py +++ b/code/feature_extraction/genia_dir/interface_genia.py @@ -16,11 +16,14 @@ import os import sys -from commands import getstatusoutput - -from genia_cache import GeniaCache +import tempfile +#from commands import getstatusoutput +from subprocess import Popen, PIPE +from .genia_cache import GeniaCache +cliner_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) +tmp_dir = os.path.join(cliner_dir, 'data', 'tmp') def genia(geniatagger, data): @@ -44,24 +47,50 @@ def genia(geniatagger, data): if not cache.has_key(sent): uncached.append(sent) - if uncached: # write list to file and then feed it to GENIA genia_dir = os.path.dirname(geniatagger) - out = os.path.join(genia_dir,'clicon_genia_tmp_file.txt') + + os_handle,out = tempfile.mkstemp(dir=tmp_dir, suffix="genia_temp") + with open(out, 'w') as f: for line in uncached: f.write(line + '\n') # Run genia tagger - print '\t\tRunning GENIA tagger' + print('\t\tRunning GENIA tagger') genia_dir = os.path.dirname(geniatagger) - stream = getstatusoutput('cd %s ; ./geniatagger -nt %s' %(genia_dir,out)) - print '\t\tFinished GENIA tagger' + #stream = getstatusoutput('cd %s ; ./geniatagger -nt %s' %(genia_dir,out)) + p = Popen('cd %s ; ./geniatagger -nt %s' %(genia_dir,out),shell=True,stdout=PIPE,stderr=PIPE) + stream_b, err = p.communicate() + + stream = stream_b.decode('ascii') + + #print '\t\tFinished GENIA tagger' # Organize tagger output linetags = [] tagged = [] - for tag in stream[1].split('\n')[4:]: + + # if the sentence is too long genia outputs an error. + stream_lines = stream.split('\n') + + #print('\n\n\n') + #print(stream_lines) + #print('\n\n\n') + + # get the line the warning might be on. 
+ #potential_warning = "" if len(stream_lines[4:5]) == 0 else stream_lines[4:5][0] + + genia_stream = None + + #genia_stream = stream_lines[4:] + genia_stream = stream_lines + + for tag in genia_stream: + if tag.startswith('warning: the sentence seems to be too long'): + print('WARNING:', tag) + continue + if tag.split(): # Part of line linetags.append(tag) else: # End of line @@ -70,21 +99,44 @@ def genia(geniatagger, data): # Add tagger output to cache for line,tags in zip(uncached,tagged): + #print(line) + for w,feat in zip(line.split(),tags): + #print('\t', w, feat.split('\t')[0]) + assert w == feat.split('\t')[0] + #print('\n\n\n') cache.add_map(line,tags) + #print('-'*80) # Remove temp file + os.close(os_handle) + + #print 'GENIA OUTPUT: ', open(out,"rb").read() + os.remove(out) + for sent in data: + feats = cache.get_map(' '.join(sent)) + #print(sent) + for w,feat in zip(sent,feats): + #print('\t', w, feat.split('\t')[0]) + assert w == feat.split('\t')[0] + #print('\n\n\n') + #exit() # Extract features linefeats = [] retlist = [] for line in data: + + #print 'line: ', line + line = ' '.join(line) # Get tagged output from cache tags = cache.get_map(line) + #print 'tags: ', tags + for tag in tags: tag = tag.split() output = { 'GENIA-word' : tag[0] , @@ -92,12 +144,13 @@ def genia(geniatagger, data): 'GENIA-POS' : tag[2] , 'GENIA-chunktag': tag[3] , 'GENIA-NEtag' : tag[4] } - + linefeats.append(output) retlist.append(linefeats) linefeats = [] + #print 'retlist: ', retlist return retlist diff --git a/clicon/features_dir/read_config.py b/code/feature_extraction/read_config.py similarity index 51% rename from clicon/features_dir/read_config.py rename to code/feature_extraction/read_config.py index 443be12..069bded 100644 --- a/clicon/features_dir/read_config.py +++ b/code/feature_extraction/read_config.py @@ -10,26 +10,27 @@ import os +import sys +CLINER_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), *["..", ".."]) - -# -# enabled_modules -# -# 
@return dictionary of (name,resource path) pairs. -# -# ex. {'UMLS': None, 'GENIA': 'genia/geniatagger-3.0.1/geniatagger'} -# def enabled_modules(): + """ + enabled_modules() + + @return a dictionary of {name, resource} pairs. + ex. {'UMLS': None, 'GENIA': 'genia/geniatagger-3.0.1/geniatagger'} + + >>> enabled_modules() is not None + True + """ # Open config file - filename = os.path.join( os.getenv('CLICON_DIR'), 'config.txt' ) + filename = os.path.join(CLINER_DIR, 'config.txt' ) f = open(filename, 'r') specs = {} - module_list = [ 'GENIA', 'UMLS' ] - - + module_list = ['GENIA', 'UMLS'] for line in f.readlines(): words = line.split() if words: @@ -39,11 +40,19 @@ def enabled_modules(): if words[1] == 'None': specs[words[0]] = None else: - specs[words[0]] = words[1] + specs[words[0]]=os.path.expandvars(words[1]).strip('\"').strip('\'') - return specs + # check if paths are actually valid + if specs["GENIA"] is not None: + if os.path.isfile(specs["GENIA"]) is False: + sys.exit("Invalid path to genia executable.") + if specs["UMLS"] is not None: + if os.path.isdir(specs["UMLS"]) is False: + sys.exit("Invalid path to directory containing UMLS database tables.") + + return specs -# Read from config file when module is imported -print enabled_modules() +if __name__ == "__main__": + print(enabled_modules()) diff --git a/code/feature_extraction/sentence_features.py b/code/feature_extraction/sentence_features.py new file mode 100644 index 0000000..8c02cf1 --- /dev/null +++ b/code/feature_extraction/sentence_features.py @@ -0,0 +1,377 @@ +##################################################################### +# CliCon - sentence_features.py # +# # +# Willie Boag wboag@cs.uml.edu # +# # +# Purpose: Isolate the model's sentence-level features # +###################################################################### + + +import sys +import os +import re + +from utilities import load_pos_tagger + +# What modules are available +from read_config import enabled_modules + 
+CLINER_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), *["..", ".."]) + +# Import feature modules +enabled = enabled_modules() +if enabled['GENIA']: + from genia_dir.genia_features import GeniaFeatures + +# Only create UMLS cache if module is available +if enabled['UMLS']: + from umls_dir import interface_umls + from umls_dir import interpret_umls + + import umls_dir.umls_features as feat_umls + + from umls_dir.umls_cache import UmlsCache + + umls_cache = UmlsCache() + +import word_features as feat_word + +nltk_tagger = load_pos_tagger() + +# Feature Enabling +enabled_concept_features = frozenset( ["UMLS", "grammar_features"] ) + +if enabled['GENIA']: + feat_genia=None + +enabled_IOB_nonprose_sentence_features = [] +enabled_IOB_nonprose_sentence_features.append('pos') +enabled_IOB_nonprose_sentence_features.append('pos_context') +enabled_IOB_nonprose_sentence_features.append('prev') +enabled_IOB_nonprose_sentence_features.append('next') +enabled_IOB_nonprose_sentence_features.append('unigram_context') +enabled_IOB_nonprose_sentence_features.append('UMLS') + +enabled_IOB_prose_sentence_features = [] +enabled_IOB_prose_sentence_features.append('unigram_context') +enabled_IOB_prose_sentence_features.append('pos') +enabled_IOB_prose_sentence_features.append('pos_context') +enabled_IOB_prose_sentence_features.append('prev') +enabled_IOB_prose_sentence_features.append('prev2') +enabled_IOB_prose_sentence_features.append('next') +enabled_IOB_prose_sentence_features.append('next2') +enabled_IOB_prose_sentence_features.append('GENIA') +enabled_IOB_prose_sentence_features.append('UMLS') + + + + +def display_enabled_modules(): + print + for module,status in enabled.items(): + if status: + print '\t', module, '\t', ' ENABLED' + else: + print '\t', module, '\t', 'DISABLED' + print + + + +def sentence_features_preprocess(data): + global feat_genia + tagger = enabled['GENIA'] + # Only run GENIA tagger if module is available + if tagger: + feat_genia = 
GeniaFeatures(tagger,data) + + + +def IOB_prose_features(sentence, data=None): + """ + IOB_prose_features + + @param sentence. A list of strings + @return A list of dictionaries of features + + """ + features_list = [] + + # Initialize feat_genia if not done so already + global feat_genia + if data and enabled['GENIA'] and not feat_genia: + # Only run GENIA tagger if module is available + tagger = enabled['GENIA'] + feat_genia = GeniaFeatures(tagger,data) + + # Get a feature set for each word in the sentence + for i,word in enumerate(sentence): + features_list.append(feat_word.IOB_prose_features(sentence[i])) + + # Feature: Bag of Words unigram conext (window=3) + if 'unigram_context' in enabled_IOB_prose_sentence_features: + window = 3 + n = len(sentence) + + # Previous unigrams + for i in range(n): + end = min(i, window) + unigrams = sentence[i-end:i] + for j,u in enumerate(unigrams): + features_list[i][('prev_unigrams-%d'%j,u)] = 1 + + # Next unigrams + for i in range(n): + end = min(i + window, n-1) + unigrams = sentence[i+1:end+1] + for j,u in enumerate(unigrams): + features_list[i][('next_unigrams-%d'%j,u)] = 1 + + # Only POS tag once + if 'pos' in enabled_IOB_prose_sentence_features: + pos_tagged = nltk_tagger.tag(sentence) + + # Allow for particular features to be enabled + for feature in enabled_IOB_prose_sentence_features: + + + # Feature: Part of Speech + if feature == 'pos': + for (i,(_,pos)) in enumerate(pos_tagged): + features_list[i].update( { ('pos',pos) : 1} ) + + + # Feature: POS context + if 'pos_context' in enabled_IOB_prose_sentence_features: + window = 3 + n = len(sentence) + + # Previous POS + for i in range(n): + end = min(i, window) + for j,p in enumerate(pos_tagged[i-end:i]): + pos = p[1] + features_list[i][('prev_pos_context-%d'%j,pos)] = 1 + + # Next POS + for i in range(n): + end = min(i + window, n-1) + for j,p in enumerate(pos_tagged[i+1:i+end+1]): + pos = p[1] + features_list[i][('prev_pos_context-%d'%j,pos)] = 1 + + + # GENIA 
features + if (feature == 'GENIA') and enabled['GENIA']: + + # Get GENIA features + genia_feat_list = feat_genia.features(sentence) + + ''' + print '\t', sentence + + print '\n\n' + for gf in genia_feat_list: + print '\t', gf + print + print '\n\n' + ''' + + for i,feat_dict in enumerate(genia_feat_list): + features_list[i].update(feat_dict) + + + # Feature: UMLS Word Features (only use prose ones) + if (feature == "UMLS") and enabled['UMLS']: + umls_features = feat_umls.IOB_prose_features(sentence) + for i in range(len(sentence)): + features_list[i].update( umls_features[i] ) + + # Used for 'prev' and 'next' features + ngram_features = [{} for i in range(len(features_list))] + if "prev" in enabled_IOB_prose_sentence_features: + prev = lambda f: {("prev_"+k[0], k[1]): v for k,v in f.items()} + prev_list = map(prev, features_list) + for i in range(len(features_list)): + if i == 0: + ngram_features[i][("prev", "*")] = 1 + else: + ngram_features[i].update(prev_list[i-1]) + + if "prev2" in enabled_IOB_prose_sentence_features: + prev2 = lambda f: {("prev2_"+k[0], k[1]): v/2.0 for k,v in f.items()} + prev_list = map(prev2, features_list) + for i in range(len(features_list)): + if i == 0: + ngram_features[i][("prev2", "*")] = 1 + elif i == 1: + ngram_features[i][("prev2", "*")] = 1 + else: + ngram_features[i].update(prev_list[i-2]) + + if "next" in enabled_IOB_prose_sentence_features: + next = lambda f: {("next_"+k[0], k[1]): v for k,v in f.items()} + next_list = map(next, features_list) + for i in range(len(features_list)): + if i < len(features_list) - 1: + ngram_features[i].update(next_list[i+1]) + else: + ngram_features[i][("next", "*")] = 1 + + if "next2" in enabled_IOB_prose_sentence_features: + next2 = lambda f: {("next2_"+k[0], k[1]): v/2.0 for k,v in f.items()} + next_list = map(next2, features_list) + for i in range(len(features_list)): + if i < len(features_list) - 2: + ngram_features[i].update(next_list[i+2]) + elif i == len(features_list) - 2: + 
ngram_features[i][("next2", "**")] = 1 + else: + ngram_features[i][("next2", "*")] = 1 + + merged = lambda d1, d2: dict(d1.items() + d2.items()) + features_list = [merged(features_list[i], ngram_features[i]) + for i in range(len(features_list))] + + ''' + for f in features_list: + print sorted(f.items()) + print + print '\n\n\n' + ''' + + return features_list + + +def IOB_nonprose_features(sentence): + """ + IOB_nonprose_features + + @param sentence. A list of strings + @return A list of dictionaries of features + + """ + + # Get a feature set for each word in the sentence + features_list = [] + for i,word in enumerate(sentence): + word_feats = feat_word.IOB_nonprose_features(sentence[i]) + features_list.append( word_feats ) + + + # Feature: Bag of Words unigram conext (window=3) + if 'unigram_context' in enabled_IOB_nonprose_sentence_features: + window = 3 + n = len(sentence) + + # Previous unigrams + for i in range(n): + end = min(i, window) + unigrams = sentence[i-end:i] + for j,u in enumerate(unigrams): + features_list[i][('prev_unigrams-%d'%j,u)] = 1 + + # Next unigrams + for i in range(n): + end = min(i + window, n-1) + unigrams = sentence[i+1:end+1] + for u in unigrams: + features_list[i][('next_unigrams-%d'%j,u)] = 1 + + + # Feature: UMLS Word Features (only use nonprose ones) + if enabled['UMLS'] and 'UMLS' in enabled_IOB_nonprose_sentence_features: + umls_features = feat_umls.IOB_nonprose_features(sentence) + for i in range(len(sentence)): + features_list[i].update( umls_features[i] ) + + + #return features_list + + if 'pos' in enabled_IOB_nonprose_sentence_features: + pos_tagged = nltk_tagger.tag(sentence) + + # Allow for particular features to be enabled + for feature in enabled_IOB_nonprose_sentence_features: + + # Feature: Part of Speech + if feature == 'pos': + for (i,(_,pos)) in enumerate(pos_tagged): + features_list[i][ ('pos',pos) ] = 1 + + + # Feature: POS context + if 'pos_context' in enabled_IOB_nonprose_sentence_features: + window = 3 + n = 
len(sentence) + + # Previous POS + for i in range(n): + end = min(i, window) + for j,p in enumerate(pos_tagged[i-end:i]): + pos = p[1] + features_list[i][('prev_pos_context-%d'%j,pos)] = 1 + + # Next POS + for i in range(n): + end = min(i + window, n-1) + for j,p in enumerate(pos_tagged[i+1:i+end+1]): + pos = p[1] + features_list[i][('prev_pos_context-%d'%j,pos)] = 1 + + + + ngram_features = [{} for _ in range(len(features_list))] + if "prev" in enabled_IOB_nonprose_sentence_features: + prev = lambda f: {("prev_"+k[0], k[1]): v for k,v in f.items()} + prev_list = map(prev, features_list) + for i in range(len(features_list)): + if i == 0: + ngram_features[i][("prev", "*")] = 1 + else: + ngram_features[i].update(prev_list[i-1]) + + if "next" in enabled_IOB_nonprose_sentence_features: + next = lambda f: {("next_"+k[0], k[1]): v for k,v in f.items()} + next_list = map(next, features_list) + for i in range(len(features_list)): + if i == len(features_list) - 1: + ngram_features[i][("next", "*")] = 1 + else: + ngram_features[i].update(next_list[i+1]) + + + merged = lambda d1, d2: dict(d1.items() + d2.items()) + features_list = [merged(features_list[i], ngram_features[i]) + for i in range(len(features_list))] + + + return features_list + + + + +def concept_features_for_sentence(sentence, chunk_inds): + + """ + concept_features() + + @param sentence. A sentence in list of chunk format + @param chunk_inds. 
A list of indices for non-None-labeled chunks + @return A list of feature dictionaries + """ + + # Get a feature set for each word in the sentence + features_list = [] + for ind in chunk_inds: + features_list.append( feat_word.concept_features_for_chunk(sentence,ind) ) + + # Allow for particular features to be enabled + for feature in enabled_concept_features: + + # Features: UMLS features + if (feature == "UMLS") and enabled['UMLS']: + umls_features = feat_umls.concept_features_for_chunks(sentence, chunk_inds) + for i in range(len(chunk_inds)): + features_list[i].update( umls_features[i] ) + + return features_list diff --git a/clicon/machine_learning/__init__.py b/code/feature_extraction/umls_dir/__init__.py similarity index 100% rename from clicon/machine_learning/__init__.py rename to code/feature_extraction/umls_dir/__init__.py diff --git a/code/feature_extraction/umls_dir/create_sqliteDB.py b/code/feature_extraction/umls_dir/create_sqliteDB.py new file mode 100644 index 0000000..c7f60df --- /dev/null +++ b/code/feature_extraction/umls_dir/create_sqliteDB.py @@ -0,0 +1,205 @@ +#database.py creates a .db file for performing umls searches. 
+import sqlite3 +import os +import sys +import os +import atexit + +features_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if features_dir not in sys.path: + sys.path.append(features_dir) + + +# find where umls tables are located +from read_config import enabled_modules +enabled = enabled_modules() +umls_tables = enabled['UMLS'] + +# set to True when create_db() is succesful +success = False +db_path = None +conn = None + +MRSTY_TABLE_FILE = None +MRCON_TABLE_FILE = None +MRREL_TABLE_FILE = None +LRABR_TABLE_FILE = None + +# this ensure files are closed properly and umls.db is removed if not succesful +@atexit.register +def umls_db_cleanup(): + + global success + global conn + global db_path + + global MRSTY_TABLE_FILE + global MRCON_TABLE_FILE + global MRREL_TABLE_FILE + global LRABR_TABLE_FILE + + if conn is not None: + conn.close() + + if MRSTY_TABLE_FILE is not None: + MRSTY_TABLE_FILE.close() + + if MRCON_TABLE_FILE is not None: + MRCON_TABLE_FILE.close() + + if MRREL_TABLE_FILE is not None: + MRREL_TABLE_FILE.close() + + if LRABR_TABLE_FILE is not None: + LRABR_TABLE_FILE.close() + + if success is False: + + # remove umls.db, it is junk now + if db_path is not None: + os.remove(db_path) + + print >>sys.stderr, '\n\tError: umls.db was not created succesfully.\n' + +def create_db(): + + global success + global conn + global db_path + + global MRSTY_TABLE_FILE + global MRCON_TABLE_FILE + global MRREL_TABLE_FILE + global LRABR_TABLE_FILE + + print "\ncreating umls.db" + #connect to the .db file we are creating. + db_path = os.path.join(umls_tables, 'umls.db') + conn = sqlite3.connect( db_path ) + conn.text_factory = str + + print "opening files" + #load data in files. 
+ try: + mrsty_path = os.path.join(umls_tables, 'MRSTY.RRF') + MRSTY_TABLE_FILE = open( mrsty_path, "r" ) + except IOError: + print "\nNo file to use for creating MRSTY.RRF table\n" + sys.exit() + + try: + mrcon_path = os.path.join(umls_tables, 'MRCONSO.RRF') + MRCON_TABLE_FILE = open( mrcon_path , "r" ) + except IOError: + print "\nNo file to use for creating MRCONSO.RRF table\n" + sys.exit() + + try: + mrrel_path = os.path.join(umls_tables, 'MRREL.RRF') + MRREL_TABLE_FILE = open( mrrel_path , "r" ) + except IOError: + print "\nNo file to use for creating MRREL.RRF table\n" + sys.exit() + + try: + lrabr_path = os.path.join(umls_tables, 'LRABR') + LRABR_TABLE_FILE = open( lrabr_path , "r" ) + except IOError: + print "\nNo file to use for creating LRABR table\n" + sys.exit() + + print "creating tables" + c = conn.cursor() + + #create tables. + c.execute( "CREATE TABLE MRSTY( CUI, TUI, STN, STY, ATUI, CVF ) ;" ) + c.execute( "CREATE TABLE MRCON( CUI, LAT, TS, LUI, STT, SUI, ISPREF, AUI, SAUI, SCUI, SDUI, SAB, TTY, CODE, STR, SRL, SUPPRESS, CVF ) ;" ) + c.execute( "CREATE TABLE MRREL( CUI1, AUI1, STYPE1, REL, CUI2, AUI2, STYPE2, RELA, RUI, SRUI, SAB, SL, RG, DIR, SUPPRESS, CVF );") + c.execute( "CREATE TABLE LRABR( EUI1, ABR, TYPE, EUI2, STR);") + + print "inserting data into MRSTY table" + for line in MRSTY_TABLE_FILE: + + line = line.strip('\n') + + assert line[-1] == '|', "str: {}, char: ".format(line, line[-1]) + + line = line.split('|') + + # end will always be empty str + line.pop() + + assert len(line) == 6 + + c.execute( "INSERT INTO MRSTY( CUI, TUI, STN, STY, ATUI, CVF ) values( ?, ?, ?, ?, ?, ?)" , tuple(line)) + + print "inserting data into MRCON table" + for line in MRCON_TABLE_FILE: + + line = line.strip('\n') + + assert line[-1] == '|', "str: {}, char: ".format(line, line[-1]) + + line = line.split('|') + + # end will always be empty str + line.pop() + + assert len(line) == 18 + + c.execute( "INSERT INTO MRCON( CUI, LAT, TS, LUI, STT, SUI, ISPREF, AUI, 
SAUI, SCUI, SDUI, SAB, TTY, CODE, STR, SRL, SUPPRESS, CVF ) values ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);", tuple(line)) + + + print "inserting data into MRREL table" + for line in MRREL_TABLE_FILE: + + line = line.strip('\n') + + assert line[-1] == '|', "str: {}, char: ".format(line, line[-1]) + + line = line.split('|') + + # end will always be empty str + line.pop() + + assert len(line) == 16 + + c.execute( "INSERT INTO MRREL( CUI1, AUI1, STYPE1, REL, CUI2, AUI2, STYPE2, RELA, RUI, SRUI, SAB, SL, RG, DIR, SUPPRESS, CVF ) values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )" , tuple(line)) + + + print "inserting into LRABR table" + for line in LRABR_TABLE_FILE: + + line = line.strip('\n') + + assert line[-1] == '|', "str: {}, char: ".format(line, line[-1]) + + line = line.split('|') + + line.pop() + + assert len(line) == 5 + + c.execute( "INSERT INTO LRABR( EUI1, ABR, TYPE, EUI2, STR) values( ?, ?, ?, ?,?)" , tuple(line) ) + + print "creating indices" + + #create indices for faster queries + c.execute( "CREATE INDEX mrsty_cui_map ON MRSTY(CUI)") + c.execute( "CREATE INDEX mrcon_str_map ON MRCON(STR)") + c.execute( "CREATE INDEX mrcon_cui_map ON MRCON(CUI)") + c.execute( "CREATE INDEX mrrel_cui2_map ON MRREL( CUI2 )" ) + c.execute( "CREATE INDEX mrrel_cui1_map on MRREL( CUI1 ) " ) + c.execute( "CREATE INDEX mrrel_rel_map on MRREL( REL )" ) + c.execute( "CREATE INDEX lrabr_abr_map on LRABR(ABR)") + c.execute( "CREATE INDEX lrabr_str_map on LRABR(STR)") + + #save changes to .db + conn.commit() + + success = True + print "\nsqlite database created" + +if __name__ == "__main__": + create_db() + diff --git a/code/feature_extraction/umls_dir/create_trie.py b/code/feature_extraction/umls_dir/create_trie.py new file mode 100644 index 0000000..1999649 --- /dev/null +++ b/code/feature_extraction/umls_dir/create_trie.py @@ -0,0 +1,117 @@ + +#database.py creates a .db file for performing umls searches. 
+import marisa_trie +import sys +import os +import atexit + +features_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if features_dir not in sys.path: + sys.path.append(features_dir) + +# find where umls tables are located +from read_config import enabled_modules +enabled = enabled_modules() +umls_tables = enabled['UMLS'] + +trie_path = None +success = False +MRCON_TABLE = None + +@atexit.register +def trie_cleanup(): + + global trie_path + global MRCON_TABLE + global success + + if success is False: + + print >>sys.stderr, '\n\tError: trie was not created succesfully.\n' + + if trie_path is not None: + + try: + os.remove(trie_path) + except: + pass + + if MRCON_TABLE is not None: + MRCON_TABLE.close() + + +def create_trie(): + + global trie_path + global MRCON_TABLE + global success + + """ + create_trie() + + Purpose: Build a trie of concepts from MRREL + + @return A trie object + """ + # Is trie already built & pickled? + trie_path = os.path.join(umls_tables, 'umls-concept.trie') + try: + t = marisa_trie.Trie().load(trie_path) + success = True + return t + except IOError: + pass + + + print "\ncreating concept-trie" + + #load data in files. 
+ print "opening file" + try: + mrcon_path = os.path.join(umls_tables, 'MRCONSO.RRF') + MRCON_TABLE = open( mrcon_path , "r" ) + except IOError: + print "\nNo file to use for creating MRCON table\n" + sys.exit() + + print "inserting data into concept-trie" + + #insert data onto database + print "inserting data" + concepts = [] + for line in MRCON_TABLE: + + line = line.split('|') + line.pop() + + assert len(line) == 18 + + if len(line) < 6: continue + + concept = line[14] + + # Ignore non-ascii + try: + concept.decode('ascii') + except: + continue + + #print type(concept) + concepts.append(concept) + + print "creating trie" + t = marisa_trie.Trie(concepts) + + print "concept-trie created" + + # Pickle trie + + t.save(trie_path) + + success = True + + return t + + +if __name__ == '__main__': + t = create_trie() diff --git a/code/feature_extraction/umls_dir/interface_umls.py b/code/feature_extraction/umls_dir/interface_umls.py new file mode 100644 index 0000000..f773a22 --- /dev/null +++ b/code/feature_extraction/umls_dir/interface_umls.py @@ -0,0 +1,135 @@ +# +# Interface to UMLS Databases and concept trie +# +# +# + + +import copy +import sqlite3 +import create_sqliteDB +import os + +import create_trie +import difflib +import string +import sys + +features_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if features_dir not in sys.path: + sys.path.append(features_dir) + +# find where umls tables are located +from read_config import enabled_modules +enabled = enabled_modules() +umls_tables = enabled['UMLS'] + + + + +############################################ +### Setups / Handshakes ### +############################################ + + +#connect to UMLS database +def SQLConnect(): + #try to connect to the sqlite database. + # if database does not exit. Make one. 
+ db_path = os.path.join(umls_tables, "umls.db") + if not os.path.isfile(db_path): + print "\n\tdb doesn't exist (creating one now)\n" + create_sqliteDB.create_db() + + db = sqlite3.connect( db_path ) + return db.cursor() + + + + +############################################ +### Global reource connections ### +############################################ + + +# Global database connection +c = SQLConnect() + +# Global trie +trie = create_trie.create_trie() + + + + +############################################ +### Query Operations ### +############################################ + + +def string_lookup( string ): + """ Get sty for a given string """ + try: + c.execute( "SELECT sty FROM MRCON a, MRSTY b WHERE a.cui = b.cui AND str = ?; " , (string,) ) + return c.fetchall() + except sqlite3.ProgrammingError, e: + return [] + + +def cui_lookup( string ): + """ get cui for a given string """ + try: + # Get cuis + c.execute( "SELECT cui FROM MRCON WHERE str = ?;" , (string,) ) + return c.fetchall() + except sqlite3.ProgrammingError, e: + return [] + + +def abr_lookup( string ): + """ searches for an abbreviation and returns possible expansions for that abbreviation""" + try: + c.execute( "SELECT str FROM LRABR WHERE abr = ?;", (string,)) + return c.fetchall() + except sqlite3.ProgrammingError, e: + return [] + + +def concept_exists(string): + """ Fast query for set membership in trie """ + return unicode(string) in trie + + + +def tui_lookup( string ): + """ takes in a concept id string (ex: C00342143) and returns the TUI of that string which represents the semantic type is belongs to """ + try: + c.execute( "SELECT tui FROM MRSTY WHERE cui = ?;", (string,)) + return c.fetchall() + except sqlite3.ProgrammingError, e: + return [] + + +def substrs_that_exists( lOfStrs , pwl): + """ sees if a sub string exists within trie""" + lOfNormStrs = [string.strip() for string in lOfStrs] + lOfNormStrs = [strip_punct(string) for string in lOfNormStrs] + lOfNormStrs = [( string, 
string.lower() ) for string in lOfNormStrs] + retVal = False + numThatExist = 0 + # strings are case sensitive. + for normStr1, normStr2 in lOfNormStrs: + strs = difflib.get_close_matches(normStr1,trie.keys(unicode(normStr1)),cutoff=.8) + if len(strs) == 0: + if normStr2 != normStr1: + strs = difflib.get_close_matches(normStr2, trie.keys(unicode(normStr2)), cutoff=.8) + if len(strs) > 0: + numThatExist += 1 + + return numThatExist + + +def strip_punct(stringArg): + for c in string.punctuation: + stringArg = string.replace(stringArg, c, "") + return stringArg + diff --git a/code/feature_extraction/umls_dir/interpret_umls.py b/code/feature_extraction/umls_dir/interpret_umls.py new file mode 100644 index 0000000..76db45a --- /dev/null +++ b/code/feature_extraction/umls_dir/interpret_umls.py @@ -0,0 +1,378 @@ +import os +import sys +import cPickle as pickle +import interface_umls + +import time +import nltk + + +metamap = None + +def umls_semantic_type_word( umls_string_cache , sentence ): + # Already cached? + if False and umls_string_cache.has_key( sentence ): + mapping = umls_string_cache.get_map( sentence ) + else: + concepts = interface_umls.string_lookup( sentence ) + concepts = [ singleton[0] for singleton in set(concepts) ] + umls_string_cache.add_map(sentence , concepts) + mapping = umls_string_cache.get_map(sentence) + + return mapping + + +def umls_semantic_context_of_words( umls_string_cache, sentence ): + + #Defines the largest string span for the sentence. + WINDOW_SIZE = 7 + + # span of the umls concept of the largest substring + umls_context_list = [] + + # keys: tuple of (start,end) index of a substring + concept_span_dict = {} + + # Each sublist functions as the mappings for each word. + for i in sentence: + umls_context_list.append( [] ) + + # finds the span for each substring of length 1 to currentWindowSize. 
+ for currentWindowSize in range( 1 , WINDOW_SIZE ): + for ti in range( 0 , ( len(sentence) - currentWindowSize ) + 1 ): + rawstring = "" + for tj in range( ti , ti + currentWindowSize): + rawstring += ( sentence[tj] + " " ) + + #Each string is of length 1 to currentWindowSize. + rawstring = rawstring.strip() + + # Not in cache yet? + if not( umls_string_cache.has_key( rawstring ) ): + # returns a tuple if there is a result or None is there is not. + concept = interface_umls.string_lookup( rawstring ) + + if not concept: + umls_string_cache.add_map( rawstring, None ) + else: + umls_string_cache.add_map( rawstring, concept ) ; + + #Store the concept into concept_span_dict with its span as a key. + concept_span_dict[(ti,ti+currentWindowSize-1)] = umls_string_cache.get_map( rawstring ) + + # For each substring if there is a span, then + # assign the concept to every word that is within in the substring + if umls_string_cache.get_map(rawstring): + for i in range( ti , ti + currentWindowSize ): + if len( umls_context_list[i] ) == 0: + umls_context_list[i].append([ti,ti+currentWindowSize-1]) + + else: + updated = 0 + for j in umls_context_list[i]: + if j[0] >= ti and j[1] <= (ti+currentWindowSize-1): + j[0] = ti + j[1] = ( ti + currentWindowSize - 1 ) + updated += 1 + if not(updated): + val = [ti,ti+currentWindowSize-1] + if umls_context_list[i].count(val)== 0: + umls_context_list[i].append(val) + + + #create a list of sublists + # each sublist represents the contexts for which the word appears + mappings = [] + for i in umls_context_list: + spans = i + if len(spans) == 0: + mappings.append( None ) + else: + sub_mappings = [] + for j in spans: + sub_mappings.append( concept_span_dict[tuple(j)]) + + # FIXME - Decided to concat rather than append (not sure why) + mappings += sub_mappings + + return mappings + + +def umls_semantic_type_sentence( cache , sentence ): + + #Defines the largest string span for the sentence. 
+ WINDOW_SIZE = 7 + + longestSpanLength = 0 + longestSpans = [] # List of (start,end) tokens + + for i in range(len(sentence)): + maxVal = min(i+WINDOW_SIZE, len(sentence)) + for j in range(i,maxVal): + # Lookup key + span = sentence[i:j+1] + rawstring = unicode(' '.join(span)) + + # string does have an associated UMLS concept? + if interface_umls.concept_exists(rawstring): + if len(span) == longestSpanLength: + longestSpans.append( (i,j) ) + # new longest span size + elif len(span) > longestSpanLength: + longestSpans = [ (i,j) ] + longestSpanLength = len(span) + + # lookup UMLS concept for a given (start,end) span + def span2concept(span): + rawstring = ' '.join(sentence[span[0]:span[1]+1]) + + # Already cached? + if cache.has_key( rawstring ): + return cache.get_map( rawstring ) + + else: + concept = interface_umls.string_lookup( rawstring ) + + if concept: + cache.add_map( rawstring , concept ) + else: + cache.add_map( rawstring , [] ) + + return cache.get_map( rawstring ) + + mappings = [ span2concept(span) for span in longestSpans ] + return mappings + +def abr_lookup( cache, word): + """ get expansions of an abbreviation """ + if cache.has_key( word + "--abrs"): + abbreviations = cache.get_map( word + "--abrs") + else: + abbreviations = interface_umls.abr_lookup(word) + + if abbreviations != []: + + # the lookup returns a list of tuples so now it will be converted to a list of strings + abbreviations = [tuple[0] for tuple in abbreviations] + + cache.add_map( word + "--abrs", abbreviations) + return abbreviations + +def get_cuis_for_abr(cache, word): + """ gets cui for each possible expansion of abbreviation """ + if cache.has_key( word + "--cuis_of_abr"): + cuis_of_abr = cache.get_map( word + "--cuis_of_abr" ) + else: + cuis_of_abr = {} + for phrase in abr_lookup(cache, word): + # prevents circular loop + cuis_of_abr[phrase] = get_cui(cache, phrase) + + cache.add_map( word + "cuis_of_abr", cuis_of_abr ) + + return cuis_of_abr + +def get_tui( cache, cuiStr ): 
+ """ get tui of a cui """ + if cache.has_key( cuiStr + "--tui"): + tui = cache.get_map( cuiStr + "--tui") + else: + # list of singleton tuples + tui = interface_umls.tui_lookup(cuiStr) + + # change to list of strings + tui = [semanticType[0] for semanticType in tui] + + cache.add_map(cuiStr + "--tui", tui) + + return tui + +# Get the umls concept id for a given word +def get_cui( cache , word ): +# if word != "blood": +# return [] + # If already in cache + if cache.has_key( word + '--cuis' ): + + cuis = cache.get_map( word + '--cuis' ) + + else: + + # Get cui + cuis = interface_umls.cui_lookup(word) + cuis = [c[0] for c in cuis] + # Eliminate duplicates + cuis = list(set(cuis)) + + # Store result in cache + cache.add_map( word + '--cuis', cuis ) + + return cuis + +def get_list_all_possible_cuis_for_abrv(cache, phrase): + """ + get cuis for every possible possible abbreviation expansion. + + To define your own filter go to: + + page 3: + + http://semanticnetwork.nlm.nih.gov/SemGroups/Papers/2003-medinfo-atm.pdf + + look up categories and semantic types and get the tui from: + + http://metamap.nlm.nih.gov/Docs/SemanticTypes_2013AA.txt + + """ + + phrases = get_cuis_for_abr(cache, phrase) + + results = set() + + # change fromdictionary to a set of strings. + for phrase in phrases: + for cui in phrases[phrase]: + results.add(cui) + + return list(results) + + +def get_most_freq_cui(cui_list, cui_freq): + """ + from a list of strings get the cui string that appears the most frequently. + + Note: if there is no frequency stored then this will crash. + """ + + cui_highest_freq = None + + for cui in cui_list: + + if cui in cui_freq: + + # sets an initial cui + if cui_highest_freq is None: + cui_highest_freq = cui + + # assign new highest + elif cui_freq[cui] > cui_freq[cui_highest_freq]: + cui_highest_freq = cui + + # at this point we have not found any concept ids with a frequency greater than 0. 
+ # good chance it is CUI-less + if cui_highest_freq is None: + cui_highest_freq = "CUI-less" + + return cui_highest_freq + +def filter_cuis_by_tui(cache, cuis, filter=["T020", # acquired abnormality + "T190", # Anatomical Abnormality + "T049", # Cell or Molecular Dysfunction + "T019", # Congenital Abnormality + "T047", # Disease or Syndrome + "T050", # Experimental Model of Disease + "T033", # Finding + "T037", # Injury or Poisoning + "T048", # Mental or Behavioral Dysfunction + "T191", # Neoplastic Process + "T046", # Pathologic Function + "T184"]): + """ removes cuis that do not have tui that is in the filter """ + results = set() + + for cui in cuis: + + for tui in get_tui(cache, cui): + + if tui in filter: + results.add(cui) + break + + return list(results) + +def normalize_phrase(phrase, PyPwl=None): + + norm = "" + for char in phrase: + if char.isalnum() is True or char.isspace() is True: + norm += char + + phrase = norm + + phrase = nltk.PorterStemmer().stem(phrase) + + if PyPwl is not None: + + init_time = time.time() + + phrase = spellCheck(phrase, PyPwl=PyPwl) + + print time.time() - init_time + + return phrase + + +def is_valid_phrase(phrase): + + valid_chars = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') + phrase = set(phrase) + + ret_val = len(valid_chars.intersection(phrase)) > 0 + + return ret_val + + +def obtain_concept_ids(cache, phrase, PyPwl=None, cui_freq={}): + + global metamap + + # phrases that do not contain alphanumerica characters will cause metamap to crash. + if is_valid_phrase(phrase) is False: + return ['CUI-less'] + + #phrases = [normalize_phrase(phrase, PyPwl=PyPwl) for phrase in phrases] + + # assumes dependencies are installed properly if this function is called. + if metamap is None: + from cuiLookup import MetaMap + metamap = MetaMap() + + # lvgnorm is used within the metamap java code for efficiency reasons. 
+ conceptIds = metamap.getConceptIds(phrase) + + retVal = [] + + assert(len(conceptIds) == 1) + + for conceptId in conceptIds: + + cuis = set() + + for key in conceptId["mappings"]: + cuis = cuis.union(conceptId["mappings"][key]) + + for normPhrase in conceptId["norms"]: + cuis = cuis.union(get_list_all_possible_cuis_for_abrv(cache, normPhrase) + get_cui(cache, normPhrase)) + + if (len(cuis) == 1 and 'CUI-less' in cuis): + for corrected_phrase in normalize_phrase(phrase, PyPwl=PyPwl): + cuis = cuis.union(get_list_all_possible_cuis_for_abrv(cache, corrected_phrase) + get_cui(cache, corrected_phrase)) + + cuis = filter_cuis_by_tui(cache, cuis) + + retVal = get_most_freq_cui(cuis, cui_freq) + + return retVal + + +if __name__ == "__main__": + strings = ["MCA Aneurysm", "middle cerebral arterial aneurysm"] + + from umls_cache import UmlsCache + cache = UmlsCache() + + for phrase in strings: + print "PHRASE: ", phrase + print "CUI: ", obtain_concept_ids(cache, phrase) + print "nothing to do" + diff --git a/code/feature_extraction/umls_dir/umls_cache.py b/code/feature_extraction/umls_dir/umls_cache.py new file mode 100644 index 0000000..75ae2af --- /dev/null +++ b/code/feature_extraction/umls_dir/umls_cache.py @@ -0,0 +1,52 @@ +import pickle +import sys +import os + +import atexit + + +features_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if features_dir not in sys.path: + sys.path.append(features_dir) + +# find where umls tables are located +from read_config import enabled_modules +enabled = enabled_modules() +umls_tables = enabled['UMLS'] + + +from utilities import load_pickled_obj + +class UmlsCache: + + # static class variables + filename = None + cache = None + + def __init__(self): + + try: + + UmlsCache.filename = os.path.join(umls_tables, 'umls_cache') + UmlsCache.cache = load_pickled_obj(UmlsCache.filename) + + except IOError: + UmlsCache.cache = {} + + def has_key(self , string): + return UmlsCache.cache.has_key( string ) + + def 
######################################################################
# CliCon - umls_features.py                                          #
#                                                                    #
# Willie Boag                                       wboag@cs.uml.edu #
#                                                                    #
# Purpose: Independent UMLS module                                   #
######################################################################



from umls_cache import UmlsCache
import interpret_umls

# module-level cache shared by every UMLS lookup made from this module
umls_lookup_cache = UmlsCache()


def extract_umls_features(sentence):
    """Map every word of `sentence` to its dictionary of UMLS features."""
    return [features_for_word(token) for token in sentence]
######################################################################
# CliCon - utilities.py                                              #
#                                                                    #
# Willie Boag                                       wboag@cs.uml.edu #
#                                                                    #
# Purpose: Miscellaneous tools for handling data.                    #
######################################################################


import re
import pickle
import os
import sys


# used as a default path for stashing the pos tagger.
dname = os.path.dirname
CLINER_DIR = dname(dname(dname(os.path.abspath(__file__))))
tagger_name = 'py%d_maxent_treebank_pos_tagger.pickle' % sys.version_info.major
pos_tagger_path = os.path.join(CLINER_DIR, 'tools', tagger_name)


def load_pickled_obj(path_to_pickled_obj):
    """Load and return the pickled object stored at the given path."""
    with open(path_to_pickled_obj, "rb") as f:
        return pickle.loads(f.read())


def pickle_dump(obj, path_to_obj):
    """
    Pickle `obj` to `path_to_obj` with the highest protocol.

    NOTE: per the original author, the highest protocol makes loading TRAINED
    models very slow — use this for anything BUT that (mainly the pos tagger).
    """
    # `with` guarantees the handle is closed even if dump raises
    with open(path_to_obj, "wb") as f:
        pickle.dump(obj, f, -1)


def dump_pos_tagger(path_to_obj):
    """Serialize NLTK's default POS tagger to `path_to_obj`."""
    # BUG FIX: nltk was never imported at module level, so this raised
    # NameError; import locally since only this helper needs it.
    import nltk
    # NOTE(review): nltk.tag._POS_TAGGER was removed in newer nltk releases —
    # confirm the pinned nltk version before relying on this helper.
    tagger = nltk.data.load(nltk.tag._POS_TAGGER)
    pickle_dump(tagger, path_to_obj)


def load_pos_tagger(path_to_obj=None):
    """
    Load a previously pickled POS tagger.

    @param path_to_obj. Path to the pickle; defaults to the stashed tagger
                        path (resolved at call time, not import time).
    @return The unpickled tagger.
    """
    if path_to_obj is None:
        path_to_obj = pos_tagger_path
    return load_pickled_obj(path_to_obj)


def is_prose_sentence(sentence):
    """
    is_prose_sentence()

    Purpose: Determine if a sentence of text is 'prose'

    @param sentence A list of words
    @return A boolean

    >>> is_prose_sentence(['Admission', 'Date', ':'])
    False
    >>> is_prose_sentence(['Hello', 'World', '.'])
    True
    >>> is_prose_sentence(['What', 'do', 'you', 'think', '?'])
    True
    >>> is_prose_sentence(['Short', 'sentence'])
    False
    """
    # Empty sentence is not prose
    if not sentence:
        return False

    last = sentence[-1]
    if last == '.' or last == '?':
        return True
    if last == ':':
        return False
    if len(sentence) <= 5:
        return False
    # NOTE(review): despite its name, is_at_least_half_nonprose() counts
    # *prose* words — verify the intended semantics with the author.
    return is_at_least_half_nonprose(sentence)


def is_at_least_half_nonprose(sentence):
    """
    is_at_least_half_nonprose(sentence)

    Purpose: Checks if at least half of the sentence is considered to be 'nonprose'

    @param sentence. A list of words
    @return A boolean

    >>> is_at_least_half_nonprose(['1','2','and','some','words'])
    True
    >>> is_at_least_half_nonprose(['1', '2', '3', '4', 'and', 'some', 'words', '5'])
    False
    >>> is_at_least_half_nonprose(['word'])
    True
    >>> is_at_least_half_nonprose([' '])
    True
    """
    count = len([w for w in sentence if is_prose_word(w)])

    # BUG FIX: the original used `/` — under Python 3 that is float division,
    # which breaks the documented behavior for odd-length input (e.g. [' ']).
    # Floor division reproduces the Python-2 semantics the doctests assume.
    return count >= len(sentence) // 2
######################################################################
# CliNER - word_features.py                                          #
#                                                                    #
# Willie Boag                                       wboag@cs.uml.edu #
#                                                                    #
# Purpose: Isolate all word-level features into a single file        #
######################################################################


__author__ = 'Willie Boag'
__date__ = 'Apr 27, 2014'

import re
import os
import sys

from .wordshape import getWordShapes
from nltk import LancasterStemmer, PorterStemmer

# shared stemmer instances (constructing them repeatedly would be wasteful)
lancaster_st = LancasterStemmer()
porter_st = PorterStemmer()


def feature_word(word):
    """Identity feature: the lowercased token itself."""
    return {('word', word.lower()): 1}


def feature_stem_lancaster(word):
    """Lancaster stem of the lowercased token."""
    return {('stem_lancaster', lancaster_st.stem(word.lower())): 1}


def feature_generic(word):
    """Token with every digit collapsed to '0'."""
    generic_form = re.sub('[0-9]', '0', word)
    return {('Generic#', generic_form): 1}


def feature_last_two_letters(word):
    """Two-character suffix of the token."""
    return {('last_two_letters', word[-2:]): 1}


def feature_length(word):
    """Token length — a real-valued feature, not a binary indicator."""
    return {('length', ''): len(word)}


def feature_stem_porter(word):
    """Porter stem of the token; the stemmer can raise, so fail soft."""
    try:
        return {('stem_porter', porter_st.stem(word)): 1}
    except Exception:
        return {}


def feature_mitre(word):
    """One binary feature per MITRE regex that the token matches."""
    matched = {}
    for name in mitre_features:
        if re.search(mitre_features[name], word):
            matched[('mitre', name)] = 1
    return matched
def feature_metric_unit(word):
    """Classify the token as a weight/size/volume metric unit ('' if none)."""
    unit = ''
    if is_weight(word):
        unit = 'weight'
    elif is_size(word):
        unit = 'size'
    elif is_volume(word):
        unit = 'volume'
    return {('metric_unit', unit): 1}


def feature_prefix(word):
    """First four characters of the token, lowercased."""
    return {("prefix", word[:4].lower()): 1}


def QANN_features(word):
    """
    QANN_features()

    Purpose: Creates a dictionary of QANN features for the given word.

    @param word. A string
    @return A dictionary of features

    >>> QANN_features('test') is not None
    True
    """
    features = {}

    # Feature: test result
    if is_test_result(word):        features[('test_result', '')] = 1

    # Feature: measurements
    if is_measurement(word):        features[('measurement', '')] = 1

    # Feature: directive
    if is_directive(word):          features[('directive', '')] = 1

    # Feature: date
    if is_date(word):               features[('date', '')] = 1

    # Feature: volume
    if is_volume(word):             features[('volume', '')] = 1

    # Feature: weight
    if is_weight(word):             features[('weight', '')] = 1

    # Feature: size
    if is_size(word):               features[('size', '')] = 1

    # Feature: prognosis location
    # BUG FIX: the original tested the function object itself (always truthy)
    # instead of calling it, so every word got this feature.
    if is_prognosis_location(word): features[('prog_location', '')] = 1

    # Feature: problem form
    if has_problem_form(word):      features[('problem_form', '')] = 1

    # (the original re-checked is_weight here a second time; that duplicate
    #  only re-set an existing key, so it has been removed)

    return features


def feature_prev_word_stem(sentence, ind):
    """Porter stem of the last word of the preceding chunk ('' at sentence start)."""
    if ind == 0:
        return {('prev_word_stem', ''): 1}
    prev_word = sentence[ind - 1].split()[-1]
    return {('prev_word_stem', porter_st.stem(prev_word)): 1}


def feature_next_word_stem(sentence, ind):
    """Porter stem of the first word of the following chunk ('' at sentence end)."""
    if ind == len(sentence) - 1:
        return {('next_word_stem', ''): 1}
    next_word = sentence[ind + 1].split()[0]
    return {('next_word_stem', porter_st.stem(next_word)): 1}
def IOB_prose_features(word):
    """
    IOB_prose_features()

    Purpose: Creates a dictionary of prose features for the given word.

    @param word. A string
    @return A dictionary of features

    >>> IOB_prose_features('test') is not None
    True
    """
    # always have >0 dimensions
    feats = {('dummy', ''): 1}

    # run every enabled extractor and merge the results
    for extractor in enabled_IOB_prose_word_features:
        feats.update(extractor(word))

    return feats


enabled_IOB_nonprose_word_features = frozenset( [feature_word, feature_word_shape, feature_mitre, QANN_features] )


def IOB_nonprose_features(word):
    """
    IOB_nonprose_features()

    Purpose: Creates a dictionary of nonprose features for the given word.

    @param word. A string
    @return A dictionary of features

    >>> IOB_nonprose_features('test') is not None
    True
    """
    # always have >0 dimensions
    feats = {('dummy', ''): 1}

    for extractor in enabled_IOB_nonprose_word_features:
        feats.update(extractor(word))

    return feats


enabled_word_concept_features = frozenset( [feature_word, feature_prefix, feature_stem_porter, feature_stem_lancaster, feature_word_shape, feature_metric_unit, feature_mitre] )


# Note: most of this function is currently commented out so the doctests
# should be fixed if this is ever changed
def concept_features_for_word(word):
    """
    concept_features_for_word()

    Purpose: Creates a dictionary of concept features for the given word.

    @param word. A word to generate features for
    @return A dictionary of features

    >>> concept_features_for_word('test') is not None
    True
    """
    feats = {}

    # extract all selected features
    for extractor in enabled_word_concept_features:
        feats.update(extractor(word))

    return feats


enabled_chunk_concept_features = frozenset( [feature_prev_word_stem, feature_next_word_stem] )


def concept_features_for_chunk(sentence, ind):
    """
    concept_features_for_chunk()

    @param sentence A sentence that has been chunked into vectors
           ind      The index of the concept in question within the sentence vector
    @return A dictionary of features
    """
    feats = {'dummy': 1}

    # word-level features for every token of the chunk
    for token in sentence[ind].split():
        feats.update(concept_features_for_word(token))

    # context windows around the chunk
    for extractor in enabled_chunk_concept_features:
        feats.update(extractor(sentence, ind))

    return feats


# NOTE(review): REALNUM's '.' is unescaped and HASDASHNUMALPHA's '[A-z]'
# also spans punctuation characters — both kept verbatim since this is a
# behavior-preserving rewrite and trained models depend on these patterns.
mitre_features = {
    "INITCAP": r"^[A-Z].*$",
    "ALLCAPS": r"^[A-Z]+$",
    "CAPSMIX": r"^[A-Za-z]+$",
    "HASDIGIT": r"^.*[0-9].*$",
    "SINGLEDIGIT": r"^[0-9]$",
    "DOUBLEDIGIT": r"^[0-9][0-9]$",
    "FOURDIGITS": r"^[0-9][0-9][0-9][0-9]$",
    "NATURALNUM": r"^[0-9]+$",
    "REALNUM": r"^[0-9]+.[0-9]+$",
    "ALPHANUM": r"^[0-9A-Za-z]+$",
    "HASDASH": r"^.*-.*$",
    "PUNCTUATION": r"^[^A-Za-z0-9]+$",
    "PHONE1": r"^[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]$",
    "PHONE2": r"^[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]$",
    "FIVEDIGIT": r"^[0-9][0-9][0-9][0-9][0-9]",
    "NOVOWELS": r"^[^AaEeIiOoUu]+$",
    "HASDASHNUMALPHA": r"^.*[A-z].*-.*[0-9].*$ | *.[0-9].*-.*[0-9].*$",
    "DATESEPERATOR": r"^[-/]$",
}
def is_measurement(word):
    """
    is_measurement()

    Purpose: Checks if the word is a measurement.

    @param word. A string.
    @return the matched object if it is a measurement, otherwise None.

    >>> is_measurement('10units') is not None
    True
    >>> is_measurement('7 units') is not None
    True
    >>> is_measurement('10cc') is not None
    True
    >>> is_measurement('300 L') is not None
    True
    >>> is_measurement('20mL') is not None
    True
    >>> is_measurement('400000 dL') is not None
    True
    >>> is_measurement('30000') is not None
    False
    >>> is_measurement('20dl') is not None
    False
    >>> is_measurement('units') is not None
    True
    """
    # FIX: the original 'unit(s)' only matched the plural; 'unit(s)?'
    # additionally accepts the singular 'unit'
    regex = r"^[0-9]*( )?(unit(s)?|cc|L|mL|dL)$"
    return re.search(regex, word)


def is_directive(word):
    """
    is_directive()

    Purpose: Checks if the word is a directive.

    @param word. A string.
    @return the matched object if it is a directive, otherwise None.

    >>> is_directive('q.abc') is not None
    True
    >>> is_directive('qAD') is not None
    True
    >>> is_directive('PRM') is not None
    True
    >>> is_directive('bid') is not None
    True
    >>> is_directive('prm') is not None
    True
    >>> is_directive('p.abc') is not None
    True
    >>> is_directive('qABCD') is not None
    False
    >>> is_directive('BID') is not None
    False
    """
    regex = r"^(q\..*|q..|PRM|bid|prm|p\..*)$"
    return re.search(regex, word)


def is_date(word):
    """
    is_date()

    Purpose: Checks if word is a date.

    @param word. A string.
    @return the matched object if it is a date, otherwise None.

    >>> is_date('2015-03-1') is not None
    True
    >>> is_date('2014-02-19') is not None
    True
    >>> is_date('03-27-1995') is not None
    True
    >>> is_date('201') is not None
    False
    >>> is_date('0') is not None
    False
    """
    regex = r'^(\d\d\d\d-\d\d-\d|\d\d?-\d\d?-\d\d\d\d?|\d\d\d\d-\d\d?-\d\d?)$'
    return re.search(regex, word)


def is_volume(word):
    """
    is_volume()

    Purpose: Checks if word is a volume.

    @param word. A string.
    @return the matched object if it is a volume, otherwise None.

    >>> is_volume('9ml') is not None
    True
    >>> is_volume('10 mL') is not None
    True
    >>> is_volume('552 dL') is not None
    True
    >>> is_volume('73') is not None
    False
    >>> is_volume('ml') is not None
    True
    """
    regex = r"^[0-9]*( )?(ml|mL|dL)$"
    return re.search(regex, word)


def is_weight(word):
    """
    is_weight()

    Purpose: Checks if word is a weight.

    @param word. A string.
    @return the matched object if it is a weight, otherwise None.

    >>> is_weight('1mg') is not None
    True
    >>> is_weight('10 g') is not None
    True
    >>> is_weight('78 mcg') is not None
    True
    >>> is_weight('10000 milligrams') is not None
    True
    >>> is_weight('14 grams') is not None
    True
    >>> is_weight('-10 g') is not None
    False
    >>> is_weight('grams') is not None
    True
    """
    regex = r"^[0-9]*( )?(mg|g|mcg|milligrams|grams)$"
    return re.search(regex, word)


def is_size(word):
    """
    is_size()

    Purpose: Checks if the word is a size.

    @param word. A string.
    @return the matched object if it is a size, otherwise None.

    >>> is_size('1mm') is not None
    True
    >>> is_size('10 cm') is not None
    True
    >>> is_size('36 millimeters') is not None
    True
    >>> is_size('423 centimeters') is not None
    True
    >>> is_size('328') is not None
    False
    >>> is_size('22 meters') is not None
    False
    >>> is_size('millimeters') is not None
    True
    """
    regex = r"^[0-9]*( )?(mm|cm|millimeters|centimeters)$"
    return re.search(regex, word)


def is_prognosis_location(word):
    """
    is_prognosis_location()

    Purpose: Checks if the word is a prognosis location

    @param word. A string.
    @return the matched object if it is a prognosis location, otherwise None.

    >>> is_prognosis_location('c9-c5') is not None
    True
    >>> is_prognosis_location('C5-C9') is not None
    True
    >>> is_prognosis_location('test') is not None
    False
    >>> is_prognosis_location('c-9-C5') is not None
    False
    """
    regex = r"^(c|C)[0-9]+(-(c|C)[0-9]+)*$"
    return re.search(regex, word)


def has_problem_form(word):
    """
    has_problem_form()

    Purpose: Checks if the word has problem form.

    @param word. A string
    @return the matched object if it has problem form, otherwise None.

    >>> has_problem_form('prognosis') is not None
    True
    >>> has_problem_form('diagnosis') is not None
    True
    >>> has_problem_form('diagnostic') is not None
    True
    >>> has_problem_form('arachnophobic') is not None
    True
    >>> has_problem_form('test') is not None
    False
    >>> has_problem_form('ice') is not None
    False
    """
    regex = r".*(ic|is)$"
    return re.search(regex, word)


def get_def_class(word):
    """
    get_def_class()

    Purpose: Checks for a definitive classification at the word level.

    @param word. A string
    @return 1 if the word is a test term,
            2 if the word is a problem term,
            3 if the word is a treatment term,
            0 otherwise.
    >>> get_def_class('eval')
    1
    >>> get_def_class('rate')
    1
    >>> get_def_class('tox')
    1
    >>> get_def_class('swelling')
    2
    >>> get_def_class('mass')
    2
    >>> get_def_class('broken')
    2
    >>> get_def_class('therapy')
    3
    >>> get_def_class('vaccine')
    3
    >>> get_def_class('treatment')
    3
    >>> get_def_class('unrelated')
    0
    """
    test_terms = {
        "eval", "evaluation", "evaluations",
        "sat", "sats", "saturation",
        "exam", "exams",
        "rate", "rates",
        "test", "tests",
        "xray", "xrays",
        "screen", "screens",
        "level", "levels",
        "tox",
    }
    # BUG FIX: the original was missing a comma after "pains", so implicit
    # string concatenation produced the bogus term "painsburns" and silently
    # dropped both real terms.
    problem_terms = {
        "swelling",
        "wound", "wounds",
        "symptom", "symptoms",
        "shifts", "failure",
        "insufficiency", "insufficiencies",
        "mass", "masses",
        "aneurysm", "aneurysms",
        "ulcer", "ulcers",
        "trama", "cancer",
        "disease", "diseased",
        "bacterial", "viral",
        "syndrome", "syndromes",
        "pain", "pains",
        "burns", "burned",
        "broken", "fractured",
    }
    # BUG FIX: same missing-comma defect produced "vaccinesdose" here.
    treatment_terms = {
        "therapy",
        "replacement",
        "anesthesia",
        "supplement", "supplemental",
        "vaccine", "vaccines",
        "dose", "doses",
        "shot", "shots",
        "medication", "medicine",
        "treatment", "treatments",
    }
    word = word.lower()
    if word in test_terms:
        return 1
    if word in problem_terms:
        return 2
    if word in treatment_terms:
        return 3
    return 0
a/clicon/features_dir/wordshape.py b/code/feature_extraction/wordshape.py similarity index 99% rename from clicon/features_dir/wordshape.py rename to code/feature_extraction/wordshape.py index 06844bd..9f25474 100644 --- a/clicon/features_dir/wordshape.py +++ b/code/feature_extraction/wordshape.py @@ -415,7 +415,6 @@ def wordShapeChris1 (s): else: return "SYMBOL" - # gets Chris1, Dan1, Jenny1, Chris2 and Dan2 word shapes def getWordShapes(word): return [wordShapeChris1(word), wordShapeDan1(word), wordShapeJenny1(word), wordShapeChris2(word, False, None), wordShapeDan2(word, None)] diff --git a/clicon/format.py b/code/format.py similarity index 88% rename from clicon/format.py rename to code/format.py index c205e96..22e2ce5 100644 --- a/clicon/format.py +++ b/code/format.py @@ -1,5 +1,5 @@ ###################################################################### -# CliCon - format.py # +# CliNER - format.py # # # # Willie Boag wboag@cs.uml.edu # # # @@ -7,21 +7,17 @@ ###################################################################### -__author__ = 'Willie Boag' -__date__ = 'Jul. 
3, 2014' - - - import argparse import sys import os import glob -import helper from notes.note import Note +import tempfile - +cliner_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +tmp_dir = os.path.join(cliner_dir, 'data', 'tmp') def create_filename(odir, bfile, extension): fname = os.path.basename(bfile) + extension @@ -34,23 +30,23 @@ def main(): # Argument Parser parser = argparse.ArgumentParser() - parser.add_argument("-t", + parser.add_argument("-txt", dest = "txt", help = "The files that contain the training examples", ) - parser.add_argument("-a", + parser.add_argument("-annotations", dest = "annotations", help = "The files that contain the labels for the training examples", ) - parser.add_argument("-o", + parser.add_argument("-out", dest = "out", default = None, help = "Directory to output data", ) - parser.add_argument("-f", + parser.add_argument("-format", dest = "format", help = "Output format (%s)"%str(' or '.join(Note.supportedFormats())), ) @@ -105,20 +101,22 @@ def main(): for f,ext in Note.dictOfFormatToExtensions().items(): if ext == in_extension: in_format = f - + # Read input data into note object in_note = Note(in_format) - in_note.read(txt,annotations) + in_note.read(txt,annotations) # Convert data to standard format internal_output = in_note.write_standard() - tmp_file = 'tmp_file.txt' + + os_handle,tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="format_temp") with open(tmp_file, 'w') as f: f.write(internal_output) + os.close(os_handle) #print internal_output - + # Read internal standard data into new file with given output format out_note = Note(format) out_note.read_standard(txt,tmp_file) diff --git a/code/helper_dataset.py b/code/helper_dataset.py new file mode 100644 index 0000000..0e6a6fd --- /dev/null +++ b/code/helper_dataset.py @@ -0,0 +1,295 @@ +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- +""" +Created on Fri Jun 16 15:34:02 2017 + +@author: elena +""" +from __future__ import print_function +import codecs 
import numpy as np
import collections
import operator
import re
import os
import shutil

# h5py and tensorflow are heavyweight and only needed by a couple of helpers
# below; tolerate their absence so the pure helpers stay importable.
try:
    import h5py
except ImportError:
    h5py = None
try:
    import tensorflow as tf
except ImportError:
    tf = None


def variable_summaries(var):
    '''
    Attach a lot of summaries to a Tensor (for TensorBoard visualization).
    From https://www.tensorflow.org/get_started/summaries_and_tensorboard
    (Requires tensorflow to be installed.)
    '''
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)


def load_parameters_from_file(parameters_file_name):
    '''
    Parse a whitespace-separated "key value" parameter file into a dict.

    Values are coerced: integer-looking strings become ints, the literals
    'True'/'False' become booleans, anything else stays a string.
    '''
    d = {}
    with open(parameters_file_name) as f:
        for line in f:
            # skip blank lines (the original crashed unpacking them)
            if not line.strip():
                continue
            (key, val) = line.split()
            if is_number(val):
                d[key] = int(val)
            elif is_boolean(val):
                # BUG FIX: the original stored bool(val), and bool('False')
                # is True — compare against the literal instead.
                d[key] = (val == 'True')
            else:
                d[key] = val
    return d


def is_number(s):
    '''True when `s` parses as an int.'''
    try:
        int(s)
        return True
    except ValueError:
        return False


def is_boolean(s):
    '''True when `s` is the literal string 'True' or 'False'.'''
    return s in ['True', 'False']


def get_features_for_sentence(dataset_adress, sentence_number):
    '''
    Slice the per-token feature rows of one sentence out of an hdf5 dataset.

    Assumes 'sentences-words' stores, per sentence, the contiguous row
    indices of its tokens inside 'word-features' — TODO confirm with the
    dataset writer.
    '''
    reading_table = h5py.File(dataset_adress, 'r')
    word_features = reading_table["word-features"]
    current_sentence = reading_table["sentences-words"][sentence_number]
    indicies = list(current_sentence)
    extracted = word_features[indicies[0]:indicies[-1] + 1, :]
    # the slice is materialized in memory, so the handle can be closed
    # (the original leaked it)
    reading_table.close()
    return extracted.tolist()


def get_size_of_features(main_data_file_address):
    '''
    Read the first real token line of a CoNLL-style file and return how many
    feature columns it has (every non-empty column except the last two).
    '''
    size_of_the_features_vector = 0
    with codecs.open(main_data_file_address, 'r', 'UTF-8') as f:
        for line in f:
            line = line.strip().split(' ')
            if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
                continue
            num_of_elem_line = len(line)
            token_features = [x for ind, x in enumerate(line)
                              if ind not in [num_of_elem_line - 1, num_of_elem_line - 2] and x != ""]
            size_of_the_features_vector = len(token_features)
            break
    return size_of_the_features_vector


def create_folder_if_not_exists(directory):
    '''
    Create the folder if it doesn't exist already.
    '''
    if not os.path.exists(directory):
        os.makedirs(directory)


def copytree(src, dst, symlinks=False, ignore=None):
    '''
    Copy src's children into an *existing* dst directory (plain
    shutil.copytree refuses to when dst exists).
    http://stackoverflow.com/questions/1868714/how-do-i-copy-an-entire-directory-of-files-into-an-existing-directory-using-pyth
    '''
    for item in os.listdir(src):
        s = os.path.join(src, item)
        d = os.path.join(dst, item)
        if os.path.isdir(s):
            shutil.copytree(s, d, symlinks, ignore)
        else:
            shutil.copy2(s, d)


def string_to_list_of_lists(string):  # NOT IN USE; kept from the old feature representation
    '''Split a '#newtoken#'-delimited feature string into per-token float lists.'''
    list_of_tokens = []
    feature_list = []
    for feature in string.split(" "):
        if feature == "#newtoken#":
            list_of_tokens.append(feature_list)
            feature_list = []
        else:
            try:
                feature_list.append(float(feature))
            except ValueError:
                continue
    return list_of_tokens


def get_valid_dataset_filepaths(parameters):
    '''Map 'train'/'test' to '<dataset_text_folder>/<split>.txt'.'''
    return {split: os.path.join(parameters['dataset_text_folder'], '{0}.txt'.format(split))
            for split in ['train', 'test']}


def remove_file_name_from_the_path_string(path_string):
    '''Drop the final path component (similar to os.path.dirname).'''
    parts = path_string.split(os.sep)
    return os.sep.join(parts[:-1])
def reverse_dictionary(dictionary):
    '''
    Invert a mapping (values become keys), preserving OrderedDict-ness.
    http://stackoverflow.com/questions/483666/python-reverse-inverse-a-mapping
    http://stackoverflow.com/questions/25480089/right-way-to-initialize-an-ordereddict-using-its-constructor-such-that-it-retain
    '''
    if type(dictionary) is collections.OrderedDict:
        return collections.OrderedDict([(v, k) for k, v in dictionary.items()])
    else:
        return {v: k for k, v in dictionary.items()}


def is_token_in_pretrained_embeddings(token, all_pretrained_tokens, parameters):
    '''
    True when the digit-normalized, lowercased token has a pretrained vector.
    (`parameters` is unused; kept for interface compatibility.)
    '''
    return re.sub('\d', '0', token.lower()) in all_pretrained_tokens


def remove_bio_from_label_name(label_name):
    '''Strip a leading BIOES prefix ('B-', 'I-', 'E-', 'S-'); 'O' passes through.'''
    if label_name[:2] in ['B-', 'I-', 'E-', 'S-']:
        return label_name[2:]
    # anything that is neither prefixed nor 'O' is a malformed label
    assert(label_name == 'O')
    return label_name


def load_pretrained_token_embeddings(parameters):
    '''
    Parse a GloVe/word2vec-style text file into {token: numpy vector}.

    @param parameters. Dict with key 'token_pretrained_embedding_filepath'.
    @return Dictionary of token -> np.array vector.
    '''
    file_input = codecs.open(parameters['token_pretrained_embedding_filepath'], 'r', 'UTF-8')
    token_to_vector = {}
    for cur_line in file_input:
        cur_line = cur_line.strip().split(' ')
        # BUG FIX: a blank line splits to [''] (length 1), so the original
        # `len(cur_line)==0` guard never fired and '' entered the dict with
        # an empty vector — test the first field instead.
        if not cur_line[0]:
            continue
        token = cur_line[0]
        token_to_vector[token] = np.array([float(x) for x in cur_line[1:]])
    file_input.close()
    return token_to_vector


def load_tokens_from_pretrained_token_embeddings(parameters):
    '''Like load_pretrained_token_embeddings() but returns only the token set.'''
    file_input = codecs.open(parameters['token_pretrained_embedding_filepath'], 'r', 'UTF-8')
    tokens = set()
    for cur_line in file_input:
        cur_line = cur_line.strip().split(' ')
        # same blank-line fix as load_pretrained_token_embeddings()
        if not cur_line[0]:
            continue
        tokens.add(cur_line[0])
    file_input.close()
    return tokens


def pad_list(old_list, padding_size, padding_value):  # ONE SIDED, might have issues for BIDIRECTIONAL LSTM BATCH NORMALIZATION
    '''
    Right-pad old_list to padding_size with padding_value.
    http://stackoverflow.com/questions/3438756/some-built-in-to-pad-a-list-in-python
    Example: pad_list([6,2,3], 5, 0) returns [6,2,3,0,0]
    '''
    assert padding_size >= len(old_list)
    return old_list + [padding_value] * (padding_size - len(old_list))


def get_parsed_conll_output(conll_output_filepath):
    '''
    Parse the text report produced by the conlleval script.

    @return {'all': {accuracy, precision, recall, f1, support},
             <entity type>: {precision, recall, f1, support}, ...}
    '''
    conll_output = [l.rstrip().replace('%', '').replace(';', '').replace(':', '').strip()
                    for l in codecs.open(conll_output_filepath, 'r', 'utf8')]
    parsed_output = {}
    # second line of the report holds the overall scores
    line = conll_output[1].split()
    parsed_output['all'] = {'accuracy': float(line[1]),
                            'precision': float(line[3]),
                            'recall': float(line[5]),
                            'f1': float(line[7])}
    total_support = 0
    # remaining lines: one row of scores per entity type
    for line in conll_output[2:]:
        line = line.split()
        phi_type = line[0].replace('_', '-')
        support = int(line[7])
        total_support += support
        parsed_output[phi_type] = {'precision': float(line[2]),
                                   'recall': float(line[4]),
                                   'f1': float(line[6]),
                                   'support': support}
    parsed_output['all']['support'] = total_support

    print(parsed_output['all'])
    return parsed_output
+#extract_from_the_tree("FIXED_I2B2_XML/i2b2_2012/training/28.xml","") + + #tokenize=word_tokenize(test) + # print tokenize +#write_all_files_into_one_file("FIXED_I2B2_XML/i2b2_2012/training/") # Add flag "Deal with double qotes as if they were marked -1,1 text global span-move+2 +#timeexp,spanlist = extract_from_the_tree("28.xml","28.xml.txt") +#z=map_time_exp_to_text(spanlist,timeexp) +#write_to_file_pseudo_conLL(z) + +#opening_path={'token_pretrained_embedding_filepath':'glove.6B.100d.txt',"freeze_token_embeddings" :'True'} +#tokens=load_tokens_from_pretrained_token_embeddings(opening_path) +#horrible_list=load_pretrained_token_embeddings(opening_path) + +#print horrible_list["cancer"] +#print tokens \ No newline at end of file diff --git a/clicon/notes/__init__.py b/code/machine_learning/__init__.py similarity index 100% rename from clicon/notes/__init__.py rename to code/machine_learning/__init__.py diff --git a/clicon/machine_learning/crf.py b/code/machine_learning/crf.py old mode 100644 new mode 100755 similarity index 63% rename from clicon/machine_learning/crf.py rename to code/machine_learning/crf.py index e978f50..edd86a6 --- a/clicon/machine_learning/crf.py +++ b/code/machine_learning/crf.py @@ -9,13 +9,15 @@ import sys import os - +import tempfile import pycrfsuite -count = 0 - +from tools import compute_performance_stats +from feature_extraction.read_config import enabled_modules +cliner_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +tmp_dir = os.path.join(cliner_dir, 'data', 'tmp') def format_features(rows, labels=None): @@ -98,9 +100,17 @@ def pycrf_instances(fi, labeled): if labeled: yseq.append(fields[0]) - - -def train(X, Y, do_grid): +def train(X, Y, val_X=None, val_Y=None, test_X=None, test_Y=None): + ''' + train() + Train a Conditional Random Field for sequence tagging. + + @param X. List of sparse-matrix sequences. Each sequence is one sentence. + @param Y. List of sequence tags. 
Each sequence is the sentence's per-token tags. + @param val_X. More X data, but a heldout dev set. + @param val_Y. More Y data, but a heldout dev set. + @return A tuple of encoded parameter weights and hyperparameters for predicting. + ''' # Sanity Check detection: features & label #with open('a','w') as f: @@ -109,40 +119,55 @@ def train(X, Y, do_grid): # print >>f, y, '\t', x.nonzero()[1][0] # print >>f - # Format features fot crfsuite feats = format_features(X,Y) - # Create a Trainer object. trainer = pycrfsuite.Trainer(verbose=False) for xseq, yseq in pycrf_instances(feats, labeled=True): trainer.append(xseq, yseq) - - # Set paramters - if do_grid: - 'Grid Search not implemented yet' - - # Train the model - tmp_file = 'clicon-crf-tmp.txt' + os_handle,tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="crf_temp") trainer.train(tmp_file) - - # Read the trained model into a string + # Read the trained model into a string (so it can be pickled) model = '' with open(tmp_file, 'rb') as f: model = f.read() - + os.close(os_handle) # Remove the temporary file os.remove(tmp_file) + ###################################################################### + + # information about fitting the model + scores = {} + + # how well does the model fir the training data? + train_pred = predict(model, X) + train_stats = compute_performance_stats('train', train_pred, Y) + scores['train'] = train_stats + + if val_X: + val_pred = predict(model, val_X) + val_stats = compute_performance_stats('dev', val_pred, val_Y) + scores['dev'] = val_stats - return model + if test_X: + test_pred = predict(model, test_X) + test_stats = compute_performance_stats('test', test_pred, test_Y) + scores['test'] = test_stats + # keep track of which external modules were used for building this model! 
+ scores['hyperparams'] = {} + enabled_mods = enabled_modules() + for module,enabled in enabled_mods.items(): + e = bool(enabled) + scores['hyperparams'][module] = e + return model, scores def predict(clf, X): @@ -150,22 +175,20 @@ def predict(clf, X): # Format features fot crfsuite feats = format_features(X) - # Dump the model into a temp file - tmp_file = 'clicon-crf-tmp.txt' + os_handle,tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="crf_temp") with open(tmp_file, 'wb') as f: - f.write(clf) - + clf_byte = bytearray(clf, 'latin1') + f.write(clf_byte) # Create the Tagger object tagger = pycrfsuite.Tagger() tagger.open(tmp_file) - # Remove the temp file + os.close(os_handle) os.remove(tmp_file) - - + # Tag the sequence retVal = [] Y = [] @@ -173,7 +196,6 @@ def predict(clf, X): yseq = [ int(n) for n in tagger.tag(xseq) ] retVal += list(yseq) Y.append(list(yseq)) - # Sanity Check detection: feature & label predictions #with open('a','w') as f: # for x,y in zip(xseq,Y): @@ -181,5 +203,4 @@ def predict(clf, X): # print >>f, y, '\t', x[:-2] # print >>f - - return retVal + return Y diff --git a/code/model.py b/code/model.py new file mode 100644 index 0000000..e54ab5a --- /dev/null +++ b/code/model.py @@ -0,0 +1,711 @@ +###################################################################### +# CliNER - model.py # +# # +# Willie Boag # +# # +# Purpose: Define the model for clinical concept extraction. 
# +###################################################################### + +import sys +from sklearn.feature_extraction import DictVectorizer +import os +import random +import math +import io +import numpy as np +from time import localtime, strftime +from collections import defaultdict + +from notes.documents import labels as tag2id, id2tag +from tools import flatten, save_list_structure, reconstruct_list +from tools import print_str, print_vec, print_files, write + + +cliner_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +tmp_dir = os.path.join(cliner_dir, 'data', 'tmp') + +class ClinerModel: + + def log(self, out, model_file=None): + ''' + ClinerModel::log() + Log training information of model. + @param out. Either a filename or file channel to output the log string. + @param model_file. A path to optionally identify where the model was saved. + @return None + ''' + if not self._log: + log = self.__log_str(model_file) + else: + log = self._log + + # depending on whether it is already opened as a channel + if isinstance(out,type(sys.stdout)): + write(out, '%s\n' % log) + else: + with open(out, 'a') as f: + write(f, '%s\n' % log) + + + def __log_str_NEURAL(self,model_file=None): + "" + + + def __log_str(self, model_file=None): + ''' + ClinerModel::__log_str() + Build a string of information about training for the model's log file. + @param model_file. A path to optionally identify where the model was saved. 
+ @return A string of the model's training information + ''' + + assert self._is_trained, 'ClinerModel not trained' + with io.StringIO() as f: + write(f, u'\n') + write(f, '-'*40) + write(f, u'\n\n') + if model_file: + write(f, 'model : %s\n' % os.path.abspath(model_file)) + write(f, u'\n') + + if self._use_lstm: + write(f, u'modeltype: LSTM\n') + else: + write(f, u'modeltype: CRF\n') + + if 'hyperparams' in self._score: + for name,value in self._score['hyperparams'].items(): + write(f, u'\t%-10s: %s\n' % (name,value)) + write(f, u'\n') + + print_str(f, 'features', self._features) + write(f, u'\n') + + write(f, u'\n') + write(f, 'training began: %s\n' % self._time_train_begin) + write(f, 'training ended: %s\n' % self._time_train_end) + write(f, u'\n') + + write(f, u'scores\n') + print_vec(f, 'train precision', self._score['train']['precision']) + print_vec(f, 'train recall ', self._score['train']['recall' ]) + print_vec(f, 'train f1 ', self._score['train']['f1' ]) + write(f, self._score['train']['conf']) + + if 'dev' in self._score: + print_vec(f, u'dev precision ', self._score['dev']['precision']) + print_vec(f, u'dev recall ', self._score['dev']['recall' ]) + print_vec(f, u'dev f1 ', self._score['dev']['f1' ]) + write(f, self._score['dev']['conf']) + + if 'test' in self._score: + print_vec(f, u'test precision ', self._score['test']['precision']) + print_vec(f, u'test recall ', self._score['test']['recall' ]) + print_vec(f, u'test f1 ', self._score['test']['f1' ]) + write(f, self._score['test']['conf']) + + if 'history' in self._score: + for label,vec in self._score['history'].items(): + print_vec(f, '%-16s'%label, vec) + write(f, u'\n') + + if self._training_files: + write(f, u'\n') + write(f, u'Training Files\n') + if len(self._training_files) < 200: + print_files(f, self._training_files) + else: + write(f, '\t%d files\n'%len(self._training_files)) + write(f, u'\n') + + write(f, u'-'*40) + write(f, u'\n\n') + + # get output as full string + contents = 
f.getvalue() + return contents + + + def __init__(self, use_lstm): + + """ + ClinerModel::__init__() + + Instantiate a ClinerModel object. + + @param use_lstm. Bool indicating whether to train a CRF or LSTM. + """ + + self._use_lstm = use_lstm + self._is_trained = False + self._clf = "latin1" + self._vocab = None + self._training_files = None + self._log = None + self._text_feats = None + + # Import the tools for either CRF or LSTM + if use_lstm: + # NEW + import DatasetCliner_experimental as Exp + + import tensorflow as tf + import entity_lstm as entity_model + import training_predict_LSTM + import pickle + import copy + import helper_dataset as hd + import shutil + + self._pretrained_dataset=None + self._pretrained_wordvectors=None + + self._current_model=None + self._parameters=None + + + + + def train(self, train_notes, val=[], test=[]): + """ + ClinerModel::train() + + Purpose: Train a Machine Learning model on annotated data + + @param notes. A list of Note objects (containing text and annotations) + @return None + """ + + # Extract formatted data + train_sents = flatten([n.getTokenizedSentences() for n in train_notes]) + train_labels = flatten([n.getTokenLabels() for n in train_notes]) + + if test: + test_sents = flatten([n.getTokenizedSentences() for n in test]) + test_labels = flatten([n.getTokenLabels() for n in test]) + else: + test_sents = [] + test_labels = [] + + if val: + print ("VAL") + val_sents = flatten([n.getTokenizedSentences() for n in val]) + val_labels = flatten([n.getTokenLabels() for n in val]) + self.train_fit(train_sents,train_labels,val_sents=val_sents,val_labels=val_labels,test_sents=test_sents,test_labels=test_labels) + + else: + print ("NO DEV") + self.train_fit(train_sents, train_labels, dev_split=0.1, + test_sents=test_sents, test_labels=test_labels) + + self._train_files = [ n.getName() for n in train_notes+val ] + + + def train_fit(self, train_sents, train_labels, val_sents=None, val_labels=None, + test_sents=None, 
test_labels=None, dev_split=None): + """ + ClinerModel::train_fit() + + Purpose: Train clinical concept extraction model using annotated data. + + @param train_sents. A list of sentences, where each sentence is tokenized into words. + @param train_labels. Parallel to 'train_sents', 7-way labels for concept spans. + @param val_sents. Validation data. Same format as tokenized_sents + @param val_labels. Validation data. Same format as iob_nested_labels + @param dev_split A real number from 0 to 1 + """ + # metadata + self._time_train_begin = strftime("%Y-%m-%d %H:%M:%S", localtime()) + + # train classifier + if self._use_lstm==False: + voc, clf, dev_score, enabled_features = generic_train('all', + train_sents , + train_labels , + self._use_lstm , + val_sents=val_sents , + val_labels=val_labels , + test_sents=test_sents , + test_labels=test_labels , + dev_split=dev_split ) + self._is_trained = True + self._vocab = voc + self._clf = clf + self._score = dev_score + self._features = enabled_features + # metadata + self._time_train_end = strftime("%Y-%m-%d %H:%M:%S", localtime()) + + + + + else: + print ("IN ERROR CHECK") + print (dev_split) + parameters,dataset,best = generic_train('all', + train_sents , + train_labels , + self._use_lstm , + val_sents=val_sents , + val_labels=val_labels , + test_sents=test_sents , + test_labels=test_labels , + dev_split=dev_split ) + self._is_trained = True + self.pretrained_dataset=dataset + self.parameters=parameters + self._score=best + self._time_train_end = strftime("%Y-%m-%d %H:%M:%S", localtime()) + print ("BEST EPOCH") + print (best) + #self._vocab = voc + #self._clf = clf + #self._score = dev_score + #self._features = enabled_features + + # metadata + #self._time_train_end = strftime("%Y-%m-%d %H:%M:%S", localtime()) + + + def predict_classes_from_document(self, document): + """ + ClinerModel::predict_classes_from_documents() + + Predict concept annotations for a given document + + @param note. 
A Document object (containing text and annotations) + @return List of predictions + """ + # Extract formatted data + tokenized_sents = document.getTokenizedSentences() + + return self.predict_classes(tokenized_sents) + + + def predict_classes(self, tokenized_sents): + """ + ClinerModel::predict_classes() + + Predict concept annotations for unlabeled, tokenized sentences + + @param tokenized_sents. A list of sentences, where each sentence is tokenized + into words + @return List of predictions + """ + + hyperparams = {} + + # Predict labels for prose + if self._use_lstm: + if self.parameters==None: + hyperprams['parameters'] = hd.load_parameters_from_file("LSTM_parameters.txt") + + if self._pretrained_dataset==None: + temp_pretrained_dataset = os.path.join(hyperparams['parameters']['model_folder'], + "dataset.pickle") + hyperparams['pretrained_dataset'] = pickle.load(open(temp_pretrained_dataset_adress, 'rb')) + + vectorized_pred = generic_predict('all' , + tokenized_sents , + vocab = self._vocab , + clf = self._clf , + use_lstm = self._use_lstm, + hyperparams = hyperparams) + #pretrained_dataset=self._pretrained_dataset, + #tokens_to_vec=self._pretrained_wordvector, + #current_model=self._current_model, + #parameters=self.parameters) + + #self._current_model=model + + if self._use_lstm: + iob_pred = vectorized_pred + else: + iob_pred = [ [id2tag[p] for p in seq] for seq in vectorized_pred ] + + return iob_pred + + + +############################################################################ +### Lowest-level (interfaces to ML modules) ### +############################################################################ + +def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, val_labels=None, test_sents=None, test_labels=None, dev_split=None): + + ''' + generic_train() + + Train a model that works for both prose and nonprose + + @param p_or_n. A string that indicates "prose", "nonprose", or "all" + @param train_sents. 
A list of sentences; each sentence is tokenized into words + @param train_labels. Parallel to `train_sents`, 7-way labels for concept spans + @param use_lstm Bool indicating whether to train CRF or LSTM. + @param val_sents. Validation data. Same format as train_sents + @param val_labels. Validation data. Same format as train_labels + @param dev_split. A real number from 0 to 1 + ''' + + # Must have data to train on: + if len(train_sents) == 0: + raise Exception('Training must have %s training examples' % p_or_n) + + # if you should split the data into train/dev yourself + if (not val_sents) and (dev_split > 0.0) and (len(train_sents)>10): + + p = int(dev_split*100) + sys.stdout.write('\tCreating %d/%d train/dev split\n' % (100-p,p)) + + perm = list(range(len(train_sents))) + random.shuffle(perm) + + train_sents = [ train_sents[i] for i in perm ] + train_labels = [ train_labels[i] for i in perm ] + + ind = int(dev_split*len(train_sents)) + + val_sents = train_sents[:ind ] + train_sents = train_sents[ ind:] + + val_labels = train_labels[:ind ] + train_labels = train_labels[ ind:] + else: + sys.stdout.write('\tUsing existing validation data\n') + + + sys.stdout.write('\tvectorizing words %s\n' % p_or_n) + + + if use_lstm: + print ("TESTING NEW DATSET OBJECT") + dataset = Exp.Dataset() + + parameters=hd.load_parameters_from_file("LSTM_parameters.txt") + parameters['use_pretrained_model']=False + + + + Datasets_tokens={} + Datasets_labels={} + + Datasets_tokens['train']=train_sents + Datasets_labels['train']=train_labels + + if val_sents!=None: + Datasets_tokens['valid']=val_sents + Datasets_labels['valid']=val_labels + + if test_sents!=None: + Datasets_tokens['test']=test_sents + Datasets_labels['test']=test_labels + + dataset.load_dataset(Datasets_tokens,Datasets_labels,"",parameters) + pickle.dump(dataset, open(os.path.join(parameters['model_folder'], 'dataset.pickle'), 'wb')) + + print (Datasets_tokens['valid'][0]) + print (Datasets_tokens['test'][0]) + + + 
parameters['Feature_vector_length']=dataset.feature_vector_size + parameters['use_features_before_final_lstm']=False + parameters['learning_rate']=0.005 + + + + sess = tf.Session() + number_of_sent=list(range(len(dataset.token_indices['train']))) + + with sess.as_default(): + model=entity_model.EntityLSTM(dataset,parameters) + sess.run(tf.global_variables_initializer()) + model.load_pretrained_token_embeddings(sess, dataset,parameters) + epoch_number = -1 + transition_params_trained = np.random.rand(5+2,5+2) + values={} + values["best"]=0 + + f1_dictionary={} + f1_dictionary['best']=0 + + model_saver = tf.train.Saver(max_to_keep=100) + + print ("START TRAINING") + + eval_dir = os.path.join(tmo_dir, 'cliner_eval_%d' % random.randint(0,256)+os.sep) + parameters['conll_like_result_folder']=eval_dir + + + test_temp = os.path.join(parameters['conll_like_result_folder'], 'test/') + train_temp = os.path.join(parameters['conll_like_result_folder'], 'train/') + valid_temp = os.path.join(parameters['conll_like_result_folder'], 'valid/') + + os.mkdir(parameters['conll_like_result_folder']) + os.mkdir(test_temp) + os.mkdir(train_temp) + os.mkdir(valid_temp) + + + + while epoch_number<90: + average_loss_per_phrase=0 + accuracy_per_phase=0 + step = 0 + + epoch_number += 1 + if epoch_number != 0: + sequence_numbers=list(range(len(dataset.token_indices['train']))) + random.shuffle(sequence_numbers) + for sequence_number in sequence_numbers: + loss,accuracy,transition_params_trained=training_predict_LSTM.train_step(sess, dataset, sequence_number, model) + average_loss_per_phrase+=loss + accuracy_per_phase+=accuracy + step += 1 + if step % 10 == 0: + print('Training {0:.2f}% done\n'.format(step/len(sequence_numbers)*100)) + + model_saver.save(sess, os.path.join(parameters['model_folder'], 'model_{0:05d}.ckpt'.format(epoch_number))) + + + + total_loss=average_loss_per_phrase + total_accuracy=accuracy_per_phase + + average_loss_per_phrase=average_loss_per_phrase/len(number_of_sent) + 
accuracy_per_phase=accuracy_per_phase/len(number_of_sent) + + + if epoch_number>0: + "" + f1,predictions=training_predict_LSTM.prediction_step(sess,dataset,"test",model,epoch_number,parameters['conll_like_result_folder'],transition_params_trained) + f1_train,_=training_predict_LSTM.prediction_step(sess,dataset,"train", model,epoch_number,parameters['conll_like_result_folder'],transition_params_trained) + f1_valid,_=training_predict_LSTM.prediction_step(sess,dataset,"valid", model,epoch_number,parameters['conll_like_result_folder'],transition_params_trained) + + + correctly_predicted_tokens=training_predict_LSTM.compute_train_accuracy(parameters['conll_like_result_folder']+"valid"+os.sep+"epoche_"+str(epoch_number)+".txt") + + if f1_dictionary['best'] + + # Collect list of feature types + enabled_features = set() + for sf in text_features: + for wf in sf: + for (feature_type,instance),value in wf.items(): + if feature_type.startswith('prev'): + feature_type = 'PREV*' + if feature_type.startswith('next'): + feature_type = 'NEXT*' + enabled_features.add(feature_type) + enabled_features = sorted(enabled_features) + + + # Vectorize features + vocab = DictVectorizer() + flat_X_feats = vocab.fit_transform( flatten(text_features) ) + X_feats = reconstruct_list(flat_X_feats, save_list_structure(text_features)) + + + # vectorize IOB labels + Y_labels = [ [tag2id[y] for y in y_seq] for y_seq in train_labels ] + + assert len(X_feats) == len(Y_labels) + for i in range(len(X_feats)): + assert X_feats[i].shape[0] == len(Y_labels[i]) + + + # if there is specified validation data, then vectorize it + if val_sents: + # vectorize validation X + val_text_features = extract_features(val_sents) + flat_val_X_feats = vocab.transform( flatten(val_text_features) ) + val_X = reconstruct_list(flat_val_X_feats, + save_list_structure(val_text_features)) + # vectorize validation Y + val_Y = [ [tag2id[y] for y in y_seq] for y_seq in val_labels ] + + # if there is specified test data, then 
vectorize it + if test_sents: + # vectorize test X + test_text_features = extract_features(test_sents) + flat_test_X_feats = vocab.transform( flatten(test_text_features) ) + test_X = reconstruct_list(flat_test_X_feats, + save_list_structure(test_text_features)) + # vectorize test Y + test_Y = [ [tag2id[y] for y in y_seq] for y_seq in test_labels ] + else: + test_X = None + test_Y = None + + + sys.stdout.write('\ttraining classifiers %s\n' % p_or_n) + + if use_lstm: + # train using lstm + clf, dev_score = keras_ml.train(X_seq_ids, Y_labels, tag2id, len(vocab), + val_X_ids=val_X, val_Y_ids=val_Y, + test_X_ids=test_X, test_Y_ids=test_Y) + else: + # train using crf + from machine_learning import crf + clf, dev_score = crf.train(X_feats, Y_labels, val_X=val_X, val_Y=val_Y, + test_X=test_X, test_Y=test_Y) + + return vocab, clf, dev_score, enabled_features + + + +#def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_dataset=None,tokens_to_vec=None, current_model=None, parameters=None): +def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, hyperparams): + ''' + generic_predict() + + Train a model that works for both prose and nonprose + + @param p_or_n. A string that indicates "prose", "nonprose", or "all" + @param tokenized_sents. A list of sentences, where each sentence is tokenized + into words + @param vocab. A dictionary mapping word tokens to numeric indices. + @param clf. An encoding of the trained keras model. + @param use_lstm. Bool indicating whether clf is a CRF or LSTM. 
+ ''' + # use_lstm=self._use_lstm + if use_lstm: + + #parameters=hd.load_parameters_from_file("LSTM_parameters.txt") + parameters['use_pretrained_model']=True + + #model_folder="./models/NN_models" + predictions=[] + sys.stdout.write('\n use_lstm \n') + dataset = Exp.Dataset() + + fictional_labels= copy.deepcopy(tokenized_sents) + for idx,x in enumerate(fictional_labels): + for val_id,value in enumerate(x): + fictional_labels[idx][val_id]='O' + + Datasets_tokens={} + Datasets_labels={} + + Datasets_tokens['deploy']=tokenized_sents + Datasets_labels['deploy']=fictional_labels + + token_to_vector=dataset.load_dataset(Datasets_tokens, Datasets_labels, "", parameters,token_to_vector=tokens_to_vec, pretrained_dataset=pretrained_dataset) + + print (dataset.token_indices.keys()) + + parameters['Feature_vector_length']=dataset.feature_vector_size + parameters['use_features_before_final_lstm']=False + + + dataset.update_dataset("", ['deploy'],Datasets_tokens,Datasets_labels) + + del Datasets_tokens + del Datasets_labels + + + #model=current_model + model=entity_model.EntityLSTM(dataset,parameters) + + os.mkdir(parameters['conll_like_result_folder']) + + + test_temp = os.path.join(parameters['conll_like_result_folder'], 'test/') + train_temp = os.path.join(parameters['conll_like_result_folder'], 'train/') + valid_temp = os.path.join(parameters['conll_like_result_folder'], 'valid/') + + + os.mkdir(test_temp) + os.mkdir(train_temp) + os.mkdir(valid_temp) + + sess = tf.Session() + with sess.as_default(): + + #model=entity_model.EntityLSTM(dataset,parameters) + transition_params_trained=model.restore_from_pretrained_model(parameters, dataset, sess, token_to_vector=token_to_vector,pretrained_dataset=pretrained_dataset) + del token_to_vector + predictions=training_predict_LSTM.prediction_step(sess,dataset,"deploy",model,0,parameters['conll_like_result_folder'],transition_params_trained) + sess.close() + + tf.reset_default_graph() + + 
shutil.rmtree(parameters['conll_like_result_folder']) + return predictions, model + + + # If nothing to predict, skip actual prediction + if len(tokenized_sents) == 0: + sys.stdout.write('\tnothing to predict %s\n' % p_or_n) + return [] + + sys.stdout.write('\tvectorizing words %s\n' % p_or_n) + + if use_lstm: + print('todo: incorporate lstm') + # vectorize tokenized sentences + #X = [] + #for sent in tokenized_sents: + # id_seq = [] + # for w in sent: + # if w in vocab: + # id_seq.append(vocab[w]) + # else: + # id_seq.append(vocab['oov']) + # X.append(id_seq) + else: + from feature_extraction.features import extract_features + + # vectorize validation X + text_features = extract_features(tokenized_sents) + flat_X_feats = vocab.transform( flatten(text_features) ) + X = reconstruct_list(flat_X_feats, save_list_structure(text_features)) + + sys.stdout.write('\tpredicting labels %s\n' % p_or_n) + + # Predict labels + if use_lstm: + print ("TEST_PREDICT") + exit() + + + else: + from machine_learning import crf + predictions = crf.predict(clf, X) + + # Format labels from output + return predictions + diff --git a/code/notes/__init__.py b/code/notes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/code/notes/documents.py b/code/notes/documents.py new file mode 100644 index 0000000..2ed1b58 --- /dev/null +++ b/code/notes/documents.py @@ -0,0 +1,380 @@ +###################################################################### +# CliNER - documents.py # +# # +# Willie Boag wboag@cs.uml.edu # +# # +# Purpose: Build model for given training data. 
# +###################################################################### + + + +import string +import re +import nltk +import os + +from tools import clean_text, normalize_tokens + + +labels = { 'O':0, + 'B-problem':1, 'B-test':2, 'B-treatment':3, + 'I-problem':4, 'I-test':5, 'I-treatment':6, + } + +id2tag = { v:k for k,v in labels.items() } + + +class Document: + + def __init__(self, txt, con=None): + # read data + retVal = read_i2b2(txt, con) + + # Internal representation natural for i2b2 format + self._tok_sents = retVal[0] + + # Store token labels + if con: + self._tok_concepts = retVal[1] + self._labels = tok_concepts_to_labels(self._tok_sents, + self._tok_concepts) + + # save filename + self._filename = txt + + + def getName(self): + return os.path.basename(self._filename).split('.')[0] + + + def getExtension(self): + return 'con' + + + def getTokenizedSentences(self): + return self._tok_sents + + + def getTokenLabels(self): + return self._labels + + + def conlist(self): + return self._labels + + + def write(self, pred_labels=None): + """ + Purpose: Return the given concept label predictions in i2b2 format + + @param pred_labels. of predicted_labels + @return of i2b2-concept-file-formatted data + """ + + # Return value + retStr = '' + + # If given labels to write, use them. 
Default to classifications + if pred_labels != None: + token_labels = pred_labels + elif self._labels != None: + token_labels = self._labels + else: + raise Exception('Cannot write concept file: must specify labels') + + concept_tuples = tok_labels_to_concepts(self._tok_sents, token_labels) + + # For each classification + for classification in concept_tuples: + + # Ensure 'none' classifications are skipped + if classification[0] == 'none': + raise('Classification label "none" should never happen') + + concept = classification[0] + lineno = classification[1] + start = classification[2] + end = classification[3] + + # A list of words (corresponding line from the text file) + text = self._tok_sents[lineno-1] + + #print("\n" + "-" * 80) + #print("classification: ", classification) + #print("lineno: ", lineno) + #print("start: ", start) + #print("end ", end) + #print("text: ", text) + #print('len(text): ', len(text)) + #print("text[start]: ", text[start]) + #print("concept: ", concept) + + datum = text[start] + for j in range(start, end): + datum += " " + text[j+1] + datum = datum.lower() + + #print('datum: ', datum) + + # Line:TokenNumber of where the concept starts and ends + idx1 = "%d:%d" % (lineno, start) + idx2 = "%d:%d" % (lineno, end ) + + # Classification + label = concept + + # Print format + retStr += "c=\"%s\" %s %s||t=\"%s\"\n" % (datum, idx1, idx2, label) + + # return formatted data + return retStr.strip() + + + + +def read_i2b2(txt, con): + """ + read_i2b2() + + @param txt. A file path for the tokenized medical record + @param con. 
A file path for the i2b2 annotated concepts for txt + """ + tokenized_sents = [] + + sent_tokenize = lambda text: text.split('\n') + word_tokenize = lambda text: text.split(' ') + + # Read in the medical text + with open(txt) as f: + # Original text file + text = f.read().strip('\n') + + # tokenize + sentences = sent_tokenize(text) + for sentence in sentences: + sent = clean_text(sentence.rstrip()) + + # lowercase + sent = sent.lower() + + toks = word_tokenize(sent) + + # normalize tokens + normed_toks = normalize_tokens(toks) + + #for w in normed_toks: + # print(w) + #print() + + tokenized_sents.append(normed_toks) + + # If an accompanying concept file was specified, read it + tok_concepts = [] + if con: + with open(con) as f: + for line in f.readlines(): + # Empty line + if not line.strip(): + continue + + # parse concept line + concept_regex = '^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$' + match = re.search(concept_regex, line.strip()) + groups = match.groups() + + # retrieve regex info + concept_text = groups[0] + start_lineno = int(groups[1]) + start_tok_ind = int(groups[2]) + end_lineno = int(groups[3]) + end_tok_ind = int(groups[4]) + concept_label = groups[5] + + # pre-process text for error-check + #matching_line = tokenized_sents[start_lineno-1] + #matching_toks = matching_line[start_tok_ind:end_tok_ind+1] + #matching_text = ' '.join(matching_toks).lower() + #concept_text = ' '.join(word_tokenize(concept_text)) + + # error-check info + assert start_lineno==end_lineno, 'concept must span single line' + #assert concept_text==matching_text, 'something wrong with inds' + + # add the concept info + tup = (concept_label, start_lineno, start_tok_ind, end_tok_ind) + tok_concepts.append(tup) + + # Safe guard against concept file having duplicate entries + tok_concepts = list(set(tok_concepts)) + + # Concept file does not guarantee ordering by line number + tok_concepts = sorted(tok_concepts, key=lambda t:t[1:]) + + # Ensure no overlapping concepts (that would 
def tok_concepts_to_labels(tokenized_sents, tok_concepts):
    """
    Convert span-level concept tuples into per-token IOB labels.

    @param tokenized_sents. <list> of sentences, each a <list> of token strings.
    @param tok_concepts.    <list> of (label, lineno, start_tok, end_tok) tuples;
                            lineno is 1-indexed, token indices are 0-indexed
                            and inclusive.
    @return                 <list> of label rows parallel to tokenized_sents,
                            with 'B-<label>' / 'I-<label>' / 'O' tags.
    """
    # start from an all-'O' grid shaped exactly like the token grid
    iob_labels = [['O'] * len(sentence) for sentence in tokenized_sents]

    # paint each concept span: B- on the first token, I- on the rest
    for label, lineno, start_tok, end_tok in tok_concepts:
        row = iob_labels[lineno - 1]
        row[start_tok] = 'B-%s' % label
        for tok_ind in range(start_tok + 1, end_tok + 1):
            row[tok_ind] = 'I-%s' % label

    return iob_labels




def tok_labels_to_concepts(tokenized_sents, tok_labels):
    """
    Convert per-token IOB label rows back into span-level concept tuples.

    Stray I- tags (line-initial, or following 'O' / a different tag) are
    first promoted to B- tags; the repaired rows are then scanned for
    maximal B/I runs.

    @param tokenized_sents. <list> of sentences (each a <list> of tokens).
    @param tok_labels.      <list> of label rows parallel to tokenized_sents.
    @return                 <list> of (label, lineno, start_tok, end_tok)
                            tuples (lineno 1-indexed, token inds inclusive).
    """
    # 'B-treatment' -> ('B','treatment'); 'O' -> ('O',None)
    def split_label(label):
        if label == 'O':
            return 'O', None
        iob, tag = label.split('-')
        return iob, tag

    # repair pass: promote I- tags that cannot continue a span into B- tags;
    # note the previous-label check deliberately looks at the ORIGINAL row,
    # not the repaired one, matching the established decision boundary
    fixed_labels = []
    for row in tok_labels:
        fixed_row = []
        for ind, lab in enumerate(row):
            iob, tag = split_label(lab)
            if iob == 'I':
                if ind == 0:
                    # line-initial I has no B to attach to
                    print('CORRECTING! A')
                    lab = 'B' + lab[1:]
                else:
                    prev_iob, prev_tag = split_label(row[ind - 1])
                    if prev_iob == 'O' or prev_tag != tag:
                        # I after O, or after a different tag
                        print('CORRECTING! B')
                        lab = 'B' + lab[1:]
            fixed_row.append(lab)
        fixed_labels.append(fixed_row)

    # span extraction: every B starts a concept; extend through matching I's
    concepts = []
    for line_ind, row in enumerate(fixed_labels):
        n_toks = len(row)
        for begin in (j for j, lab in enumerate(row) if lab[0] == 'B'):
            tag_suffix = row[begin][1:]          # e.g. "-test"
            end = begin
            while end + 1 < n_toks and row[end + 1] == 'I' + tag_suffix:
                end += 1
            concepts.append((tag_suffix[1:], line_ind + 1, begin, end))

    # sanity check: regenerating labels from the spans must reproduce the
    # repaired rows (modulo a B regenerated where an I was predicted)
    regenerated = tok_concepts_to_labels(tokenized_sents, concepts)
    for regen_row, fixed_row in zip(regenerated, fixed_labels):
        for a, b in zip(regen_row, fixed_row):
            assert (a == b) or (a[0] == 'B' and b[0] == 'I' and a[1:] == b[1:])
    assert regenerated == fixed_labels

    return concepts




class DocumentException(Exception):
    """Raised when an annotation (.con) file cannot be processed."""
    pass
def main():
    """Command-line entry point: validate arguments, then run prediction."""
    parser = argparse.ArgumentParser()

    parser.add_argument("--txt",
                        dest = "txt",
                        help = ".txt files of discharge summaries",
                       )
    parser.add_argument("--out",
                        dest = "output",
                        help = "The directory to write the output",
                       )
    parser.add_argument("--model",
                        dest = "model",
                        help = "The model to use for prediction",
                       )
    parser.add_argument("--format",
                        dest = "format",
                        help = "Data format (i2b2)",
                       )

    args = parser.parse_args()

    def _usage_error(message):
        # echo usage plus the specific problem, then bail out
        parser.print_help(sys.stderr)
        sys.stderr.write(message)
        sys.stderr.write('\n')
        exit(1)

    # Error check: Ensure that file paths are specified
    if not args.txt:
        _usage_error('\n\tError: Must provide text files\n\n')
    if not args.output:
        _usage_error('\n\tError: Must provide output directory\n\n')
    if not args.model:
        _usage_error('\n\tError: Must provide path to model\n\n')
    if not os.path.exists(args.model):
        _usage_error('\n\tError: ClinerModel does not exist: %s\n\n' % args.model)

    # Parse arguments
    files = glob.glob(args.txt)
    tools.mkpath(args.output)

    if args.format:
        format = args.format
    else:
        parser.print_help(sys.stderr)
        sys.stderr.write('\n\tERROR: must provide "format" argument\n\n')
        exit(1)

    # Predict
    predict(files, args.model, args.output, format=format)


def predict(files, model_path, output_dir, format, use_lstm=True):
    """
    Use a trained ClinerModel to predict concept labels for text files.

    @param files.      <list> of .txt paths to annotate.
    @param model_path. Path to the pickled ClinerModel.
    @param output_dir. Directory where .con prediction files are written.
    @param format.     Output format; only 'i2b2' is supported.
    @param use_lstm.   Accepted for interface compatibility; the pickled
                       model's own _use_lstm flag is what actually decides.
    """
    # Must specify output format
    if format not in ['i2b2']:
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats: i2b2\n')
        sys.stderr.write('\n')
        exit(1)

    # Load model ('latin1' lets Python 3 unpickle Python-2-era models)
    with open(model_path, 'rb') as f:
        model = pickle.load(f, encoding='latin1')

    if model._use_lstm:
        # LSTM dependencies are imported lazily so the non-LSTM path does
        # not require tensorflow to be installed
        import helper_dataset as hd
        import DatasetCliner_experimental as Exp
        import entity_lstm as entity_model

        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = True

        temp_pretrained_dataset_adress = parameters['model_folder'] + os.sep + "dataset.pickle"
        # bug fix: close the dataset pickle instead of leaking the handle
        with open(temp_pretrained_dataset_adress, 'rb') as dataset_f:
            model._pretrained_dataset = pickle.load(dataset_f)
        model._pretrained_wordvector = hd.load_pretrained_token_embeddings(parameters)
        model._current_model = None

        print("END TEST")

    # Tell user if not predicting
    if not files:
        sys.stderr.write("\n\tNote: You did not supply any input files\n\n")
        exit()

    n = len(files)

    for i, txt in enumerate(sorted(files)):
        note = Document(txt)

        # Output file
        fname = os.path.splitext(os.path.basename(txt))[0] + '.' + 'con'
        out_path = os.path.join(output_dir, fname)

        if os.path.exists(out_path):
            # existing predictions are overwritten (deliberately not skipped)
            print('\tWARNING: prediction file already exists (%s)' % out_path)

        sys.stdout.write('%s\n' % ('-' * 30))
        sys.stdout.write('\n\t%d of %d\n' % (i+1, n))
        sys.stdout.write('\t%s\n\n' % txt)

        # Predict concept labels
        labels = model.predict_classes_from_document(note)

        # Get predictions in proper format
        output = note.write(labels)

        # Output the concept predictions
        sys.stdout.write('\n\nwriting to: %s\n' % out_path)
        with open(out_path, 'w') as f:
            write(f, '%s\n' % output)
        sys.stdout.write('\n')
def map_files(files):
    """Map a list of file paths to {basename-without-extension: full path}."""
    output = {}
    for f in files:  #pylint: disable=invalid-name
        basename = os.path.splitext(os.path.basename(f))[0]
        output[basename] = f
    return output


def mkpath(path):
    """Alias for mkdir -p: create path (and parents), tolerating existence."""
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def clean_text(text):
    """Replace every non-printable character with '@' so downstream
    tokenization only ever sees ASCII-printable text."""
    return ''.join(ch if ch in string.printable else '@' for ch in text)


def normalize_tokens(toks):
    """Collapse any token containing a digit to the placeholder '__num__'.

    todo: normalize dosages (including 8mg -> mg)
    """
    # bug fix: raw string -- '\d' is an invalid escape sequence warning on
    # modern Python
    return ['__num__' if re.search(r'\d', w) else w for w in toks]


def flatten(list_of_lists):
    '''
    flatten()
    Purpose: Given a list of lists, flatten one level deep
    @param list_of_lists. <list> of <list> of objects.
    @return <list> of objects (AKA flattened one level)
    >>> flatten([['a','b','c'],['d','e'],['f','g','h']])
    ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
    '''
    # perf fix: chain is linear; sum(lists, []) is quadratic in total length
    from itertools import chain
    return list(chain.from_iterable(list_of_lists))
def save_list_structure(list_of_lists):
    '''
    save_list_structure()
    Purpose: Record how to undo a one-level flatten.
    @param list_of_lists. <list> of <list> of objects.
    @return <list> of cumulative offsets; each is the index (into the
            flattened list) just past the end of one original sublist.
    >>> save_list_structure([['a','b','c'],['d','e'],['f','g','h']])
    [3, 5, 8]
    '''
    # running total of sublist lengths == cumulative end offsets
    offsets = []
    total = 0
    for sublist in list_of_lists:
        total += len(sublist)
        offsets.append(total)
    return offsets




def reconstruct_list(flat_list, offsets):
    '''
    reconstruct_list()
    Purpose: Undo a one-level flatten using save_list_structure()'s offsets.
    @param flat_list. <list> of objects.
    @param offsets    <list> of cumulative end offsets, one per original row.
    @return <list> of <list> of objects (the original structure).
    >>> reconstruct_list(['a','b','c','d','e','f','g','h'], [3,5,8])
    [['a', 'b', 'c'], ['d', 'e'], ['f', 'g', 'h']]
    '''
    pieces = []
    start = 0
    for end in offsets:
        pieces.append(flat_list[start:end])
        start = end
    return pieces




def load_pickled_obj(path_to_pickled_obj):
    """Deserialize and return the object stored in the given pickle file."""
    with open(path_to_pickled_obj, "rb") as f:
        payload = f.read()
    return pickle.loads(payload)


def pickle_dump(obj, path_to_obj):
    """Serialize obj to path_to_obj using the highest pickle protocol."""
    # NOTE: highest protocol makes loading TRAINED models slow
    with open(path_to_obj, 'wb') as f:
        pickle.dump(obj, f, -1)
def is_prose_sentence(sentence):
    """
    Heuristically decide whether a tokenized sentence is running prose.

    @param sentence. <list> of token strings.
    @return True for prose-looking sentences; False for headers, very
            short fragments, and list-style lines.
    """
    assert isinstance(sentence, list), 'is_prose_sentence() must take list arg'
    if sentence == []:
        return False
    #elif sentence[-1] == '.' or sentence[-1] == '?':
    elif sentence[-1] == '?':
        # questions read as prose
        return True
    elif sentence[-1] == ':':
        # section headers ("MEDICATIONS :") are not prose
        return False
    elif len(sentence) <= 5:
        # very short lines are treated as non-prose fragments
        return False
    elif is_at_least_half_nonprose(sentence):
        return True
    else:
        return False



def is_at_least_half_nonprose(sentence):
    """Return True if at least half of the tokens pass is_prose_word().

    NOTE(review): despite the name, this counts *prose* words -- kept
    as-is to preserve the established decision boundary.
    """
    # bug fix: len(filter(...)) raises TypeError on Python 3, where
    # filter() returns a lazy iterator with no len()
    count = sum(1 for word in sentence if is_prose_word(word))
    return count >= len(sentence) / 2



def is_prose_word(word):
    """A token counts as a prose word when it contains no punctuation,
    no digit, and is not written in all-uppercase."""
    # Punctuation
    for punc in string.punctuation:
        if punc in word:
            return False
    # Digit (raw string avoids the invalid-escape warning)
    if re.match(r'\d', word):
        return False
    # All uppercase
    if word == word.upper():
        return False
    # Else
    return True




def prose_partition(tokenized_sents, labels=None):
    """
    Split sentences (and, if given, their parallel label rows) into
    prose vs nonprose groups.

    @param tokenized_sents. <list> of sentences (each a <list> of tokens).
    @param labels. Optional <list> of label rows parallel to tokenized_sents.
    @return (prose, nonprose); each is a (sentences, labels-or-None) pair.
    """
    prose_sents = []
    nonprose_sents = []
    prose_labels = []
    nonprose_labels = []

    # partition the sents & labels into EITHER prose OR nonprose groups
    for i in range(len(tokenized_sents)):
        if is_prose_sentence(tokenized_sents[i]):
            prose_sents.append(tokenized_sents[i])
            if labels:
                prose_labels.append(labels[i])
        else:
            nonprose_sents.append(tokenized_sents[i])
            if labels:
                nonprose_labels.append(labels[i])

    # group data appropriately (note, labels might not be provided)
    if labels:
        prose = ( prose_sents, prose_labels)
        nonprose = (nonprose_sents, nonprose_labels)
    else:
        prose = ( prose_sents, None)
        nonprose = (nonprose_sents, None)

    return prose, nonprose
def print_files(f, file_names):
    '''
    print_files()

    Pretty formatting for listing the training files in a log,
    sorted and arranged four to a row.

    @param f.          An open file stream to write to.
    @param file_names. A list of filename strings.
    '''
    COLUMNS = 4
    names = sorted(file_names)
    for start in range(0, len(names), COLUMNS):
        write(f, u'\t\t')
        for featname in names[start:start+COLUMNS]:
            write(f, '%-15s' % featname)
        write(f, u'\n')



# python2 must convert to unicode explicitly; that's the default for python3
if sys.version_info.major == 2:
    tostr = unicode
else:
    tostr = str

def write(f, s):
    """Write s to stream f, coercing to the platform's text type first."""
    f.write(tostr(s))



def print_vec(f, label, vec):
    '''
    print_vec()

    Pretty formatting for displaying a vector of numbers in a log,
    seven to a row.

    @param f.     An open file stream to write to.
    @param label. A description of the numbers (e.g. "recall").
    @param vec.   A numpy array (or list) of the numbers to display.
    '''
    COLUMNS = 7
    write(f, '\t%-10s: ' % label)
    if type(vec) != type([]):
        vec = vec.tolist()
    for start in range(0, len(vec), COLUMNS):
        for value in vec[start:start+COLUMNS]:
            write(f, '%7.3f' % value)
        write(f, u'\n')



def print_str(f, label, names):
    '''
    print_str()
    Pretty formatting for displaying a list of strings in a log,
    four to a row; the label appears on the first row only.
    @param f.     An open file stream to write to.
    @param label. A description of the strings (e.g. "files").
    @param names. A list of strings.
    '''
    COLUMNS = 4
    for row_ind, start in enumerate(range(0, len(names), COLUMNS)):
        if row_ind == 0:
            write(f, '\t%-10s: ' % label)
        else:
            write(f, '\t%-10s ' % '')

        for featname in names[start:start+COLUMNS]:
            write(f, '%-16s ' % featname)

        write(f, u'\n')
def compute_performance_stats(label, pred, ref):
    '''
    compute_performance_stats()
    Compute per-tag P, R, and F1 for a given model on some data,
    along with a printable confusion matrix.
    @param label. A name for the data (e.g. "train" or "dev").
    @param pred.  A list of list of predicted label indices.
    @param ref.   A list of list of true label indices.
    @return dict with numpy arrays 'precision', 'recall', 'f1' (indexed
            by tag id) and 'conf' (confusion matrix rendered as a string).
    '''
    # tag ids are assumed to be dense 0..max over both sequences
    num_tags = max(set(sum(ref, [])) | set(sum(pred, []))) + 1

    # confusion[p, y]: predicted tag p when the gold tag was y
    confusion = np.zeros((num_tags, num_tags))
    for pred_seq, ref_seq in zip(pred, ref):
        for gold, guess in zip(ref_seq, pred_seq):
            confusion[guess, gold] += 1

    # render the confusion matrix (predictions down rows, gold across)
    header = ' ' * 6 + ''.join('%4d ' % i for i in range(num_tags)) + ' (gold)\n'
    body = ''.join(
        '%2d ' % i
        + ''.join('%4d ' % confusion[i][j] for j in range(num_tags))
        + '\n'
        for i in range(num_tags)
    )
    conf_str = '\n\n' + label + '\n' + header + body + '(pred)\n' + '\n\n'

    precision = np.zeros(num_tags)
    recall = np.zeros(num_tags)
    f1 = np.zeros(num_tags)

    # epsilon terms guard against division by zero for unseen tags
    for i in range(num_tags):
        correct = confusion[i, i]
        p = correct / (confusion[i, :].sum() + 1e-9)
        r = correct / (confusion[:, i].sum() + 1e-9)
        precision[i] = p
        recall[i] = r
        f1[i] = (2 * p * r) / (p + r + 1e-9)

    return {'precision': precision,
            'recall':    recall,
            'f1':        f1,
            'conf':      conf_str}
def main():
    """Command-line entry point: parse args, gather file pairs, train."""
    parser = argparse.ArgumentParser()

    parser.add_argument("--txt",
                        dest = "txt",
                        help = "The files that contain the training examples",
                       )
    parser.add_argument("--annotations",
                        dest = "con",
                        help = "The files that contain the labels for the training examples",
                       )
    parser.add_argument("--val-txt",
                        dest = "val_txt",
                        help = "The files that contain the validation examples",
                       )
    parser.add_argument("--val-annotations",
                        dest = "val_con",
                        help = "The files that contain the labels for the validation examples",
                       )
    parser.add_argument("--test-txt",
                        dest = "test_txt",
                        help = "The files that contain the test examples",
                       )
    parser.add_argument("--test-annotations",
                        dest = "test_con",
                        help = "The files that contain the labels for the test examples",
                       )
    parser.add_argument("--model",
                        dest = "model",
                        help = "Path to the model that should be generated",
                       )
    parser.add_argument("--log",
                        dest = "log",
                        help = "Path to the log file for training info",
                        default = os.path.join(CLINER_DIR, 'models', 'train.log')
                       )
    parser.add_argument("--use-lstm",
                        dest = "use_lstm",
                        help = "Whether to use an LSTM model",
                        action = 'store_true',
                        default = False
                       )
    parser.add_argument("--format",
                        dest = "format",
                        help = "Data format ( i2b2 )"
                       )

    # Parse the command line arguments
    args = parser.parse_args()

    # Error check: Ensure that file paths are specified
    if not args.txt:
        parser.print_help(sys.stderr)
        sys.stderr.write('\n\tError: Must provide text files\n')
        sys.stderr.write('\n')
        exit(1)
    if not args.con:
        parser.print_help(sys.stderr)
        sys.stderr.write('\n\tError: Must provide annotations for text files\n')
        sys.stderr.write('\n')
        exit(1)
    if not args.model:
        parser.print_help(sys.stderr)
        sys.stderr.write('\n\tError: Must provide valid path to store model\n')
        sys.stderr.write('\n')
        exit(1)
    modeldir = os.path.dirname(args.model)
    if (not os.path.exists(modeldir)) and (modeldir != ''):
        parser.print_help(sys.stderr)
        sys.stderr.write('\n\tError: Model dir does not exist: %s\n' % modeldir)
        sys.stderr.write('\n')
        exit(1)

    # A list of txt and concept file paths
    train_txt_files = glob.glob(args.txt)
    train_con_files = glob.glob(args.con)

    # Must specify output format
    if args.format not in ['i2b2']:
        # bug fix: this block used Python 2 "print >>sys.stderr" statements,
        # which are a SyntaxError under Python 3
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats: i2b2\n')
        sys.stderr.write('\n')
        exit(1)

    # Collect training data file paths, pairing .txt with .con by basename
    train_txt_files_map = tools.map_files(train_txt_files)
    train_con_files_map = tools.map_files(train_con_files)

    training_list = []
    for k in train_txt_files_map:
        if k in train_con_files_map:
            training_list.append((train_txt_files_map[k], train_con_files_map[k]))

    # If validation data was specified
    if args.val_txt and args.val_con:
        val_txt_files_map = tools.map_files(glob.glob(args.val_txt))
        val_con_files_map = tools.map_files(glob.glob(args.val_con))

        val_list = []
        for k in val_txt_files_map:
            if k in val_con_files_map:
                val_list.append((val_txt_files_map[k], val_con_files_map[k]))
    else:
        val_list = []

    # If test data was specified
    if args.test_txt and args.test_con:
        test_txt_files_map = tools.map_files(glob.glob(args.test_txt))
        test_con_files_map = tools.map_files(glob.glob(args.test_con))

        test_list = []
        for k in test_txt_files_map:
            if k in test_con_files_map:
                test_list.append((test_txt_files_map[k], test_con_files_map[k]))
    else:
        test_list = []

    # Train the model
    train(training_list, args.model, args.format, args.use_lstm,
          logfile=args.log, val=val_list, test=test_list)




def train(training_list, model_path, format, use_lstm, logfile=None, val=None, test=None):
    """
    Train a ClinerModel on (txt, con) file pairs and pickle it to disk.

    @param training_list. <list> of (text-file, concept-file) path pairs.
    @param model_path.    Where the pickled model is written.
    @param format.        Data format name (currently only 'i2b2').
    @param use_lstm.      Whether to build the LSTM variant of the model.
    @param logfile.       Optional path for the training log.
    @param val.           Optional <list> of validation (txt, con) pairs.
    @param test.          Optional <list> of test (txt, con) pairs.
    @return 1 when there are no training files, otherwise None.
    """
    # bug fix: avoid mutable default arguments (shared across calls)
    if val is None:
        val = []
    if test is None:
        test = []

    # Read the data into Document objects
    train_docs = [Document(txt, con) for txt, con in training_list]
    val_docs = [Document(txt, con) for txt, con in val]
    test_docs = [Document(txt, con) for txt, con in test]

    # file names
    if not train_docs:
        print('Error: Cannot train on 0 files. Terminating train.')
        return 1

    # Create a Machine Learning model
    model = ClinerModel(use_lstm)

    # Train the model using the Documents' data
    model.train(train_docs, val=val_docs, test=test_docs)

    # Pickle dump
    print('\nserializing model to %s\n' % model_path)
    with open(model_path, "wb") as m_file:
        pickle.dump(model, m_file)

    model.log(logfile, model_file=model_path)
    model.log(sys.stdout, model_file=model_path)
def compute_train_accuracy(epoche_adress):
    """
    Count correctly predicted tokens in a conll-style epoch output file.

    Each non-blank line is space-separated, with the gold label and the
    predicted label as the last two fields; a line counts as correct
    when those two fields match.

    @param epoche_adress. Path to the epoch output file.
    @return <int> number of tokens whose prediction equals the gold label.
    """
    correctly_predicted_tokens = 0
    # bug fix: context manager guarantees the handle is closed, even on error
    with open(epoche_adress, 'r') as f:
        for line in f:
            if line == "\n":
                continue
            elements = [x.strip("\n") for x in line.split(" ")]
            if elements[-1] == elements[-2]:
                correctly_predicted_tokens += 1
    return correctly_predicted_tokens



def predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths):
    """
    Run prediction over every dataset split that has a filepath.

    NOTE(review): the prediction_step call below passes arguments in a
    different order/arity than prediction_step's own definition -- confirm
    which signature is current before relying on this code path.

    @return (y_pred, y_true, output_filepaths), each a dict keyed by split.
    """
    y_pred = {}
    y_true = {}
    output_filepaths = {}
    for dataset_type in ['train', 'valid', 'test', 'deploy']:
        if dataset_type not in dataset_filepaths.keys():
            continue
        prediction_output = prediction_step(sess, dataset, dataset_type, model, transition_params_trained, stats_graph_folder, epoch_number, parameters, dataset_filepaths)
        y_pred[dataset_type], y_true[dataset_type], output_filepaths[dataset_type] = prediction_output
    return y_pred, y_true, output_filepaths



def train_step(sess, dataset, sequence_number, model):
    """
    Run one optimizer step on training sequence `sequence_number`.

    Infrequent tokens are randomly replaced by UNK half the time (simple
    UNK-training regularization); dropout keep prob is 0.5 during training.
    NOTE(review): this mutates dataset.token_indices['train'] in place --
    kept as-is to preserve existing behavior.

    @return (loss, accuracy, trained CRF transition parameters).
    """
    token_indices_sequence = dataset.token_indices['train'][sequence_number]

    for i, token_index in enumerate(token_indices_sequence):
        if token_index in dataset.infrequent_token_indices and np.random.uniform() < 0.5:
            token_indices_sequence[i] = dataset.token_to_index[dataset.UNK]

    feed_dict = {
        model.input_token_indices: token_indices_sequence,
        model.input_label_indices_vector: dataset.label_vector_indices['train'][sequence_number],
        model.input_token_character_indices: dataset.character_indices_padded['train'][sequence_number],
        model.input_token_lengths: dataset.token_lengths['train'][sequence_number],
        model.input_label_indices_flat: dataset.label_indices['train'][sequence_number],
        model.dropout_keep_prob: 0.5
    }
    _, _, loss, accuracy, transition_params_trained = sess.run(
        [model.train_op, model.global_step, model.loss, model.accuracy, model.transition_parameters],
        feed_dict)
    return loss, accuracy, transition_params_trained
def prediction_step(sess, dataset, dataset_type, model,epoch_number,results_folder,transition_params_trained,use_crf=True):
    '''
    Evaluate the model on one dataset split and log per-token results.

    Writes "<token> true <gold> <pred>" lines (one sentence per block) to an
    epoch file under results_folder, then scores that file with the conlleval
    perl script.

    NOTE(review): predict_labels() elsewhere in this file calls this function
    with a different argument order/arity than this signature -- confirm
    which call signature is the live one.

    @param sess.                      Open TensorFlow session.
    @param dataset.                   Dataset object holding indices/labels per split.
    @param dataset_type.              One of 'train', 'valid', 'test', 'deploy'.
    @param model.                     The EntityLSTM model holding the graph tensors.
    @param epoch_number.              Current epoch (used in output file names).
    @param results_folder.            Prefix for the epoch output files.
    @param transition_params_trained. CRF transition matrix for viterbi decoding.
    @param use_crf.                   Decode with CRF viterbi when True.
    @return  For 'deploy': the list of predicted label rows.
             Otherwise: (conll F1, list of predicted label rows).
    '''
    print('Evaluate model on the {0} set'.format(dataset_type))
    all_predictions = []
    all_y_true = []

    # one output file per split; opened in append mode so each epoch's lines
    # accumulate across calls
    store_at=results_folder+"epoche_"+str(epoch_number)+".txt"
    store_at_tes=results_folder+"train/epoche_"+str(epoch_number)+".txt"
    store_at_valid=results_folder+"valid/epoche_"+str(epoch_number)+".txt"

    f_store=open(store_at,'a')
    f_store_train=open(store_at_tes,'a')
    # NOTE(review): f_store_valid is never closed below -- confirm intended
    f_store_valid=open(store_at_valid,'a')

    prediction_list=[]

    for i in range(len(dataset.token_indices[dataset_type])):
        # dropout is disabled (keep prob 1) during evaluation
        feed_dict = {
            model.input_token_indices: dataset.token_indices[dataset_type][i],
            model.input_token_character_indices: dataset.character_indices_padded[dataset_type][i],
            model.input_token_lengths: dataset.token_lengths[dataset_type][i],
            model.input_label_indices_vector: dataset.label_vector_indices[dataset_type][i],
            model.input_label_indices_flat: dataset.label_indices[dataset_type][i],
            model.dropout_keep_prob: 1
        }
        unary_scores, predictions = sess.run([model.unary_scores, model.predictions], feed_dict)

        if use_crf==True:
            # viterbi decoding includes padding positions; strip first/last
            predictions, _ = tf.contrib.crf.viterbi_decode(unary_scores, transition_params_trained)
            predictions = predictions[1:-1]
        else:
            predictions = predictions.tolist() # NO CRF ON TOP

        assert(len(predictions) == len(dataset.tokens[dataset_type][i]))

        prediction_labels = [dataset.index_to_label[prediction] for prediction in predictions]
        gold_labels = dataset.labels[dataset_type][i]

        all_predictions.extend(predictions)
        all_y_true.extend(dataset.label_indices[dataset_type][i])

        prediction_list.append(prediction_labels)

        # NOTE(review): this compares the dataset OBJECT to the string
        # 'deploy' (always True); dataset_type was probably intended.
        # Harmless in practice because the writes below are gated on
        # dataset_type anyway -- confirm before changing.
        if dataset!='deploy':
            for prediction, token, gold_label in zip(prediction_labels, dataset.tokens[dataset_type][i], gold_labels):
                results=(token +" " + "true " + gold_label + " " +prediction)

                if dataset_type=="test":
                    f_store.write(results+ "\n")

                if dataset_type=="train":
                    f_store_train.write(results+ "\n")

                if dataset_type=="valid":
                    f_store_valid.write(results+ "\n")

        # blank line separates sentences in the conll-style output
        if dataset_type=="test":
            f_store.write("\n")
        if dataset_type=="train":
            f_store_train.write("\n")
        if dataset_type=="valid":
            f_store_valid.write("\n")

    # deploy has no gold labels, so no scoring: return raw predictions
    if dataset_type=='deploy':
        return prediction_list

    new_y_pred, new_y_true, new_label_indices, new_label_names, _, _ = remap_labels(all_predictions, all_y_true, dataset)

    f_store.close()
    f_store_train.close()

    ###### CONLL evaluation: score the epoch file with the perl script
    conll_evaluation_script = os.path.join('.', 'conlleval')
    conll_output_filepath = '{0}_conll_evaluation.txt'.format(store_at)
    shell_read=store_at

    if dataset_type=="train":
        print ("TRAIN")
        conll_output_filepath='{0}_conll_evaluation.txt'.format(store_at_tes)
        shell_read=store_at_tes

    if dataset_type=="valid":
        print("VALID")
        conll_output_filepath='{0}_conll_evaluation.txt'.format(store_at_valid)
        shell_read=store_at_valid

    shell_command = 'perl {0} < {1} > {2}'.format(conll_evaluation_script, shell_read, conll_output_filepath)
    os.system(shell_command)
    conll_parsed_output = hd.get_parsed_conll_output(conll_output_filepath)

    return (conll_parsed_output['all']['f1']),prediction_list
def variable_summaries(var):
    '''
    Attach a lot of summaries to a Tensor (for TensorBoard visualization).
    From https://www.tensorflow.org/get_started/summaries_and_tensorboard

    Records scalar summaries for the mean, standard deviation, max, and
    min of `var`, plus a full histogram, all under the 'summaries' scope.
    '''
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        # stddev computed in its own sub-scope, matching the TF tutorial
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)

def resize_tensor_variable(sess, tensor_variable, shape):
    # Assign a zero tensor of the new shape; validate_shape=False is what
    # allows the variable's shape to change.
    # NOTE(review): this also zeroes the variable's contents -- it does not
    # preserve existing values.
    sess.run(tf.assign(tensor_variable, tf.zeros(shape), validate_shape=False))
4:3||t="problem" +c="fall" 4:7 4:7||t="problem" +c="traumatic arthritis" 5:2 5:3||t="problem" +c="hypertension" 6:2 6:2||t="problem" +c="recurrent urinary tract infection" 7:4 7:7||t="problem" +c="renal carcinoma" 8:4 8:5||t="problem" +c="chronic obstructive pulmonary disease" 10:2 10:5||t="problem" +c="previous stroke" 13:15 13:16||t="problem" +c="hypertension" 13:18 13:18||t="problem" +c="copd" 13:20 13:20||t="problem" +c="renal carcinoma" 13:24 13:25||t="problem" +c="a fall" 13:29 13:30||t="problem" +c="syncope" 13:33 13:33||t="problem" +c="her fall" 15:0 15:1||t="test" +c="any loss of consciousness" 15:12 15:15||t="problem" +c="previous falls" 16:7 16:8||t="problem" +c="a hip fracture" 16:15 16:17||t="problem" +c="physical therapy" 17:3 17:4||t="treatment" +c="initial examination" 18:0 18:1||t="test" +c="examination" 18:11 18:11||t="test" +c="evaluation" 19:5 19:5||t="test" +c="her fall" 19:7 19:8||t="treatment" +c="syncope" 19:13 19:13||t="problem" +c="her positive histories" 19:18 19:20||t="problem" +c="diagnostic studies" 20:0 20:1||t="test" +c="cervical spine" 20:15 20:16||t="test" +c="acute fractures" 20:19 20:20||t="problem" +c="old healed left humeral head and neck fracture" 21:5 21:12||t="problem" +c="baseline anterior dislocation" 21:14 21:16||t="treatment" +c="ct of the brain" 22:0 22:3||t="test" +c="acute changes" 22:6 22:7||t="problem" +c="left periorbital soft tissue swelling" 22:9 22:13||t="problem" +c="ct of the maxillofacial area" 23:0 23:4||t="test" +c="facial bone fracture" 23:7 23:9||t="problem" +c="echocardiogram" 24:0 24:0||t="test" +c="ejection fraction" 24:7 24:8||t="test" +c="syncopal episode" 27:10 27:11||t="problem" +c="echocardiogram" 28:0 28:0||t="test" +c="her orthostatic blood pressures" 28:11 28:14||t="test" +c="traumatic injury of her knee" 31:15 31:19||t="problem" +c="significant pain" 31:22 31:23||t="problem" +c="swelling" 31:25 31:25||t="problem" +c="a scan" 32:1 32:2||t="test" +c="acute fractures" 32:5 32:6||t="problem" 
+c="daily physical therapy" 33:23 33:25||t="treatment" +c="rehabilitation" 33:27 33:27||t="treatment" +c="rehabilitation" 35:6 35:6||t="treatment" +c="colace" 39:1 39:1||t="treatment" +c="zestril" 41:4 41:4||t="treatment" +c="plavix" 42:2 42:2||t="treatment" +c="norvasc" 43:2 43:2||t="treatment" +c="hydrochlorothiazide" 44:2 44:2||t="treatment" +c="potassium chloride" 45:2 45:3||t="treatment" +c="atrovent inhaler" 46:2 46:3||t="treatment" +c="albuterol inhaler" 46:8 46:9||t="treatment" +c="clonidine" 47:2 47:2||t="treatment" +c="cardura" 48:2 48:2||t="treatment" +c="prophylaxis" 49:5 49:5||t="treatment" diff --git a/data/examples/ex_doc.txt b/data/examples/ex_doc.txt new file mode 100644 index 0000000..4084645 --- /dev/null +++ b/data/examples/ex_doc.txt @@ -0,0 +1,55 @@ +DATE OF ADMISSION : MM/DD/YYYY +DATE OF DISCHARGE : MM/DD/YYYY +DISCHARGE DIAGNOSES : +1 . Vasovagal syncope , status post fall . +2 . Traumatic arthritis , right knee . +3 . Hypertension . +4 . History of recurrent urinary tract infection . +5 . History of renal carcinoma , stable . +6 . +History of chronic obstructive pulmonary disease . +CONSULTANTS : None . +PROCEDURES : None . +BRIEF HISTORY : The patient is an ( XX ) -year-old female with history of previous stroke ; hypertension ; COPD , stable ; renal carcinoma ; presenting after a fall and possible syncope . +While walking , she accidentally fell to her knees and did hit her head on the ground , near her left eye . +Her fall was not observed , but the patient does not profess any loss of consciousness , recalling the entire event . +The patient does have a history of previous falls , one of which resulted in a hip fracture . +She has had physical therapy and recovered completely from that . +Initial examination showed bruising around the left eye , normal lung examination , normal heart examination , normal neurologic function with a baseline decreased mobility of her left arm . 
+The patient was admitted for evaluation of her fall and to rule out syncope and possible stroke with her positive histories . +DIAGNOSTIC STUDIES : All x-rays including left foot , right knee , left shoulder and cervical spine showed no acute fractures . +The left shoulder did show old healed left humeral head and neck fracture with baseline anterior dislocation . +CT of the brain showed no acute changes , left periorbital soft tissue swelling . +CT of the maxillofacial area showed no facial bone fracture . +Echocardiogram showed normal left ventricular function , ejection fraction estimated greater than 65 % . +HOSPITAL COURSE : +1 . +Fall : The patient was admitted and ruled out for syncopal episode . +Echocardiogram was normal , and when the patient was able , her orthostatic blood pressures were within normal limits . +Any serious conditions were quickly ruled out . +2 . +Status post fall with trauma : The patient was unable to walk normally secondary to traumatic injury of her knee , causing significant pain and swelling . +Although a scan showed no acute fractures , the patients frail status and previous use of cane prevented her regular abilities . +She was set up with a skilled nursing facility , which took several days to arrange , where she was to be given daily physical therapy and rehabilitation until appropriate for her previous residence . +DISCHARGE DISPOSITION : Discharged to skilled nursing facility . +ACTIVITY : Per physical therapy and rehabilitation . +DIET : General cardiac . +MEDICATIONS : Darvocet-N 100 one tablet p.o . +q.4-6 h. p.r.n . +and Colace 100 mg p.o . +b.i.d . +Medications at Home : Zestril 40 mg p.o . +daily , Plavix 75 mg p.o . +daily , Norvasc 5 mg p.o . +daily , hydrochlorothiazide 50 mg p.o . +daily , potassium chloride 40 mEq p.o . +daily , Atrovent inhaler 2 puffs q.i.d. , albuterol inhaler 2 puffs q.4-6 h . +p.r.n. , clonidine 0.1 mg p.o . +b.i.d. , Cardura 2 mg p.o . +daily , and Macrobid for prophylaxis , 100 mg p.o . 
+daily . +FOLLOWUP : +1 . +Follow up per skilled nursing facility until discharged to regular residence . +2 . +Follow up with primary provider within 2-3 weeks on arriving to home . diff --git a/data/tmp/.gitignore b/data/tmp/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/data/tmp/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index 1005e26..0000000 --- a/docs/Makefile +++ /dev/null @@ -1,177 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = _build - -# User-friendly check for sphinx-build -ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) -$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -endif - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
- -.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext - -help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - -clean: - rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
- -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/complexity.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/complexity.qhc" - -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/complexity" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/complexity" - @echo "# devhelp" - -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." 
- $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." - -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
\ No newline at end of file diff --git a/docs/authors.rst b/docs/authors.rst deleted file mode 100644 index 94292d0..0000000 --- a/docs/authors.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../AUTHORS.rst \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100755 index 9cb4440..0000000 --- a/docs/conf.py +++ /dev/null @@ -1,275 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# complexity documentation build configuration file, created by -# sphinx-quickstart on Tue Jul 9 22:26:36 2013. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys -import os - -# If extensions (or modules to document with autodoc) are in another -# directory, add these directories to sys.path here. If the directory is -# relative to the documentation root, use os.path.abspath to make it -# absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) - -# Get the project root dir, which is the parent dir of this -cwd = os.getcwd() -project_root = os.path.dirname(cwd) - -# Insert the project root dir as the first element in the PYTHONPATH. -# This lets us ensure that the source package is imported, and that its -# version is used. -sys.path.insert(0, project_root) - -import clicon - -# -- General configuration --------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] - -# Add any paths that contain templates here, relative to this directory. 
-templates_path = ['_templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'CliCon' -copyright = u'2014, Tristan Naumann' - -# The version info for the project you're documenting, acts as replacement -# for |version| and |release|, also used in various other places throughout -# the built documents. -# -# The short X.Y version. -version = clicon.__version__ -# The full version, including alpha/beta/rc tags. -release = clicon.__version__ - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -#language = None - -# There are two options for replacing |today|: either, you set today to -# some non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = ['_build'] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built -# documents. 
-#keep_warnings = False - - -# -- Options for HTML output ------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'default' - -# Theme options are theme-specific and customize the look and feel of a -# theme further. For a list of options available for each theme, see the -# documentation. -#html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as -# html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the -# top of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon -# of the docs. This file should be a Windows icon file (.ico) being -# 16x16 or 32x32 pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) -# here, relative to this directory. They are copied after the builtin -# static files, so a file named "default.css" will overwrite the builtin -# "default.css". -html_static_path = ['_static'] - -# If not '', a 'Last updated on:' timestamp is inserted at every page -# bottom, using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names -# to template names. -#html_additional_pages = {} - -# If false, no module index is generated. -#html_domain_indices = True - -# If false, no index is generated. 
-#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. -# Default is True. -#html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. -# Default is True. -#html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages -# will contain a tag referring to it. The value of this option -# must be the base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None - -# Output file base name for HTML help builder. -htmlhelp_basename = 'clicondoc' - - -# -- Options for LaTeX output ------------------------------------------ - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - #'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - #'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - #'preamble': '', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass -# [howto/manual]). -latex_documents = [ - ('index', 'clicon.tex', - u'CliCon Documentation', - u'Tristan Naumann', 'manual'), -] - -# The name of an image file (relative to this directory) to place at -# the top of the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings -# are parts, not chapters. -#latex_use_parts = False - -# If true, show page references after internal links. -#latex_show_pagerefs = False - -# If true, show URL addresses after external links. -#latex_show_urls = False - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. 
-#latex_domain_indices = True - - -# -- Options for manual page output ------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'clicon', - u'CliCon Documentation', - [u'Tristan Naumann'], 1) -] - -# If true, show URL addresses after external links. -#man_show_urls = False - - -# -- Options for Texinfo output ---------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ('index', 'clicon', - u'CliCon Documentation', - u'Tristan Naumann', - 'clicon', - 'One line description of project.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -#texinfo_appendices = [] - -# If false, no module index is generated. -#texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False \ No newline at end of file diff --git a/docs/contributing.rst b/docs/contributing.rst deleted file mode 100644 index 3bdd7dc..0000000 --- a/docs/contributing.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../CONTRIBUTING.rst \ No newline at end of file diff --git a/docs/history.rst b/docs/history.rst deleted file mode 100644 index bec23d8..0000000 --- a/docs/history.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../HISTORY.rst \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index e9f5fb5..0000000 --- a/docs/index.rst +++ /dev/null @@ -1,24 +0,0 @@ -.. complexity documentation master file, created by - sphinx-quickstart on Tue Jul 9 22:26:36 2013. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. 
- -Welcome to CliCon's documentation! -====================================== - -Contents: - -.. toctree:: - :maxdepth: 2 - - readme - contributing - authors - history - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 2b44764..0000000 --- a/docs/make.bat +++ /dev/null @@ -1,242 +0,0 @@ -@ECHO OFF - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set BUILDDIR=_build -set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . -set I18NSPHINXOPTS=%SPHINXOPTS% . -if NOT "%PAPER%" == "" ( - set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% - set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% -) - -if "%1" == "" goto help - -if "%1" == "help" ( - :help - echo.Please use `make ^` where ^ is one of - echo. html to make standalone HTML files - echo. dirhtml to make HTML files named index.html in directories - echo. singlehtml to make a single large HTML file - echo. pickle to make pickle files - echo. json to make JSON files - echo. htmlhelp to make HTML files and a HTML help project - echo. qthelp to make HTML files and a qthelp project - echo. devhelp to make HTML files and a Devhelp project - echo. epub to make an epub - echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter - echo. text to make text files - echo. man to make manual pages - echo. texinfo to make Texinfo files - echo. gettext to make PO message catalogs - echo. changes to make an overview over all changed/added/deprecated items - echo. xml to make Docutils-native XML files - echo. pseudoxml to make pseudoxml-XML files for display purposes - echo. linkcheck to check all external links for integrity - echo. 
doctest to run all doctests embedded in the documentation if enabled - goto end -) - -if "%1" == "clean" ( - for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i - del /q /s %BUILDDIR%\* - goto end -) - - -%SPHINXBUILD% 2> nul -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "html" ( - %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/html. - goto end -) - -if "%1" == "dirhtml" ( - %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. - goto end -) - -if "%1" == "singlehtml" ( - %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. - goto end -) - -if "%1" == "pickle" ( - %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can process the pickle files. - goto end -) - -if "%1" == "json" ( - %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can process the JSON files. - goto end -) - -if "%1" == "htmlhelp" ( - %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can run HTML Help Workshop with the ^ -.hhp project file in %BUILDDIR%/htmlhelp. - goto end -) - -if "%1" == "qthelp" ( - %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp - if errorlevel 1 exit /b 1 - echo. 
- echo.Build finished; now you can run "qcollectiongenerator" with the ^ -.qhcp project file in %BUILDDIR%/qthelp, like this: - echo.^> qcollectiongenerator %BUILDDIR%\qthelp\complexity.qhcp - echo.To view the help file: - echo.^> assistant -collectionFile %BUILDDIR%\qthelp\complexity.ghc - goto end -) - -if "%1" == "devhelp" ( - %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. - goto end -) - -if "%1" == "epub" ( - %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The epub file is in %BUILDDIR%/epub. - goto end -) - -if "%1" == "latex" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "latexpdf" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - cd %BUILDDIR%/latex - make all-pdf - cd %BUILDDIR%/.. - echo. - echo.Build finished; the PDF files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "latexpdfja" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - cd %BUILDDIR%/latex - make all-pdf-ja - cd %BUILDDIR%/.. - echo. - echo.Build finished; the PDF files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "text" ( - %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The text files are in %BUILDDIR%/text. - goto end -) - -if "%1" == "man" ( - %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The manual pages are in %BUILDDIR%/man. - goto end -) - -if "%1" == "texinfo" ( - %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 
- goto end -) - -if "%1" == "gettext" ( - %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The message catalogs are in %BUILDDIR%/locale. - goto end -) - -if "%1" == "changes" ( - %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes - if errorlevel 1 exit /b 1 - echo. - echo.The overview file is in %BUILDDIR%/changes. - goto end -) - -if "%1" == "linkcheck" ( - %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck - if errorlevel 1 exit /b 1 - echo. - echo.Link check complete; look for any errors in the above output ^ -or in %BUILDDIR%/linkcheck/output.txt. - goto end -) - -if "%1" == "doctest" ( - %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest - if errorlevel 1 exit /b 1 - echo. - echo.Testing of doctests in the sources finished, look at the ^ -results in %BUILDDIR%/doctest/output.txt. - goto end -) - -if "%1" == "xml" ( - %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The XML files are in %BUILDDIR%/xml. - goto end -) - -if "%1" == "pseudoxml" ( - %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. - goto end -) - -:end \ No newline at end of file diff --git a/docs/readme.rst b/docs/readme.rst deleted file mode 100644 index 6b2b3ec..0000000 --- a/docs/readme.rst +++ /dev/null @@ -1 +0,0 @@ -.. 
include:: ../README.rst \ No newline at end of file diff --git a/examples/demo.sh b/examples/demo.sh deleted file mode 100644 index a714278..0000000 --- a/examples/demo.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/sh - - -# Train model on an xml-formatted file -# Note, 'examples' directory only has one xml file (but clicon accepts globs) -clicon train "$CLICON_DIR/examples/*.xml" --format xml - - -# Use trained model to predict concepts for a given txt file -# Note, 'examples' directory only has one txt file (but clicon accepts globs) -clicon predict "$CLICON_DIR/examples/*.txt" --out $CLICON_DIR/data/test_predictions/ --format xml - - -# Evaluate how well the system classified. -# Note, in this case it is 100% because it trained/predicted on the same file. -clicon evaluate "$CLICON_DIR/examples/pretend.txt" --gold $CLICON_DIR/examples --predictions $CLICON_DIR/data/test_predictions/ --format xml diff --git a/examples/pretend.con b/examples/pretend.con deleted file mode 100644 index 40a5cbc..0000000 --- a/examples/pretend.con +++ /dev/null @@ -1,6 +0,0 @@ -c="casey at the bat" 2:0 2:3||t="treatment" -c="the score stood four to two" 8:0 8:5||t="problem" -c="cooney died at first" 9:3 9:6||t="problem" -c="casey at the bat" 14:9 14:12||t="treatment" -c="the former was a lulu" 16:1 16:5||t="problem" -c="the latter was a cake" 16:7 16:11||t="problem" diff --git a/examples/pretend.txt b/examples/pretend.txt deleted file mode 100644 index c96246c..0000000 --- a/examples/pretend.txt +++ /dev/null @@ -1,18 +0,0 @@ -Title : -Casey at the Bat -Author : -Ernest Thayer -Published : -June 3 , 1888 -The outlook wasn't brilliant for the Mudville Nine that day ; -The score stood four to two , with but one inning more to play , -And then when Cooney died at first , and Barrows did the same , -A sickly silence fell upon the patrons of the game . -A straggling few got up to go in deep despair . 
The rest -Clung to that hope which springs eternal in the human breast ; -They thought , if only Casey could get a whack at that - -They'd put up even money , now , with Casey at the bat . -But Flynn preceded Casey , as did also Jimmy Blake . -And the former was a lulu and the latter was a cake ; -So upon that stricken multitude grim melancholy sat , -For there semmed but little chance of Casey's getting to the bat . diff --git a/examples/pretend.xml b/examples/pretend.xml deleted file mode 100644 index ee2096d..0000000 --- a/examples/pretend.xml +++ /dev/null @@ -1,18 +0,0 @@ -Title : - Casey at the Bat -Author : -Ernest Thayer -Published : -June 3 , 1888 -The outlook wasn't brilliant for the Mudville Nine that day ; - The score stood four to two , with but one inning more to play , -And then when Cooney died at first , and Barrows did the same , -A sickly silence fell upon the patrons of the game . -A straggling few got up to go in deep despair . The rest -Clung to that hope which springs eternal in the human breast ; -They thought , if only Casey could get a whack at that - -They'd put up even money , now , with Casey at the bat . -But Flynn preceded Casey , as did also Jimmy Blake . -And the former was a lulu and the latter was a cake ; -So upon that stricken multitude grim melancholy sat , -For there semmed but little chance of Casey's getting to the bat. diff --git a/install.sh b/install.sh deleted file mode 100755 index 55cfd4a..0000000 --- a/install.sh +++ /dev/null @@ -1,151 +0,0 @@ -# -# install.sh -# -# Purpose: This is a demo that will install CliCon and it's package dependencies -# -# Note: This does not download/install: -# 1) i2b2 data -# 2) UMLS tables -# - - -function install_python_dependencies { - - modules=(nltk python-crfsuite nose numpy scipy scikit-learn) - for m in ${modules[@]} ; do - - #echo -e "\n\nmodule: $m\n\n" - - # Install module if necessary - python $CLICON_DIR/clicon/is_installed.py $m - if [[ $? 
!= 0 ]] ; then - echo "installing $m" - pip install -U $m &>> $log - echo -e "$m installation complete\n" - fi - - done - - # Install nltk data - echo "downloading nltk data" - python -m nltk.downloader maxent_treebank_pos_tagger wordnet punkt &>> $log - echo -e "nltk download complete\n" - -} - - - -function get_genia { - # save current path - old_path=$(pwd) - - # Get sources - cd $CLICON_DIR/clicon/features_dir/genia - wget http://www.nactem.ac.uk/tsujii/GENIA/tagger/geniatagger-3.0.1.tar.gz &>> $log - tar xzvf geniatagger-3.0.1.tar.gz &>> $log - rm geniatagger-3.0.1.tar.gz - - # Build GENIA tagger - cd geniatagger-3.0.1/ - echo "$(sed '1i#include ' morph.cpp)" > morph.cpp # fix build error - echo "building GENIA tagger" - make &>> $log - echo -e "GENIA tagger built\n" - - # Successful build ? - if ! [[ $? -eq 0 ]] ; then - echo "there was a build error in GENIA" - return - fi - - # Set config file location of tagger - if [[ ! -f "$CLICON_DIR/config.txt" ]] ; then - echo -e "\tWarning: Could not update config.txt because CLICON_DIR must be an absolute path\n" - cd $old_path - return - fi - config_file="$CLICON_DIR/config.txt" - out_tmp="out.tmp.txt" - echo "GENIA $(pwd)/geniatagger" > $out_tmp - while read line ; do - if ! [[ $line = GENIA* ]] ; then - echo $line >> $out_tmp - fi - done < "$config_file" - mv $out_tmp $config_file - - # return to original path - cd $old_path -} - - - -# Ensure resources are available -which g++ gfortran virtualenv pip &> /dev/null -resources=$? 
-if [[ $resources -eq 0 ]] ; then - - - # CLICON_DIR must be defined before proceeding - if [[ "$CLICON_DIR" = "" ]] ; then - - echo -e "\n\tYou must define the CLICON_DIR evironment variable to run this script" - echo -e "\tRecommendation: 'cd' to the directory containing this script and execute 'export CLICON_DIR=\$(pwd)'\n" - - else - - # Installation log - log="$CLICON_DIR/installation_log.txt" - - - # Create virtual environment - echo "creating virtual environment" - virtualenv venv_clicon --system-site-packages &>> $log - source venv_clicon/bin/activate - echo -e "virtual environment enabled\n" - - - # Install python dependencies - install_python_dependencies - - - # Download & install GENIA tagger - get_genia - - - # Install 'clicon' script for command line usage - setup_output="setup_output.txt" - echo "Building executable 'clicon' script" - python setup.py install &> $setup_output - success=$? - echo -e "'clicon' script built\n" - - - # Successful - if [[ $success == 0 ]] ; then - echo "CliCon successfully installed" - else - echo -e "CliCon installation failure\n" - echo "---------------------FAILURE-------------------------" - cat $setup_output - echo "-----------------------------------------------------" - fi - - - cat $setup_output >> $log - rm $setup_output - - fi - -else - - echo -e "\n\tError: Not all resources available on system." 
- echo -e "\nPlease ensure the following packages are installed:" - - packages=(g++ gfortran python-dev python-pip python-virtualenv libopenblas-dev liblapack-dev) - for p in ${packages[@]} ; do - echo -e "\t$p" - done - echo "" - -fi diff --git a/requirements.txt b/requirements.txt index edcc87e..9b83111 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,7 @@ -wheel==0.23.0 \ No newline at end of file +nltk +python-crfsuite +numpy +scipy +scikit-learn +marisa-trie +repoze.lru diff --git a/run.sh b/run.sh deleted file mode 100755 index b3720f2..0000000 --- a/run.sh +++ /dev/null @@ -1,5 +0,0 @@ -file="00098-016139" - -#python clicon/train.py -t test/text/$file.text -c test/pipe/$file.pipe -f semeval - -python clicon/predict.py -i test/text/$file.text -f semeval diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 5e40900..0000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[wheel] -universal = 1 diff --git a/setup.py b/setup.py deleted file mode 100755 index c5e6cbc..0000000 --- a/setup.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import os -import sys - - -try: - from setuptools import setup -except ImportError: - from distutils.core import setup - - -readme = open('README.rst').read() -history = open('HISTORY.rst').read().replace('.. 
:changelog:', '') - -requirements = [ - # TODO: put package requirements here - 'click', - 'nltk', - 'scikit-learn', -] - -test_requirements = [ - # TODO: put package test requirements here -] - -setup( - name='clicon', - version='0.1dev', - description='A tool for clinical concept extraction.', - long_description=readme + '\n\n' + history, - url='https://github.com/mitmedg/CliCon', - packages=[ - 'clicon', - ], - package_dir={'clicon': - 'clicon'}, - include_package_data=True, - install_requires=requirements, - zip_safe=False, - keywords='clicon', - classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'Intended Audience :: Developers', - 'Natural Language :: English', - "Programming Language :: Python :: 2", - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', - ], - test_suite='tests', - tests_require=test_requirements, - entry_points=''' - [console_scripts] - clicon=clicon.cli:clicon - ''', -) \ No newline at end of file diff --git a/tests/data/single.con b/tests/data/single.con new file mode 100644 index 0000000..d522eaf --- /dev/null +++ b/tests/data/single.con @@ -0,0 +1 @@ +c="the score stood four to two" 1:0 1:5||t="problem" diff --git a/tests/data/single.txt b/tests/data/single.txt new file mode 100644 index 0000000..d4ff891 --- /dev/null +++ b/tests/data/single.txt @@ -0,0 +1 @@ +The score stood four to two , with but one inning more to play , diff --git a/tests/test_features_dir.py b/tests/test_features_dir.py new file mode 100644 index 0000000..b45f932 --- /dev/null +++ b/tests/test_features_dir.py @@ -0,0 +1,25 @@ + + +if __name__ == '__main__': + import doctest + + import os, sys + home = os.path.join( os.getenv('CLINER_DIR') , 'cliner' ) + if home not in sys.path: sys.path.append(home) + + #from features_dir import * + + import features_dir.features + doctest.testmod(features_dir.features) 
+ + import features_dir.read_config + doctest.testmod(features_dir.read_config) + + import features_dir.sentence_features + doctest.testmod(features_dir.sentence_features) + + import features_dir.utilities + doctest.testmod(features_dir.utilities) + + import features_dir.word_features + doctest.testmod(features_dir.word_features) diff --git a/tools/i2b2va-eval.jar b/tools/i2b2va-eval.jar new file mode 100644 index 0000000..1054d46 Binary files /dev/null and b/tools/i2b2va-eval.jar differ diff --git a/tools/py2_maxent_treebank_pos_tagger.pickle b/tools/py2_maxent_treebank_pos_tagger.pickle new file mode 100644 index 0000000..ba97523 Binary files /dev/null and b/tools/py2_maxent_treebank_pos_tagger.pickle differ diff --git a/tools/py3_maxent_treebank_pos_tagger.pickle b/tools/py3_maxent_treebank_pos_tagger.pickle new file mode 100644 index 0000000..d785432 Binary files /dev/null and b/tools/py3_maxent_treebank_pos_tagger.pickle differ diff --git a/tools/tok.py b/tools/tok.py new file mode 100644 index 0000000..dbcbaae --- /dev/null +++ b/tools/tok.py @@ -0,0 +1,82 @@ +import sys +import re +import nltk + + +def main(): + file_tokenize(sys.argv[1], sys.argv[2]) + + +def file_tokenize(filename, outfile): + toks = tokenize(filename) + with open(outfile, 'w') as f: + for sent in toks: + print >>f, ' '.join(sent) + + +def tokenize(filename): + with open(filename, 'r') as f: + text = f.read().strip() + text = clean_text(text) + + text = re.sub('\n\n+', '\n\n', text) + + # remove PHI + phis = re.findall('(\[\*\*.*?\*\*\])', text) + for phi in phis: + new = replace_phi(text, phi) + text = text.replace(phi, new) + + # break into sentences + sections = text.split('\n\n') + sents = [] + for section in sections: + # remove leading section lines + if '\n' in section: + index = section.index('\n') + else: + index = 500 + while index < 60: + # add that line to all sents + line = section[:index] + sents.append(line) + + section = section[index+1:] + if '\n' in section: + index = 
section.index('\n') + else: + index = 500 + + s_toks = nltk.sent_tokenize(section) + sents += s_toks + + # break into words + word_toks = [] + for sent in sents: + w_toks = nltk.word_tokenize(sent) + word_toks.append(w_toks) + + return word_toks + + +def clean_text(text): + try: + return text.decode('ascii', 'ignore') + except UnicodeDecodeError, e: + chars = [] + for c in text: + try: + c.decode('ascii', 'ignore') + chars.append(c) + except UnicodeDecodeError, f: + pass + return ''.join(chars) + + +def replace_phi(text, phi): + return '__phi__' + + +if __name__ == '__main__': + main() + diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 3954c7f..0000000 --- a/tox.ini +++ /dev/null @@ -1,9 +0,0 @@ -[tox] -envlist = py26, py27, py33, py34 - -[testenv] -setenv = - PYTHONPATH = {toxinidir}:{toxinidir}/clicon -commands = python setup.py test -deps = - -r{toxinidir}/requirements.txt \ No newline at end of file