diff --git a/.gitignore b/.gitignore index 5bd7a5b0..a7d599a5 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ database/network-database/script-results database/network-database/source-files database/expression-database/script-results database/expression-database/source-files +database2/network-database/script-results npm-debug.log node_modules @@ -82,3 +83,6 @@ Backup of *.doc* # Misc .env + +# Ignore Python __pycache__ folder +__pycache__/ \ No newline at end of file diff --git a/database2/README.md b/database2/README.md new file mode 100644 index 00000000..1e02eff9 --- /dev/null +++ b/database2/README.md @@ -0,0 +1,165 @@ +# GRNsight Database + +Here are the instructions for setting up the database for GRNsight. + +## Setting up a local postgres GRNsight Database + +### Installing PostgreSQL on your computer + +- MacOS and Windows users can follow these instructions to install PostgreSQL. + +- Install the software at this [link](https://www.postgresql.org/download/) +- > MacOS users: It is recommended to install with homebrew rather than the interactive installation in order to correctly view the `initdb --locale=C -E UTF-8 location-of-cluster` message in the documentation. +- > Windows users: when prompted for a password at the end of the installation process, save this password. It is the password for the postgres user. +- Initialize the database + - If your terminal emitted a message that looks like `initdb --locale=C -E UTF-8 location-of-cluster` during the installation step above, then your installer has initialized a database for you. + - Otherwise, open the terminal and type the command `initdb --locale=C -E UTF-8 location-of-cluster` + - "Cluster" is the PostgreSQL term for the file structure of a PostgreSQL database instance + - Replace location-of-cluster with the path where you want the database to be stored (you don't need to create the folder yourself; the command creates it for you, so just choose the name) +- Start and stop the server + - Additionally, your installer may have started the server for you upon installation. + - To start the server yourself, run `pg_ctl start -D location-of-cluster` (you can save this command for later reuse). + - To stop the server, run `pg_ctl stop -D location-of-cluster`. + - After installing with homebrew on MacOS, you may receive an error that the server is unable to be started when you try to start it, and when attempting to stop the server, the terminal states there is no server running. In this case, you have to directly kill the process that is using the port the server is running on. + - To double check that this is the issue, you can open the Activity Monitor app on your computer and search for the `postgres` activity. If there is one, that means the server is running, and we have to terminate the process bound to the port that the server is running on. + - First, we have to check what port the server is running on. Navigate to your homebrew installation, which is the same `location-of-cluster` from when the database was initialized, and open that location in VSCode. + - Search for `port =` in the file `postgresql.conf`. By default, the port should be 5432, but take note of it in case it is different.
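+    - For example, on MacOS or Linux you can usually find and stop the process yourself; a minimal sketch, assuming the default port 5432 and that `lsof` is available (replace `<PID>` with the process ID reported by `lsof`):
+      ```
+      lsof -i :5432
+      kill <PID>
+      ```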
    - Refer to this Stack Overflow documentation on how to kill a server: - https://stackoverflow.com/questions/4075287/node-express-eaddrinuse-address-already-in-use-kill-server - If that doesn't work, then refer to the different methods on this link from Stack Overflow: - https://stackoverflow.com/questions/42416527/postgres-app-port-in-use + +- Linux users + + - The MacOS and Windows instructions will _probably_ not work for you. You can try at your own risk to check. + - Linux users can try these [instructions](https://www.geeksforgeeks.org/install-postgresql-on-linux/) and that should work for you (...maybe...). If it doesn't, try googling instructions for your specific operating system. Sorry! + +### Loading data to your database + +## 1. Set Up Database Schema + +For detailed instructions on setting up the database schema, refer to the `README.md` file located in the `schema` folder. + +## 2. Install Python Dependencies + +GRNsight generates network data (gene regulatory network and protein-protein interactions) from SGD through AllianceMine. In order to run the script that generates these network files, you must pip3 install the dependencies used. If you get an error saying that a module doesn't exist, just run `pip3 install <missing-module-name>` and it should fix the error. If the error persists and is found in a specific file on your machine, you might have to manually go into that file and adjust how the dependencies are imported. _Note: So far this issue has only occurred on Ubuntu 22.04.1 and certain MacOS versions, so you might be lucky and not have to do it!_ + +``` +pip3 install pandas requests intermine tzlocal psycopg2 +``` + +### 3. Populate Data into Database + +#### 1. Expression Database + +##### Step 1: Create a directory (aka folder) in the `database/expression-database` folder called `source-files` + + ``` + mkdir <path to database/expression-database>/source-files + ``` + +##### Step 2: Download Expression Data + +Download the _"Expression 2020"_ folder from Box located in `GRNsight > GRNsight Expression > Expression 2020` to your newly created `source-files` folder. Your path should look like this: GRNsight > database > expression-database > source-files > Expression 2020 > [the actual csv and xlsx files are here!] + +##### Step 3: Run the Pre-Processing script + +Run the pre-processing script on the data. This will create a folder full of the processed files in `database/expression-database/script-results`. + +``` +cd <path to database/expression-database>/scripts +python3 preprocessing.py +``` + +**Note:** If you receive a UnicodeEncodeError, add `-X utf8` right after `python3` (i.e., `python3 -X utf8 preprocessing.py`) + +##### Step 4: Load the Processed Data into the Database + +Use the `loader.py` script located in `expression-database/scripts` to load the processed expression data into the database. This script generates SQL statements to populate your relational database with the processed data. + +- To move to `expression-database/scripts` + + ``` + cd <path to database/expression-database>/scripts + ``` + +- To load to a local database + + ``` + python3 loader.py | psql postgresql://localhost/postgres + ``` + +- To load to the production database + ``` + python3 loader.py | psql <production database URL> + ``` + +For more details, refer to the `README.md` inside the `expression-database` folder. + +#### 2. Network Database for GRN (Gene Regulatory Network) and PPI (Protein-Protein Interactions) + +The code for generating and populating the network data (GRN and PPI) is located in the `network-database` folder. The main script for fetching, processing, and loading the data into the database is `main.py`.
+ +##### Step 1: Navigate to the `network-database` folder + +   ``` +   cd <path to database/network-database> +   ``` + +##### Step 2: Run the main.py Script + +Run the `main.py` script with the appropriate `--network` argument: + +- `all`: Fetch and populate both GRN and PPI data. +- `grn`: Fetch and populate only GRN data. +- `ppi`: Fetch and populate only PPI data. + +For example, to populate both GRN and PPI data into a local database, run: + +``` +python3 main.py --network all --db_url postgresql://localhost/postgres +``` + +**Note:** If you get the following error: ImportError: urllib3 v2.0 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'OpenSSL 1.1.0h 27 Mar 2018' (see: Drop support for OpenSSL<1.1.1 urllib3/urllib3#2168), run `pip install urllib3==1.26.6` + +**Note:** If you get an error similar to the following image, where the traceback points into the `intermine` package, then you are one of the unlucky few who has to edit the intermine package's `webservice.py` file directly. + +![image](https://user-images.githubusercontent.com/21343072/213089777-dfe772bc-deca-4df7-816f-72703db24d1e.png) + +- Navigate to the referenced file (`.../intermine/webservice.py`). If you have a virtual environment set up, you can find the file at a path like: + ``` + <path to your virtual environment>/lib/<python version>/site-packages/intermine/webservice.py + ``` +- Replace the try/except import block near the top of the file so that it looks like this (the `except ImportError` branch must import `MutableMapping` from `collections.abc`): + + ``` + try: + from urlparse import urlparse + from UserDict import DictMixin + from urllib import urlopen + from urllib import urlencode + except ImportError: + from urllib.parse import urlparse + from urllib.parse import urlencode + from collections.abc import MutableMapping as DictMixin + from urllib.request import urlopen + ``` +- Rerun the `main.py` command. + +For more information, refer to the `README.md` in the `network-database` folder. diff --git a/database2/expression-database/README.md b/database2/expression-database/README.md new file mode 100644 index 00000000..8d57bdb7 --- /dev/null +++ b/database2/expression-database/README.md @@ -0,0 +1,44 @@ +# Expression Database + +All files pertaining to the expression database live within this directory. + +## The basics + +### Scripts + +All scripts live within the subdirectory `scripts`, located at the top level of the expression database directory. + +Any source files required to run the scripts live within the subdirectory `source-files`, located at the top level of the expression database directory. As source files may be large, you must create this directory yourself and add any source files you need to use there. + +All generated results of the scripts live in the subdirectory `script-results`, located at the top level of the expression database directory. Currently, all scripts that generate output create this directory if it does not already exist. When adding a new script that generates output, best practice is to have it create the `script-results` directory and any needed subdirectories if they do not exist, in order to prevent errors and snafus in freshly cloned repositories. + +Within the scripts directory, there are the following files: + +- `preprocessing.py` +- `loader.py` + +#### Data Preprocessor(s) +*Note: Data Preprocessing is always specific to each dataset that you obtain.
`preprocessing.py` is capable of preprocessing the specific Expression data files located in `source-files/Expression 2020`. Because these files are too large to be stored on GitHub, access the direct source files on Box and move them into this directory. If more data sources are to be added to the database, create a new directory in `source-files` for each one, note it in this `README.md` file, and create a new preprocessing script for that data source (if required). Please document the changes in this section so that future developers may use your work to recreate the database if ever required.* + + * The script (`preprocessing.py`) is used to preprocess the data in `source-files/Expression 2020`. It parses through each file to construct the processed loader files, so that they are ready to load using `loader.py`. Please read through the code, as there are instructions on what to add within the comments. Good luck! + * The resulting processed expression files are located in `script-results/processed-expression`, and the resulting processed loader files are located within `script-results/processed-loader-files` + + Usage: + ``` + python3 preprocessing.py + ``` +#### Database Loader + +This script (`loader.py`) is to be used to load your preprocessed expression data into the database. + +This program generates direct SQL statements from the source files generated by the data preprocessor in order to populate a relational database with those files’ data. + +Usage: +To load to a local database +``` +python3 loader.py | psql postgresql://localhost/postgres +``` +To load to the production database +``` +python3 loader.py | psql <production database URL> +``` diff --git a/database2/expression-database/scripts/loader.py b/database2/expression-database/scripts/loader.py new file mode 100755 index 00000000..d31cfff6 --- /dev/null +++ b/database2/expression-database/scripts/loader.py @@ -0,0 +1,186 @@ +import csv +import re +# Usage +# python3 loader.py | psql postgresql://localhost/postgres +""" +This program generates direct SQL statements from the source files in order +to populate a relational database with those files’ data. + +By taking the approach of emitting SQL statements directly, we bypass the need to import +some kind of database library for the loading process, instead passing the statements +directly into a database command line utility such as `psql`.
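+
+For example, each loader function below emits one COPY block of the form:
+
+    COPY gene_expression.<table> (<columns>) FROM stdin;
+    <one tab-separated row per line>
+    \.
+
+which `psql` reads from standard input and bulk-loads into the corresponding table.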
+""" + +""" +Stolen from https://www.kite.com/python/answers/how-to-check-if-a-string-is-a-valid-float-in-python +""" +def check_float(potential_float): + try: + float(potential_float) + return True + except ValueError: + return False +""" +Inspired by https://www.kite.com/python/answers/how-to-check-if-a-string-is-a-valid-float-in-python +""" +def check_int(potential_int): + try: + int(potential_int) + return True + except ValueError: + return False +""" +Created out of necessity +""" +def convert_float(potential_float): + return float("".join(potential_float.split()).replace(" ", "")) if "".join(potential_float.split()).replace(" ", "") else -0.000000000001 +""" +Created out of necessity +""" +def convert_int(potential_int): + return int("".join(potential_int.split()).replace(" ", "")) if check_int("".join(potential_int.split()).replace(" ", "")) else -1111111 + + +""" +This program Loads Refs into the database +""" +def LOAD_REFS(): + print('COPY gene_expression.ref (pubmed_id, authors, publication_year, title, doi, ncbi_geo_id) FROM stdin;') + REFS_SOURCE = '../script-results/processed-expression/refs.csv' + with open(REFS_SOURCE, 'r+') as f: + reader = csv.reader(f) + row_num = 0 + for row in reader: + if row_num != 0: + r= ','.join(row).split('\t') + pubmed_id = r[0] + authors = r[1] + publication_year = r[2] + title = r[3] + doi = r[4] + ncbi_geo_id = r[5] + print(f'{pubmed_id}\t{authors}\t{publication_year}\t{title}\t{doi}\t{ncbi_geo_id}') + row_num += 1 + print('\\.') + +""" +This program Loads ID Mapping into the database +""" +def LOAD_GENES(): + print('COPY gene_expression.gene (gene_id, display_gene_id, species, taxon_id) FROM stdin;') + GENE_SOURCE = '../script-results/processed-expression/genes.csv' + with open(GENE_SOURCE, 'r+') as f: + reader = csv.reader(f) + row_num = 0 + for row in reader: + if row_num != 0: + r= ','.join(row).split('\t') + gene_id = r[0] + display_gene_id= r[1] + species = r[2] + taxon_id = r[3] + print(f'{gene_id}\t{display_gene_id}\t{species}\t{taxon_id}') + row_num += 1 + print('\\.') + +""" +This program Loads Expression Metadata into the database +""" +def LOAD_EXPRESSION_METADATA(): + print('COPY gene_expression.expression_metadata (ncbi_geo_id, pubmed_id, control_yeast_strain, treatment_yeast_strain, control, treatment, concentration_value, concentration_unit, time_value, time_unit, number_of_replicates, expression_table) FROM stdin;') + EXPRESSION_METADATA_SOURCE = '../script-results/processed-expression/expression-metadata.csv' + with open(EXPRESSION_METADATA_SOURCE, 'r+') as f: + reader = csv.reader(f) + row_num = 0 + for row in reader: + if row_num != 0: + r= ','.join(row).split('\t') + ncbi_geo_id = r[0] + pubmed_id =r[1] + control_yeast_strain = r[2] + treatment_yeast_strain = r[3] + control = r[4] + treatment = r[5] + concentration_value = float(r[6]) + concentration_unit = r[7] + time_value = float(r[8]) + time_unit = r[9] + number_of_replicates = int(r[10]) + expression_table = r[11] + + print(f'{ncbi_geo_id}\t{pubmed_id}\t{control_yeast_strain}\t{treatment_yeast_strain}\t{control}\t{treatment}\t{concentration_value}\t{concentration_unit}\t{time_value}\t{time_unit}\t{number_of_replicates}\t{expression_table}') + row_num += 1 + print('\\.') + +""" +This program Loads Expression Data into the database +""" +def LOAD_EXPRESSION_DATA(): + print('COPY gene_expression.expression (gene_id, taxon_id, sort_index, sample_id, expression, time_point, dataset) FROM stdin;') + EXPRESSION_DATA_SOURCE = 
'../script-results/processed-expression/expression-data.csv' + with open(EXPRESSION_DATA_SOURCE, 'r+') as f: + reader = csv.reader(f) + row_num = 0 + for row in reader: + if row_num != 0: + r= ','.join(row).split('\t') + gene_id = r[0] + taxon_id = r[1] + sort_index = int(r[2]) + sample_id = r[3] + expression = float(r[4]) if r[4] != "" else "NaN" + + time_point = float(r[5]) + data_set = r[6] + print(f'{gene_id}\t{taxon_id}\t{sort_index}\t{sample_id}\t{expression}\t{time_point}\t{data_set}') + row_num += 1 + print('\\.') + +""" +This program Loads Production Rates into the database +""" +def LOAD_PRODUCTION_RATES(): + print('COPY gene_expression.production_rate (gene_id, taxon_id, ncbi_geo_id, pubmed_id, production_rate) FROM stdin;') + PRODUCTION_RATES_SOURCE = '../script-results/processed-expression/production-rates.csv' + with open(PRODUCTION_RATES_SOURCE, 'r+') as f: + reader = csv.reader(f) + row_num = 0 + for row in reader: + if row_num != 0: + r= ','.join(row).split('\t') + gene_id = r[0] + taxon_id = r[1] + ncbi_geo_id = r[2] + pubmed_id = r[3] + production_rate = float(r[4]) if r[4] != "" else "NaN" + print(f'{gene_id}\t{taxon_id}\t{ncbi_geo_id}\t{pubmed_id}\t{production_rate}') + row_num += 1 + print('\\.') + +""" +This program Loads Degradation Rates into the database +""" +def LOAD_DEGRADATION_RATES(): + print('COPY gene_expression.degradation_rate (gene_id, taxon_id, ncbi_geo_id, pubmed_id, degradation_rate) FROM stdin;') + DEGRADATION_RATES_SOURCE = '../script-results/processed-expression/degradation-rates.csv' + with open(DEGRADATION_RATES_SOURCE, 'r+') as f: + reader = csv.reader(f) + row_num = 0 + for row in reader: + if row_num != 0: + r= ','.join(row).split('\t') + gene_id = r[0] + taxon_id = r[1] + ncbi_geo_id = r[2] + pubmed_id = r[3] + degradation_rate = float(r[4]) if r[4] != "" else "NaN" + print(f'{gene_id}\t{taxon_id}\t{ncbi_geo_id}\t{pubmed_id}\t{degradation_rate}') + row_num += 1 + print('\\.') + +LOAD_REFS() +LOAD_GENES() +LOAD_EXPRESSION_METADATA() +LOAD_EXPRESSION_DATA() +LOAD_PRODUCTION_RATES() +LOAD_DEGRADATION_RATES() diff --git a/database2/expression-database/scripts/preprocessing.py b/database2/expression-database/scripts/preprocessing.py new file mode 100755 index 00000000..f184109a --- /dev/null +++ b/database2/expression-database/scripts/preprocessing.py @@ -0,0 +1,197 @@ +import csv +import re +import sys +import os + +# Need to manually add Dahlquist data to Expression metadata and refs + + +species = "Saccharomyces cerevisiae" +taxon_id = "559292" + +# Gene Id Generation and Expression Data Generation + +# Create folder paths +if not os.path.exists('../script-results'): + os.makedirs('../script-results') + +if not os.path.exists('../script-results/processed-expression/'): + os.makedirs('../script-results/processed-expression') + +# For simplicity, we assume that the program runs in the expression-database-folder. 
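+# Note: the relative paths below ('../source-files', '../script-results') resolve correctly
+# when this script is run from inside the `scripts` subdirectory, as described in the README.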
+EXPRESSION_DATA_SOURCE = '../source-files/Expression 2020/ExpressionData.csv' +EXPRESSION_DATA_DESTINATION = '../script-results/processed-expression/expression-data.csv' +EXPRESSION_SHEET_DESTINATION = '../script-results/processed-expression/expression-sheet.csv' +GENES_DESTINATION = '../script-results/processed-expression/genes.csv' + +genes = {} +expression_data = [] +expression_sheets = {} +print(f'Processing file {EXPRESSION_DATA_SOURCE}') +with open(EXPRESSION_DATA_SOURCE, 'r+', encoding="UTF-8") as f: + i = 0 + replicate_count = 0 + prev_dataset = "" + reader = csv.reader(f) + for row in reader: + if i != 0: + col_num = 0 + display_gene_id = row[2].replace('\t','') + gene_id = row[1].replace('\t','') + sort_index = row[0] + sample_id = row[4] + expression = row[5] + time_points = row[6] + dataset = row[7] + # update the objects + if gene_id not in genes: + genes.update({gene_id : [display_gene_id, species, taxon_id]}) + expression_data.append([gene_id, taxon_id, sort_index, sample_id, expression, time_points, dataset]) + i+=1 +print(f'Creating {EXPRESSION_DATA_DESTINATION}\n') +expression_data_file = open(EXPRESSION_DATA_DESTINATION, 'w') +expression_data_file.write(f'Gene ID\tTaxon ID\tSort Index\tSample ID\tExpression\tTime Points\tDataset\n') +for d in expression_data: + result = '{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(d[0], d[1], d[2], d[3], d[4], d[5], d[6]) + expression_data_file.write(f'{result}\n') +expression_data_file.close() + +# Expression Metadata +EXPRESSION_METADATA_SOURCE = '../source-files/Expression 2020/ExpressionMetadata.csv' +EXPRESSION_METADATA_DESTINATION = '../script-results/processed-expression/expression-metadata.csv' +# Add Dalquist Data Here +expression_metadata = [ + # [1, 'GSE83656', '', 'control_yeast_strain', 'treatment_yeast_strain', 'control', 'treatment', 'concentration_value', 'concentration_unit', 'time_value', 'time_unit', 'number_of_replicates,', 'expression_table'], + # [3, 'GSE83656', '', 'control_yeast_strain', 'treatment_yeast_strain', 'control', 'treatment', 'concentration_value', 'concentration_unit', 'time_value', 'time_unit', 'number_of_replicates,', 'expression_table'], + # [2, 'GSE83656', '', 'control_yeast_strain', 'treatment_yeast_strain', 'control', 'treatment', 'concentration_value', 'concentration_unit', 'time_value', 'time_unit', 'number_of_replicates,', 'expression_table'], + # [4, 'GSE83656', '', 'control_yeast_strain', 'treatment_yeast_strain', 'control', 'treatment', 'concentration_value', 'concentration_unit', 'time_value', 'time_unit', 'number_of_replicates,', 'expression_table'], +] + +pubmed_to_geo_conversion = { + '12269742': 'GSE9336', + '17327492': 'GSE6129', + '23039231': 'GSE24712' +} + +print(f'Processing file {EXPRESSION_METADATA_SOURCE}') +with open(EXPRESSION_METADATA_SOURCE, 'r+', encoding="UTF-8") as f: + i = 0 + reader = csv.reader(f) + for row in reader: + if i != 0: + # replicate_index = row[0][-1] + pubmed_id = row[1] + geo_id = pubmed_to_geo_conversion[pubmed_id] + control_yeast_strain = row[2] + treatment_yeast_strain = row[3] + control = row[4] + treatment = row[5] + concentration_value = row[6] + concentration_unit = row[7] + time_value = row[8] + time_unit = row[9] + number_of_replicates = row[10] + expression_table = row[11] + + expression_metadata.append([geo_id, pubmed_id, control_yeast_strain, treatment_yeast_strain, control, treatment, concentration_value, concentration_unit, time_value, time_unit, number_of_replicates, expression_table]) + # next row + i+= 1 + +print(f'Creating 
{EXPRESSION_METADATA_DESTINATION}\n') +expression_metadata_file = open(EXPRESSION_METADATA_DESTINATION, 'w') +expression_metadata_file.write(f'NCBI GEO ID\tPubmed ID\tControl Yeast Strain\tTreatment Yeast Strain\tControl\tTreatment\tConcentration Value\tConcentration Unit\tTime Value\tTime Units\tNumber of Replicates\tExpression Table\n') +for m in expression_metadata: + expression_metadata_file.write(f'{m[0]}\t{m[1]}\t{m[2]}\t{m[3]}\t{m[4]}\t{m[5]}\t{m[6]}\t{m[7]}\t{m[8]}\t{m[9]}\t{m[10]}\t{m[11]}\n') +expression_metadata_file.close() + + +# Refs csv file generation (She is smol so we write her ourselves) +refs = [ + # [pubmed_id, authors, publication_year, title, doi, ncbi_geo_id] + ['12269742', 'Kitagawa E., Takahashi J., Momose Y., Iwahashi H.', '2002', 'Effects of the Pesticide Thiuram: Genome-wide Screening of Indicator Genes by Yeast DNA Microarray', '10.1021/es015705v', 'GSE9336'], + ['17327492', 'Thorsen, M., Lagniel, G., Kristiansson, E., Junot, C., Nerman, O., Labarre, J., & Tamás, M. J.', '2007', 'Quantitative transcriptome, proteome, and sulfur metabolite profiling of the Saccharomyces cerevisiae response to arsenite.', '10.1152/physiolgenomics.00236.2006', 'GSE6129'], + ['23039231', 'Barreto, L., Canadell, D., Valverde‐Saubí, D., Casamayor, A., & Ariño, J.', '2012', 'The short‐term response of yeast to potassium starvation', '10.1111/j.1462-2920.2012.02887.x', 'GSE24712'], + ['', 'Dahlquist KD, Abdulla H, Arnell AJ, Arsan C, Baker JM, Carson RM, Citti WT, De Las Casas SE, Ellis LG, Entzminger KC, Entzminger SD, Fitzpatrick BG, Flores SP, Harmon NS, Hennessy KP, Herman AF, Hong MV, King HL, Kubeck LN, La-Anyane OM, Land DL, Leon Guerrero MJ, Liu EM, Luu MD, McGee KP, Mejia MR, Melone SN, Pepe NT, Rodriguez KR, Rohacz NA, Rovetti RJ, Sakhon OS, Sampana JT, Sherbina K, Terada LH, Vega AJ, Wavrin AJ, Wyllie KW, Zapata BB', + '2018', 'Global transcriptional response of wild type and transcription factor deletion strains of Saccharomyces cerevisiae to the environmental stress of cold shock and subsequent recovery', + '', 'GSE83656'], + ['25161313', 'Neymotin, B., Athanasiadou R., and Gresham D.', '2014', ' Determination of in vivo RNA kinetics using RATE-seq. 
RNA, 20, 1645-1652.', '10.1261/rna.045104.114', ''] +] + +REFS_DESTINATION = '../script-results/processed-expression/refs.csv' +print(f'Creating {REFS_DESTINATION}\n') +refs_file = open(REFS_DESTINATION, 'w') +refs_file.write(f'Pubmed ID\tAuthors\tPublication Year\tTitle\tDOI\tNCBI GEO ID\n') +for r in refs: + result = '{}\t{}\t{}\t{}\t{}\t{}'.format(r[0], r[1], r[2], r[3], r[4], r[5]) + refs_file.write(f'{result}\n') +refs_file.close() + +# Degradation Rates +DEGRADATION_RATES_SOURCE = '../source-files/Expression 2020/DegradationRates.csv' +DEGRADATION_RATES_DESTINATION = '../script-results/processed-expression/degradation-rates.csv' + +degradation_rates = [] + +print(f'Processing file {DEGRADATION_RATES_SOURCE}') +with open(DEGRADATION_RATES_SOURCE, 'r+', encoding="UTF-8") as f: + i = 0 + reader = csv.reader(f) + for row in reader: + if i != 0: + gene_id = row[0] + display_gene_id = row[1] + degradation_rate = row[2] + pubmed_id = "25161313" + geo_id = "" + degradation_rates.append([gene_id, taxon_id, geo_id, pubmed_id, degradation_rate]) + if gene_id not in genes: + genes.update({gene_id : [display_gene_id, species, taxon_id]}) + i+= 1 + +print(f'Creating {DEGRADATION_RATES_DESTINATION}\n') +degradation_rates_file = open(DEGRADATION_RATES_DESTINATION, 'w') +degradation_rates_file.write(f'Gene ID\tTaxon ID\tNCBI GEO ID\tPubmed ID\tDegradation Rate\n') +for r in degradation_rates: + result = '{}\t{}\t{}\t{}\t{}'.format(r[0], r[1], r[2], r[3], r[4]) + degradation_rates_file.write(f'{result}\n') +degradation_rates_file.close() + +# Production Rates +PRODUCTION_RATES_SOURCE = '../source-files/Expression 2020/ProductionRates.csv' +PRODUCTION_RATES_DESTINATION = '../script-results/processed-expression/production-rates.csv' + +production_rates = [] + +print(f'Processing file {PRODUCTION_RATES_SOURCE}') +with open(PRODUCTION_RATES_SOURCE, 'r+', encoding="UTF-8") as f: + i = 0 + reader = csv.reader(f) + for row in reader: + if i != 0: + gene_id = row[0] + display_gene_id = row[1] + production_rate = row[2] + pubmed_id = "25161313" + geo_id = "" + production_rates.append([gene_id, taxon_id, geo_id, pubmed_id, production_rate]) + if gene_id not in genes: + genes.update({gene_id : [display_gene_id, species, taxon_id]}) + # next row + i+= 1 + +print(f'Creating {PRODUCTION_RATES_DESTINATION}\n') +production_rates_file = open(PRODUCTION_RATES_DESTINATION, 'w') +production_rates_file.write(f'Gene ID\tTaxon ID\tNCBI GEO ID\tPubmed ID\tProduction Rate\n') +for r in production_rates: + result = '{}\t{}\t{}\t{}\t{}'.format(r[0], r[1], r[2], r[3], r[4]) + production_rates_file.write(f'{result}\n') +production_rates_file.close() + + +print(f'Creating {GENES_DESTINATION}\n') +genes_file = open(GENES_DESTINATION, 'w') +genes_file.write(f'Gene ID\tDisplay Gene ID\tSpecies\tTaxon ID\n') +for g in genes: + result = '{}\t{}\t{}\t{}'.format(g, genes[g][0], genes[g][1], genes[g][2],) + genes_file.write(f'{result}\n') +genes_file.close() \ No newline at end of file diff --git a/database2/grnsettings-database/README.md b/database2/grnsettings-database/README.md new file mode 100644 index 00000000..cb344570 --- /dev/null +++ b/database2/grnsettings-database/README.md @@ -0,0 +1,52 @@ +# GRNsettings Database +The schema of this database lives within this directory. + +## The basics + +### Schema +The default database name is stored within the settings schema on our Postgres database. + +The schema is located within this directory at the top level of this file `schema.sql`. 
It creates the schema and defines the table located within the settings schema. + +1. Move to the folder that contains the `schema.sql` file, which is the `database/grnsettings-database` folder + +2. Load the schema into the database + +    Example of loading to a local database + +    For Windows: +    ``` +    psql -U postgres -f schema.sql postgresql://localhost/postgres +    ``` + +    For Mac: +    ``` +    psql -f schema.sql postgresql://localhost/postgres +    ``` + +### Changing the default database name + +1. In order to change the default database name, you would first need to log in to the database using the following command: + +    For Windows: +    ``` +    psql -U postgres
+ ``` + For Mac: + ``` + psql
+ ``` + +2. Then you will need to set your search path to the settings schema using the following command : + ``` + SET SEARCH_PATH TO settings; + ``` +3. After that you will simply delete the current default database name using this command + ``` + DELETE FROM grnsettings; + ``` +4. And then insert the new database name using the following command + ``` + INSERT INTO grnsettings(expression_dataset) VALUES ('the new default database name'); + ``` + _The current default database is 'dahlquist_2018'_ diff --git a/database2/network-database/ReadME.md b/database2/network-database/ReadME.md new file mode 100644 index 00000000..19a8fe64 --- /dev/null +++ b/database2/network-database/ReadME.md @@ -0,0 +1 @@ +# Network Database \ No newline at end of file diff --git a/database2/network-database/constants.py b/database2/network-database/constants.py new file mode 100644 index 00000000..f2c1f753 --- /dev/null +++ b/database2/network-database/constants.py @@ -0,0 +1,27 @@ +class Constants: + # database namespace + GRN_DATABASE_NAMESPACE = "gene_regulatory_network_new" + PPI_DATABASE_NAMESPACE = "protein_protein_interactions_new" + + # network types + GRN_NETWORK_MODE = "grn" + PPI_NETWORK_MODE = "ppi" + + # data file paths + DATA_DIRECTORY = "script-results" + GENE_DATA_FILEPATH = DATA_DIRECTORY + "/gene_data.tsv" + PROTEIN_DATA_FILEPATH = DATA_DIRECTORY + "/protein_data.tsv" + GENE_REGULATORY_NETWORK_DATA_FILEPATH = DATA_DIRECTORY + "/gene_regulatory_network_data.tsv" + PROTEIN_PROTEIN_INTERACTIONS_DATA_FILEPATH = DATA_DIRECTORY + "/protein_protein_interactions_data.tsv" + SOURCE_DATA_FILEPATH = DATA_DIRECTORY + "/source_data.tsv" + + # missing and update file paths + MISSING_DATA_DIRECTORY = DATA_DIRECTORY + "/missing_data" + UPDATE_DATA_DIRECTORY = DATA_DIRECTORY + "/update_data" + MISSING_GRN_GENE_DATA_FILEPATH = MISSING_DATA_DIRECTORY + "/missing_grn_gene_data.tsv" + UPDATE_GRN_GENE_DATA_FILEPATH = UPDATE_DATA_DIRECTORY + "/update_grn_gene_data.tsv" + MISSING_PPI_GENE_DATA_FILEPATH = MISSING_DATA_DIRECTORY + "/missing_ppi_gene_data.tsv" + UPDATE_PPI_GENE_DATA_FILEPATH = UPDATE_DATA_DIRECTORY + "/update_ppi_gene_data.tsv" + MISSING_PROTEIN_DATA_FILEPATH = MISSING_DATA_DIRECTORY + "/missing_protein_data.tsv" + UPDATE_PROTEIN_DATA_FILEPATH = UPDATE_DATA_DIRECTORY + "/update_protein_data.tsv" + UPDATE_PROTEIN_NAME_DATA_FILEPATH = UPDATE_DATA_DIRECTORY + "/update_protein_name_data.tsv" diff --git a/database2/network-database/data_services/data_fetcher_service.py b/database2/network-database/data_services/data_fetcher_service.py new file mode 100644 index 00000000..c5eaa67a --- /dev/null +++ b/database2/network-database/data_services/data_fetcher_service.py @@ -0,0 +1,244 @@ +from abc import ABC, abstractmethod +from intermine.webservice import Service +import requests +import pandas as pd +from io import StringIO + +class DataFetcherService(ABC): + def __init__(self): + self.service = Service("https://www.alliancegenome.org/alliancemine/service") + + @abstractmethod + def fetch_data(self): + pass + +class GeneFetcherService(DataFetcherService): + def fetch_data(self): + print("Fetching data from GeneFetcherService") + + query = self.service.new_query("Gene") + query.add_view( + "primaryIdentifier", "name", "briefDescription", + "chromosome.primaryIdentifier", "chromosomeLocation.start", + "chromosomeLocation.end", "chromosomeLocation.strand", "organism.shortName", + "featureType", "symbol", "secondaryIdentifier" + ) + query.add_constraint("organism.shortName", "=", "S. 
cerevisiae", code="A") + query.add_constraint("featureType", "=", "ORF", code="B") + query.add_sort_order("Gene.primaryIdentifier", "ASC") + + rows_data = [] + for row in query.rows(): + rows_data.append({ + "primaryIdentifier": row["primaryIdentifier"], + "name": row["name"], + "briefDescription": row["briefDescription"], + "chromosome.primaryIdentifier": row["chromosome.primaryIdentifier"], + "chromosomeLocation.start": row["chromosomeLocation.start"], + "chromosomeLocation.end": row["chromosomeLocation.end"], + "chromosomeLocation.strand": row["chromosomeLocation.strand"], + "organism.shortName": row["organism.shortName"], + "featureType": row["featureType"], + "standardName": row["symbol"] if pd.notnull(row["symbol"]) else row["secondaryIdentifier"], + "systematicName": row["secondaryIdentifier"] + }) + + df = pd.DataFrame(rows_data) + + print("Data fetched successfully") + print("====================================================================") + return df + + +class GeneRegulatoryNetworkFetcherService(DataFetcherService): + def fetch_data(self): + print("Fetching data from GeneRegulatoryNetworkFetcherService") + + query = self.service.new_query("Gene") + query.add_constraint("regulatoryRegions", "TFBindingSite") + + query.add_view( + "regulatoryRegions.regulator.symbol", + "regulatoryRegions.regulator.secondaryIdentifier", "symbol", + "secondaryIdentifier", "regulatoryRegions.regEvidence.ontologyTerm.name", + "regulatoryRegions.regEvidence.ontologyTerm.identifier", + "regulatoryRegions.experimentCondition", + "regulatoryRegions.strainBackground", + "regulatoryRegions.regulationDirection", "regulatoryRegions.regulationType", + "regulatoryRegions.regulatorType", + "regulatoryRegions.publications.pubMedId", "regulatoryRegions.datasource", + "regulatoryRegions.annotationType", "featureType", + "regulatoryRegions.regulator.featureType" + ) + + query.add_sort_order("Gene.secondaryIdentifier", "ASC") + query.add_constraint("featureType", "=", "ORF", code="A") + query.add_constraint("regulatoryRegions.regulator.featureType", "=", "ORF", code="B") + query.add_constraint("regulatoryRegions.strainBackground", "=", "S288c", code="C") + + rows_data = [] + print("Query length: ", len(query.rows())) + networks = set() + for row in query.rows(): + network = (row["secondaryIdentifier"], row["regulatoryRegions.regulator.secondaryIdentifier"], row["regulatoryRegions.annotationType"]) + if network in networks: + continue + else: + networks.add(network) + rows_data.append({ + "regulatorStandardName": row["regulatoryRegions.regulator.symbol"], + "regulatorSystematicName": row["regulatoryRegions.regulator.secondaryIdentifier"], + "targetStandardName": row["symbol"], + "targetSystematicName": row["secondaryIdentifier"], + "ontologyTermName": row["regulatoryRegions.regEvidence.ontologyTerm.name"], + "ontologyTermIdentifier": row["regulatoryRegions.regEvidence.ontologyTerm.identifier"], + "experimentCondition": row["regulatoryRegions.experimentCondition"], + "strainBackground": row["regulatoryRegions.strainBackground"], + "regulationDirection": row["regulatoryRegions.regulationDirection"], + "regulatoryRegionsRegulationType": row["regulatoryRegions.regulationType"], + "regulatoryRegionsRegulatorType": row["regulatoryRegions.regulatorType"], + "pubMedId": row["regulatoryRegions.publications.pubMedId"], + "datasource": row["regulatoryRegions.datasource"], + "annotationType": row["regulatoryRegions.annotationType"] + }) + + df = pd.DataFrame(rows_data) + print("Data fetched successfully") + print("Number of 
duplicated networks: ", len(query.rows()) - len(networks)) + print("====================================================================") + return df + +class ProteinProteinInteractionsFetcherService(DataFetcherService): + def fetch_data(self): + print("Fetching data from ProteinProteinInteractionsFetcherService") + query = self.service.new_query("Gene") + query.add_constraint("interactions.participant2", "Gene") + + query.add_view( + "primaryIdentifier", "secondaryIdentifier", "symbol", "name", "sgdAlias", + "interactions.details.annotationType", + "interactions.details.experiment.publication.pubMedId", + "interactions.participant2.symbol", + "interactions.participant2.secondaryIdentifier", + "interactions.details.experiment.interactionDetectionMethods.identifier", + "interactions.details.experiment.name", + "interactions.details.relationshipType", "featureType", + "interactions.participant2.featureType", "proteins.symbol", + "interactions.participant2.proteins.symbol" + ) + + query.add_sort_order("Gene.primaryIdentifier", "ASC") + query.add_constraint("interactions.details.relationshipType", "=", "physical", code="A") + query.add_constraint("interactions.participant2.featureType", "=", "ORF", code="C") + query.add_constraint("featureType", "=", "ORF", code="B") + query.set_logic("A and B and C") + + rows_data = [] + interactions = set() + count = 0 + print("Query length: ", len(query.rows())) + for row in query.rows(): + interaction = (row["secondaryIdentifier"], row["interactions.participant2.secondaryIdentifier"], row["interactions.details.annotationType"]) + if interaction in interactions: + count += 1 + continue + else: + interactions.add(interaction) + rows_data.append({ + "primaryIdentifier": row["primaryIdentifier"], + "gene1SystematicName": row["secondaryIdentifier"], + "gene1StandardName": row["symbol"], + "protein1StandardName": row["proteins.symbol"], + "name ": row["name"], + "sgdAlias": row["sgdAlias"], + "annotationType": row["interactions.details.annotationType"], + "pubMedId": row["interactions.details.experiment.publication.pubMedId"], + "gene2StandardName": row["interactions.participant2.symbol"], + "gene2SystematicName": row["interactions.participant2.secondaryIdentifier"], + "protein2StandardName": row["interactions.participant2.proteins.symbol"], + "interactionDetectionMethodsIdentifier": row["interactions.details.experiment.interactionDetectionMethods.identifier"], + "experimentName": row["interactions.details.experiment.name"], + "relationshipType": row["interactions.details.relationshipType"] + }) + + df = pd.DataFrame(rows_data) + print("Data fetched successfully") + print("Number of duplicated interactions: ", count) + print("====================================================================") + return df + +class ProteinFetcherService(DataFetcherService): + def fetch_data(self): + print("Fetching data from ProteinFetcherService") + + query = self.service.new_query("Gene") + + # The view specifies the output columns + query.add_view( + "proteins.secondaryIdentifier", "proteins.symbol", "proteins.molecularWeight", + "proteins.pI", "proteins.length", "proteins.ntermseq", "proteins.ctermseq", + "proteins.gravyScore", "proteins.aromaticityScore", "proteins.cai", + "proteins.codonBias", "proteins.fopScore", "proteins.ala", "proteins.arg", + "proteins.asn", "proteins.asp", "proteins.cys", "proteins.gln", "proteins.glu", + "proteins.gly", "proteins.his", "proteins.ile", "proteins.leu", + "proteins.lys", "proteins.met", "proteins.phe", "proteins.pro", + 
"proteins.ser", "proteins.thr", "proteins.trp", "proteins.val", + "proteins.carbon", "proteins.hydrogen", "proteins.nitrogen", + "proteins.oxygen", "proteins.sulphur", "proteins.instabilityIndex", + "proteins.allCysHalf", "proteins.noCysHalf", "proteins.aliphaticIndex", "symbol" + ) + + query.add_constraint("organism.shortName", "=", "S. cerevisiae", code="B") + query.add_constraint("featureType", "=", "ORF", code="A") + query.add_sort_order("Gene.secondaryIdentifier", "ASC") + + rows_data = [] + for row in query.rows(): + rows_data.append({ + "geneStandardName": row["symbol"], + "proteinSystematicName": row["proteins.secondaryIdentifier"], + "proteinStandardName": row["proteins.symbol"], + "molecularWeight": row["proteins.molecularWeight"] if pd.notnull(row["proteins.molecularWeight"]) else "0", + "pI": row["proteins.pI"] if pd.notnull(row["proteins.pI"]) else "0", + "length": row["proteins.length"] if pd.notnull(row["proteins.length"]) else "0", + "ntermseq": row["proteins.ntermseq"], + "ctermseq": row["proteins.ctermseq"], + "gravyScore": row["proteins.gravyScore"], + "aromaticityScore": row["proteins.aromaticityScore"], + "cai": row["proteins.cai"], + "codonBias": row["proteins.codonBias"], + "fopScore": row["proteins.fopScore"], + "ala": row["proteins.ala"], + "arg": row["proteins.arg"], + "asn": row["proteins.asn"], + "asp": row["proteins.asp"], + "cys": row["proteins.cys"], + "gln": row["proteins.gln"], + "glu": row["proteins.glu"], + "gly": row["proteins.gly"], + "his": row["proteins.his"], + "ile": row["proteins.ile"], + "leu": row["proteins.leu"], + "lys": row["proteins.lys"], + "met": row["proteins.met"], + "phe": row["proteins.phe"], + "pro": row["proteins.pro"], + "ser": row["proteins.ser"], + "thr": row["proteins.thr"], + "trp": row["proteins.trp"], + "val": row["proteins.val"], + "carbon": row["proteins.carbon"], + "hydrogen": row["proteins.hydrogen"], + "nitrogen": row["proteins.nitrogen"], + "oxygen": row["proteins.oxygen"], + "sulphur": row["proteins.sulphur"], + "instabilityIndex": row["proteins.instabilityIndex"], + "allCysHalf": row["proteins.allCysHalf"], + "noCysHalf": row["proteins.noCysHalf"], + "aliphaticIndex": row["proteins.aliphaticIndex"] + }) + + df = pd.DataFrame(rows_data) + print("Data fetched successfully") + print("====================================================================") + return df \ No newline at end of file diff --git a/database2/network-database/data_services/data_generator.py b/database2/network-database/data_services/data_generator.py new file mode 100644 index 00000000..09500155 --- /dev/null +++ b/database2/network-database/data_services/data_generator.py @@ -0,0 +1,58 @@ +from data_services.data_fetcher_service import * +from data_services.processor import * +from data_services.save_service import * +from constants import Constants + +class DataGenerator: + def __init__(self, data_fetcher=None, processor=None, save_service=None, filepath=None): + self.data_fetcher = data_fetcher + self.processor = processor + self.save_service = save_service + self.file_directory = Constants.DATA_DIRECTORY + self.filepath = filepath + self.data = None + self.generate() + + def generate(self): + if self.data_fetcher: + self.data = self.data_fetcher.fetch_data() + if self.processor: + self.data = self.processor.process_data(self.data) + if self.save_service and self.data is not None: + self.save_service.save(self.data, self.file_directory, self.filepath) + + +class GeneRegulatoryNetworkDataGenerator(DataGenerator): + def __init__(self, data_fetcher, 
processor, save_service): + super().__init__(data_fetcher, processor, save_service, Constants.GENE_REGULATORY_NETWORK_DATA_FILEPATH) + + +class GeneDataGenerator(DataGenerator): + def __init__(self, data_fetcher, processor, save_service, regulators=None, proteins=None): + self.regulators = regulators + self.proteins = proteins + super().__init__(data_fetcher, processor, save_service, Constants.GENE_DATA_FILEPATH) + + def generate(self): + self.data = self.data_fetcher.fetch_data() + self.data = self.processor.process_data(self.data, self.regulators, self.proteins) + self.save_service.save(self.data, self.file_directory, self.filepath) + + +class ProteinDataGenerator(DataGenerator): + def __init__(self, data_fetcher, processor, save_service): + super().__init__(data_fetcher, processor, save_service, Constants.PROTEIN_DATA_FILEPATH) + + +class ProteinProteinInteractionsDataGenerator(DataGenerator): + def __init__(self, data_fetcher, processor, save_service): + super().__init__(data_fetcher, processor, save_service, Constants.PROTEIN_PROTEIN_INTERACTIONS_DATA_FILEPATH) + + +class SourceDataGenerator(DataGenerator): + def __init__(self, processor, save_service): + super().__init__(None, processor, save_service, Constants.SOURCE_DATA_FILEPATH) + + def generate(self): + self.data = self.processor.process_data() + self.save_service.save(self.data, self.file_directory, self.filepath) diff --git a/database2/network-database/data_services/processor.py b/database2/network-database/data_services/processor.py new file mode 100644 index 00000000..d9623471 --- /dev/null +++ b/database2/network-database/data_services/processor.py @@ -0,0 +1,159 @@ +from abc import ABC, abstractmethod +from datetime import datetime, timezone, timedelta +import pandas as pd + +class Processor(ABC): + def __init__(self, formatted_time_stamp=None): + self.species = "Saccharomyces cerevisiae" + self.taxon_id = "559292" + self.source = "AllianceMine - Saccharomyces Genome Database" + self.source_display_name = "AllianceMine - SGD" + self.formatted_time_stamp = formatted_time_stamp + + @abstractmethod + def process_data(self, data): + pass + +class GeneProcessor(Processor): + def __init__(self, formatted_time_stamp): + super().__init__(formatted_time_stamp) + + def process_data(self, data, regulators, proteins): + print("Processing data from GeneProcessor") + + genes_df = data[['systematicName', 'standardName']] + if proteins is not None: + combine_genes_df = pd.concat([genes_df, self._combine_with_protein_genes(genes=data, proteins=proteins)]) + else: + combine_genes_df = genes_df + processed_data = [] + for _, row in combine_genes_df.iterrows(): + gene_id = row['systematicName'] + # Check if the gene_id (systematicName) matches any of the regulators + regulator = gene_id in regulators["regulator_gene_id"].values + + processed_data.append({ + "gene_id": gene_id, + "display_gene_id": row['standardName'], + "species": self.species, + "taxon_id": self.taxon_id, + "regulator": regulator, + "time_stamp": self.formatted_time_stamp, + "source": self.source + }) + + processed_df = pd.DataFrame(processed_data) + print("Finished processing data from GeneProcessor") + print("====================================================================") + return processed_df + + + def _combine_with_protein_genes(self, genes, proteins): + genes_systematic_names = set(genes['systematicName']) + proteins_systematic_names = set(proteins['gene_systematic_name']) + diff_systematic_names = 
genes_systematic_names.symmetric_difference(proteins_systematic_names) + + # Filter the rows in genes and proteins where their first element is in the difference + genes_diff = genes[genes['systematicName'].isin(diff_systematic_names)] + proteins_diff = proteins[proteins['gene_systematic_name'].isin(diff_systematic_names)] + + # Combine the differences from both genes and proteins + diff_combined = pd.concat([ + genes_diff[['systematicName', 'standardName']], + proteins_diff[['gene_systematic_name', 'standard_name']].rename( + columns={'gene_systematic_name': 'systematicName', 'standard_name': 'standardName'} + ) + ], ignore_index=True) + + return diff_combined + + +class GeneRegulatoryNetworkProcessor(Processor): + def __init__(self, formatted_time_stamp): + super().__init__(formatted_time_stamp) + + def process_data(self, data): + print("Processing data from GeneRegulatoryNetworkProcessor") + + processed_data = [] + + for _, row in data.iterrows(): + processed_data.append({ + "regulator_gene_id": row['regulatorSystematicName'], + "target_gene_id": row['targetSystematicName'], + "taxon_id": self.taxon_id, + "annotation_type": row['annotationType'], + "time_stamp": self.formatted_time_stamp, + "source": self.source + }) + + processed_df = pd.DataFrame(processed_data) + print("Finished processing data from GeneRegulatoryNetworkProcessor") + print("====================================================================") + return processed_df + +class ProteinProcessor(Processor): + def __init__(self, formatted_time_stamp): + super().__init__(formatted_time_stamp) + + def process_data(self, data): + print("Processing data from ProteinProcessor") + + processed_data = [] + for _, row in data.iterrows(): + processed_data.append({ + "standard_name": row['proteinStandardName'], + "gene_systematic_name": row['proteinSystematicName'], + "length": row['length'], + "molecular_weight": row['molecularWeight'], + "pi": row['pI'], + "taxon_id": self.taxon_id, + "time_stamp": self.formatted_time_stamp, + "source": self.source + }) + + processed_df = pd.DataFrame(processed_data) + print("Finished processing data from ProteinProcessor") + print("====================================================================") + return processed_df + +class ProteinProteinInteractionsProcessor(Processor): + def __init__(self, formatted_time_stamp): + super().__init__(formatted_time_stamp) + + def process_data(self, data): + print("Processing data from ProteinProteinInteractionsProcessor") + processed_data = [] + for _, row in data.iterrows(): + processed_data.append({ + "protein1": row['protein1StandardName'], + "protein2": row['protein2StandardName'], + "interaction_detection_methods_identifier": row['interactionDetectionMethodsIdentifier'], + "annotation_type": row['annotationType'], + "experiment_name": row['experimentName'], + "time_stamp": self.formatted_time_stamp, + "source": self.source + }) + + processed_df = pd.DataFrame(processed_data) + print("Finished processing data from ProteinProteinInteractionsProcessor") + print("====================================================================") + return processed_df + +class SourceProcessor(Processor): + def __init__(self, formatted_time_stamp): + super().__init__(formatted_time_stamp) + + def process_data(self): + print("Processing data from SourceProcessor") + processed_data = [] + processed_data.append({ + "time_stamp": self.formatted_time_stamp, + "source": self.source, + "display_name": self.source_display_name + }) + + processed_df = pd.DataFrame(processed_data) 
+ print("Finished processing data from SourceProcessor") + print("====================================================================") + return processed_df \ No newline at end of file diff --git a/database2/network-database/data_services/save_service.py b/database2/network-database/data_services/save_service.py new file mode 100644 index 00000000..f7163d62 --- /dev/null +++ b/database2/network-database/data_services/save_service.py @@ -0,0 +1,14 @@ +import os + +class SaveToTSVService: + + def save(self, data, file_directory, filepath): + print(f"Saving data to {filepath} file") + + if not os.path.exists(file_directory): + os.makedirs(file_directory, exist_ok=True) + + data.to_csv(filepath, sep='\t', index=False) + + print(f"Data saved to {filepath} file") + print("====================================================================") \ No newline at end of file diff --git a/database2/network-database/database_services/filter.py b/database2/network-database/database_services/filter.py new file mode 100644 index 00000000..426a4f9b --- /dev/null +++ b/database2/network-database/database_services/filter.py @@ -0,0 +1,156 @@ +import psycopg2 +import csv +import pandas as pd +from constants import Constants + +class Filter: + def __init__(self, db_url, save_service): + self.db_url = db_url + self.save_service = save_service + + def get_all_db_data(self, database_namespace, table_name, columns): + """ + Fetch all data from the specified table and return it as a list of dictionaries. + """ + conn = psycopg2.connect(self.db_url) + cursor = conn.cursor() + + query = f"SELECT {', '.join(columns)} FROM {database_namespace}.{table_name};" + cursor.execute(query) + + rows = cursor.fetchall() + column_names = [desc[0] for desc in cursor.description] + + result = [dict(zip(column_names, row)) for row in rows] + + cursor.close() + conn.close() + + return result + + def filter_data(self, data_filepath, db_data, key_columns, update_columns): + """ + Filter the data to return: + - Records that need to be inserted. + - Records that need to be updated. 
+ """ + with open(data_filepath, 'r') as f: + reader = csv.DictReader(f, delimiter='\t') + data = list(reader) + + db_keys = {tuple(row[col] for col in key_columns): row for row in db_data} + + insert_data = [] + update_data = [] + update_data_names = [] + + for row in data: + key_tuple = tuple(row[col] for col in key_columns) + if key_tuple in db_keys: + db_record = db_keys[key_tuple] + changes_needed = False + + for col in update_columns: + if str(row[col]).lower() != str(db_record[col]).lower(): + # Special case for protein daat that ned to check if standard name is changed + if col == "standard_name" and data_filepath == Constants.PROTEIN_DATA_FILEPATH: + update_data_names.append({ + "old_standard_name": db_record[col], + "new_standard_name": row[col], + }) + + if col == "length" or col == "molecular_weight" or col == "pi": + if float(row[col]) == float(db_record[col]): + continue + + changes_needed = True + break + + if changes_needed: + update_data.append({ + **{col: row[col] for col in key_columns + update_columns}, + }) + else: + insert_data.append(row) + + insert_data_df = pd.DataFrame(insert_data) + update_data_df = pd.DataFrame(update_data) + + self.save_service.save(insert_data_df, Constants.MISSING_DATA_DIRECTORY, self.missing_filepath) + self.save_service.save(update_data_df, Constants.UPDATE_DATA_DIRECTORY, self.update_filepath) + + if data_filepath == Constants.PROTEIN_DATA_FILEPATH: + update_data_names_df = pd.DataFrame(update_data_names) + self.save_service.save(update_data_names_df, Constants.UPDATE_DATA_DIRECTORY, Constants.UPDATE_PROTEIN_NAME_DATA_FILEPATH) + + +class ProteinFilter(Filter): + def __init__(self, db_url, save_service): + super().__init__(db_url, save_service) + self.missing_filepath = Constants.MISSING_PROTEIN_DATA_FILEPATH + self.update_filepath = Constants.UPDATE_PROTEIN_DATA_FILEPATH + + def get_all_db_data(self): + """ + Fetch all protein data from the database. + """ + columns = ["standard_name", "gene_systematic_name", "length", "molecular_weight", "pi"] + return super().get_all_db_data(Constants.PPI_DATABASE_NAMESPACE, "protein", columns) + + def filter_data(self): + """ + Filter protein data that is missing or needs to be updated in the database. + """ + db_data = self.get_all_db_data() + + key_columns = ["gene_systematic_name"] + update_columns = ["standard_name", "length", "molecular_weight", "pi"] + + return super().filter_data(Constants.PROTEIN_DATA_FILEPATH, db_data, key_columns, update_columns) + +class GeneFilter(Filter): + def __init__(self, db_url, save_service, network_mode): + super().__init__(db_url, save_service) + self.network_mode = network_mode + if network_mode == Constants.GRN_NETWORK_MODE: + self.missing_filepath = Constants.MISSING_GRN_GENE_DATA_FILEPATH + self.update_filepath = Constants.UPDATE_GRN_GENE_DATA_FILEPATH + self.database_namespace = Constants.GRN_DATABASE_NAMESPACE + elif network_mode == Constants.PPI_NETWORK_MODE: + self.missing_filepath = Constants.MISSING_PPI_GENE_DATA_FILEPATH + self.update_filepath = Constants.UPDATE_PPI_GENE_DATA_FILEPATH + self.database_namespace = Constants.PPI_DATABASE_NAMESPACE + else: + raise ValueError("Unknown network type specified.") + + def get_all_db_data(self): + """ + Fetch all gene data from the database. 
+ """ + if self.network_mode == Constants.GRN_NETWORK_MODE: + columns = ["gene_id", "display_gene_id", "regulator"] + elif self.network_mode == Constants.PPI_NETWORK_MODE: + + columns = ["gene_id", "display_gene_id"] + else: + raise ValueError("Unknown network type specified.") + + return super().get_all_db_data(self.database_namespace, "gene", columns) + + def filter_data(self): + """ + Filter gene data that is missing or needs to be updated in the database. + """ + + if self.network_mode == Constants.GRN_NETWORK_MODE: + update_columns = ["display_gene_id", "regulator"] + elif self.network_mode == Constants.PPI_NETWORK_MODE: + update_columns = ["display_gene_id"] + else: + raise ValueError("Unknown network type specified.") + + key_columns = ["gene_id"] + + db_data = self.get_all_db_data() + + return super().filter_data(Constants.GENE_DATA_FILEPATH, db_data, key_columns, update_columns) diff --git a/database2/network-database/database_services/populator.py b/database2/network-database/database_services/populator.py new file mode 100644 index 00000000..259f9a31 --- /dev/null +++ b/database2/network-database/database_services/populator.py @@ -0,0 +1,124 @@ +import psycopg2 +from abc import ABC, abstractmethod +from constants import Constants +from io import StringIO + +class DataPopulator(ABC): + + def __init__(self, db_url): + self.db_url = db_url + self.filepath = None + self.network_mode = None + + @abstractmethod + def get_copy_statement(self): + """ + This method should return the COPY SQL statement for the specific type of data. + """ + pass + + def determine_database_namespace(self, network_mode): + if network_mode == Constants.GRN_NETWORK_MODE: + return Constants.GRN_DATABASE_NAMESPACE + elif network_mode == Constants.PPI_NETWORK_MODE: + return Constants.PPI_DATABASE_NAMESPACE + else: + raise ValueError(f"Unknown network type: {network_mode}") + + def process_file(self, conn, cursor, data_filepath, copy_statement): + """ + A helper function that processes the input file and performs the COPY command to load data into the database. + If the network is PPI, it drops the last column from the input data. 
+ """ + + # Determine if we need to drop the last column (PPI network type) + if self.network_mode == Constants.PPI_NETWORK_MODE and data_filepath == Constants.MISSING_PPI_GENE_DATA_FILEPATH: + print("Dropping the regulator column from the input data...") + processed_rows = [] + + with open(data_filepath, 'r') as f: + for line in f: + columns = line.strip().split('\t') + processed_row = columns[:4] + columns[5:] + processed_rows.append('\t'.join(processed_row)) + + from io import StringIO + temp_file = StringIO("\n".join(processed_rows)) + + # Execute the COPY command using the processed data (without the last column) + cursor.copy_expert(sql=copy_statement, file=temp_file) + conn.commit() + + else: + with open(data_filepath, 'r') as f: + cursor.copy_expert(sql=copy_statement, file=f) + conn.commit() + + print(f"Data from {data_filepath} has been successfully populated.") + print("===============================================") + + def populate_data(self): + conn = psycopg2.connect(self.db_url) + cursor = conn.cursor() + + copy_statement = self.get_copy_statement() + + self.process_file(conn, cursor, self.filepath, copy_statement) + + cursor.close() + conn.close() + +class GeneDataPopulator(DataPopulator): + def __init__(self, db_url, network_mode): + super().__init__(db_url) + self.network_mode = network_mode + if network_mode == Constants.GRN_NETWORK_MODE: + self.database_namespace = Constants.GRN_DATABASE_NAMESPACE + self.filepath = Constants.MISSING_GRN_GENE_DATA_FILEPATH + elif network_mode == Constants.PPI_NETWORK_MODE: + self.database_namespace = Constants.PPI_DATABASE_NAMESPACE + self.filepath = Constants.MISSING_PPI_GENE_DATA_FILEPATH + else: + raise ValueError(f"Unknown network type: {network_mode}") + + def get_copy_statement(self): + if self.network_mode == Constants.GRN_NETWORK_MODE: + return f"COPY {self.database_namespace}.gene (gene_id, display_gene_id, species, taxon_id, regulator, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;" + elif self.network_mode == Constants.PPI_NETWORK_MODE: + return f"COPY {self.database_namespace}.gene (gene_id, display_gene_id, species, taxon_id, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;" + else: + raise ValueError(f"Unknown network type: {self.network_mode}") + +class ProteinDataPopulator(DataPopulator): + def __init__(self, db_url): + super().__init__(db_url) + self.filepath = Constants.MISSING_PROTEIN_DATA_FILEPATH + + def get_copy_statement(self): + return f"COPY {Constants.PPI_DATABASE_NAMESPACE}.protein (standard_name, gene_systematic_name, length, molecular_weight, PI, taxon_id, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;" + +class GeneRegulatoryNetworkDataPopulator(DataPopulator): + def __init__(self, db_url): + super().__init__(db_url) + self.filepath = Constants.GENE_REGULATORY_NETWORK_DATA_FILEPATH + + def get_copy_statement(self): + return f"COPY {Constants.GRN_DATABASE_NAMESPACE}.network (regulator_gene_id, target_gene_id, taxon_id, annotation_type, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;" + +class ProteinProteinInteractionsDataPopulator(DataPopulator): + def __init__(self, db_url): + super().__init__(db_url) + self.filepath = Constants.PROTEIN_PROTEIN_INTERACTIONS_DATA_FILEPATH + + def get_copy_statement(self): + return f"COPY {Constants.PPI_DATABASE_NAMESPACE}.physical_interactions (protein1, protein2, interaction_detection_methods_identifier, annotation_type, experiment_name, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;" 
+ +class SourceDataPopulator(DataPopulator): + def __init__(self, db_url, network_mode): + super().__init__(db_url) + self.network_mode = network_mode + self.database_namespace = self.determine_database_namespace(network_mode) + self.filepath = Constants.SOURCE_DATA_FILEPATH + + def get_copy_statement(self): + return f"COPY {self.database_namespace}.source (time_stamp, source, display_name) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;" \ No newline at end of file diff --git a/database2/network-database/database_services/updater.py b/database2/network-database/database_services/updater.py new file mode 100644 index 00000000..c4333aea --- /dev/null +++ b/database2/network-database/database_services/updater.py @@ -0,0 +1,157 @@ +import psycopg2 +from abc import ABC, abstractmethod +import csv +from constants import Constants + +class Updater(ABC): + def __init__(self, db_url): + self.db_url = db_url + self.filepath = None + + @abstractmethod + def process_each_row(self, row): + """ + Process each row of data from the file. + """ + pass + + def update_data(self): + print(f"Updating data from {self.filepath}...") + conn, cursor = self._connect_to_db() + + rows = self._process_file() + + # SQL Update query for protein data + for row in rows: + update_query, params = self.process_each_row(row) + + self._execute_update(cursor, update_query, params) + + self._commit_and_close(conn, cursor) + + print("Data update complete!") + print("====================================================================") + + def _process_file(self): + """ + Helper function to process the file, which will be used by subclasses to process data rows. + """ + with open(self.filepath, 'r') as file: + reader = csv.reader(file, delimiter='\t') + next(reader) + return list(reader) + + def _execute_update(self, cursor, update_query, params): + """ + Executes the update query with provided parameters. + """ + try: + cursor.execute(update_query, params) + except Exception as e: + print(f"Error executing query: {e}") + cursor.connection.rollback() + else: + print(f"Update successful!") + + def _connect_to_db(self): + """ + Establish connection to the database and return cursor. + """ + try: + conn = psycopg2.connect(self.db_url) + cursor = conn.cursor() + return conn, cursor + except Exception as e: + print(f"Error connecting to the database: {e}") + raise + + def _commit_and_close(self, conn, cursor): + """ + Commit the transaction and close the database connection. 
+ """ + conn.commit() + cursor.close() + conn.close() + +class GeneUpdater(Updater): + def __init__(self, db_url, network_mode): + super().__init__(db_url) + self.network_mode = network_mode + self.filepath = Constants.UPDATE_PPI_GENE_DATA_FILEPATH if network_mode == Constants.PPI_NETWORK_MODE else Constants.UPDATE_GRN_GENE_DATA_FILEPATH + + def process_each_row(self, row): + gene_id = row[0] + display_gene_id = row[1] + + # Construct query based on network type (GRN vs PPI) + if self.network_mode == Constants.GRN_NETWORK_MODE: + regulator = row[2] + update_query = """ + UPDATE "{}".gene + SET display_gene_id = %s, regulator = %s + WHERE gene_id = %s; + """.format(Constants.GRN_DATABASE_NAMESPACE) # Directly format the schema name here + params = (display_gene_id, regulator, gene_id) + elif self.network_mode == Constants.PPI_NETWORK_MODE: + update_query = """ + UPDATE "{}".gene + SET display_gene_id = %s + WHERE gene_id = %s; + """.format(Constants.PPI_DATABASE_NAMESPACE) + params = (display_gene_id, gene_id) + else: + raise ValueError(f"Unknown network type '{self.network_mode}' specified. Expected 'grn' or 'ppi'.") + + return update_query, params + + +class ProteinUpdater(Updater): + def __init__(self, db_url): + super().__init__(db_url) + self.filepath = Constants.UPDATE_PROTEIN_DATA_FILEPATH + + def process_each_row(self, row): + gene_systematic_name = row[0] + standard_name = row[1] + length = row[2] if row[2] != "None" else 0 + molecular_weight = row[3] + pi = row[4] if row[4] != "None" else 0 + + update_query = """ + UPDATE {}.protein + SET standard_name = %s, length = %s, molecular_weight = %s, pi = %s + WHERE gene_systematic_name = %s; + """.format(Constants.PPI_DATABASE_NAMESPACE) + params = (standard_name, length, molecular_weight, pi, gene_systematic_name) + + return update_query, params + +class ProteinProteinInteractionsUpdater(Updater): + def __init__(self, db_url): + super().__init__(db_url) + self.filepath = Constants.UPDATE_PROTEIN_NAME_DATA_FILEPATH + + def process_each_row(self, row): + old_standard_name = row[0] + new_standard_name = row[1] + + # Use SQL CASE statement to update either protein1 or protein2 + update_query = """ + UPDATE {}.physical_interactions + SET + protein1 = CASE + WHEN protein1 = %s THEN %s + ELSE protein1 + END, + protein2 = CASE + WHEN protein2 = %s THEN %s + ELSE protein2 + END + WHERE protein1 = %s OR protein2 = %s; + """.format(Constants.PPI_DATABASE_NAMESPACE) + + # Parameters for the query + params = (old_standard_name, new_standard_name, old_standard_name, new_standard_name, old_standard_name, old_standard_name) + + return update_query, params + diff --git a/database2/network-database/main.py b/database2/network-database/main.py new file mode 100644 index 00000000..d89d00c8 --- /dev/null +++ b/database2/network-database/main.py @@ -0,0 +1,77 @@ +from constants import Constants +from data_services.data_generator import * +from data_services.save_service import * +from database_services.filter import * +from database_services.updater import * +from database_services.populator import * +import argparse +from datetime import datetime, timezone, timedelta + +save_service = SaveToTSVService() + +def load_data(network_option): + print("Generating data.................................................") + time_stamp = datetime.now(timezone(timedelta(hours=-8))) + formatted_time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S%z") + if network_option in ['all', Constants.GRN_NETWORK_MODE]: + grnDataGenerator = 
GeneRegulatoryNetworkDataGenerator(GeneRegulatoryNetworkFetcherService(), GeneRegulatoryNetworkProcessor(formatted_time_stamp), save_service) + + if network_option in ['all', Constants.PPI_NETWORK_MODE]: + proteinDataGenerator = ProteinDataGenerator(ProteinFetcherService(), ProteinProcessor(formatted_time_stamp), save_service) + ProteinProteinInteractionsDataGenerator(ProteinProteinInteractionsFetcherService(), ProteinProteinInteractionsProcessor(formatted_time_stamp), save_service) + + if network_option == Constants.GRN_NETWORK_MODE: + GeneDataGenerator(GeneFetcherService(), GeneProcessor(formatted_time_stamp), save_service, grnDataGenerator.data) + else: + # grnDataGenerator only exists when GRN data was generated above, so fall back to None for a PPI-only run + GeneDataGenerator(GeneFetcherService(), GeneProcessor(formatted_time_stamp), save_service, grnDataGenerator.data if network_option == 'all' else None, proteinDataGenerator.data) + + SourceDataGenerator(SourceProcessor(formatted_time_stamp), save_service) + +def filter_data(network_option, db_url): + print("Filtering data.................................................") + if network_option in ['all', Constants.GRN_NETWORK_MODE]: + GeneFilter(db_url, save_service, network_mode="grn").filter_data() + + if network_option in ['all', Constants.PPI_NETWORK_MODE]: + GeneFilter(db_url, save_service, network_mode="ppi").filter_data() + ProteinFilter(db_url, save_service).filter_data() + +def add_data_to_database(network_option, db_url): + print("Adding data to database.................................................") + if network_option in ['all', Constants.GRN_NETWORK_MODE]: + network_mode = Constants.GRN_NETWORK_MODE + SourceDataPopulator(db_url, network_mode).populate_data() + GeneDataPopulator(db_url, network_mode).populate_data() + GeneUpdater(db_url, network_mode).update_data() + GeneRegulatoryNetworkDataPopulator(db_url).populate_data() + + if network_option in ['all', Constants.PPI_NETWORK_MODE]: + network_mode = Constants.PPI_NETWORK_MODE + SourceDataPopulator(db_url, network_mode).populate_data() + + GeneDataPopulator(db_url, network_mode).populate_data() + GeneUpdater(db_url, network_mode).update_data() + + ProteinDataPopulator(db_url).populate_data() + ProteinProteinInteractionsUpdater(db_url).update_data() + ProteinUpdater(db_url).update_data() + + ProteinProteinInteractionsDataPopulator(db_url).populate_data() + +def main(network_option, db_url): + load_data(network_option) + filter_data(network_option, db_url) + add_data_to_database(network_option, db_url) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate data for different networks.") + parser.add_argument('--network', choices=[Constants.PPI_NETWORK_MODE, Constants.GRN_NETWORK_MODE, 'all'], required=True, + help=f"Specify the type of network data to generate. Options: '{Constants.PPI_NETWORK_MODE}', '{Constants.GRN_NETWORK_MODE}', 'all'") + parser.add_argument('--db_url', type=str, required=True, + help="PostgreSQL database URL, e.g., postgresql://localhost/postgres") + + args = parser.parse_args() + main(args.network, args.db_url) + + + \ No newline at end of file diff --git a/database2/schema/README.md b/database2/schema/README.md new file mode 100644 index 00000000..66f02261 --- /dev/null +++ b/database2/schema/README.md @@ -0,0 +1,97 @@ +# Schema Setup for GRNsight + +This directory contains all the necessary schemas for the databases required by GRNsight. + +## Load Database + +For Mac: + +``` +psql
+``` + +For Windows: + +``` +psql -U postgres
+``` + +When prompted for the password, use the password you specified earlier during the installation process. For all future commands requiring you to access postgres, you will need to add `-U postgres`. + +For example, to access your local PostgreSQL database, use the following command: + +``` +psql postgresql://localhost/postgres +``` + +## Creating Schemas and Adding Table Specifications + +GRNsight requires four schemas, one for each of the following namespaces: + +1. `grnsettings` +2. `gene_expression` +3. `gene_regulatory_network` +4. `protein_protein_interactions` + +Each SQL script already contains the command to create its schema for you. Each schema requires a set of table definitions. You can add these by running the following commands, each corresponding to an SQL file that defines the structure for each schema: + +``` +cd +``` + +``` +psql -f expression_schema.sql postgresql://localhost/postgres +``` + +``` +psql -f gene_regulatory_network_schema.sql postgresql://localhost/postgres +``` + +``` +psql -f protein_protein_interactions_schema.sql postgresql://localhost/postgres +``` + +``` +psql -f grnsettings_schema.sql postgresql://localhost/postgres +``` + +Once these steps are completed, your database will be set up and ready to accept expression and network data. + +## Populating Data into the Database + +### 1. Settings Database + +The `grnsettings` table in the `settings` schema stores the default database name. + +To change the default database name, follow these steps: + +1. **Log in to the Database** + + For instructions on how to load the database, refer to the [Load Database](#load-database) section. + +2. **Set the Search Path** + + Set your search path to the `settings` schema with the following command: + + ``` + SET SEARCH_PATH TO settings; + ``` + +3. **Delete the Current Default Database Name** + + Delete the existing database name with this command: + + ``` + DELETE FROM grnsettings; + ``` + +4. **Insert the New Default Database Name** + Insert the new default database name with the following command: + ``` + INSERT INTO grnsettings(expression_dataset) VALUES (''); + ``` + _Note: The current default database is `dahlquist_2018`. Don't forget the single quotes ('')!_ + +(A scripted version of these steps is sketched at the end of this README.) + +### 2. Other databases + +For other databases, continue following the instructions in the [README.md](https://github.com/dondi/GRNsight/tree/master/database) outside of this directory. 
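The settings steps above can also be scripted. Below is a minimal sketch (an editor's illustration, not part of the patch) using psycopg2, which the network scripts in this change already depend on; it assumes a local `postgresql://localhost/postgres` database and reuses the default dataset name mentioned in the note above.

```python
# Minimal sketch: set the default expression dataset programmatically.
# Mirrors the psql steps above; assumes a local PostgreSQL instance.
import psycopg2

conn = psycopg2.connect("postgresql://localhost/postgres")
cur = conn.cursor()
cur.execute("SET SEARCH_PATH TO settings;")
cur.execute("DELETE FROM grnsettings;")
cur.execute(
    "INSERT INTO grnsettings(expression_dataset) VALUES (%s);",
    ("dahlquist_2018",),  # replace with your default dataset name
)
conn.commit()
cur.close()
conn.close()
```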
diff --git a/database2/schema/expression_schema.sql b/database2/schema/expression_schema.sql new file mode 100755 index 00000000..f977f24a --- /dev/null +++ b/database2/schema/expression_schema.sql @@ -0,0 +1,73 @@ +CREATE SCHEMA gene_expression; + +CREATE TABLE gene_expression.ref ( + pubmed_id VARCHAR, + authors VARCHAR, + publication_year VARCHAR, + title VARCHAR, + doi VARCHAR, + ncbi_geo_id VARCHAR, + PRIMARY KEY(ncbi_geo_id, pubmed_id) +); + +CREATE TABLE gene_expression.gene ( + gene_id VARCHAR, -- systematic like name + display_gene_id VARCHAR, -- standard like name + species VARCHAR, + taxon_id VARCHAR, + PRIMARY KEY(gene_id, taxon_id) +); + +CREATE TABLE gene_expression.expression_metadata ( + ncbi_geo_id VARCHAR, + pubmed_id VARCHAR, + FOREIGN KEY (ncbi_geo_id, pubmed_id) REFERENCES gene_expression.ref(ncbi_geo_id, pubmed_id), + control_yeast_strain VARCHAR, + treatment_yeast_strain VARCHAR, + control VARCHAR, + treatment VARCHAR, + concentration_value FLOAT, + concentration_unit VARCHAR, + time_value FLOAT, + time_unit VARCHAR, + number_of_replicates INT, + expression_table VARCHAR, + display_expression_table VARCHAR, + PRIMARY KEY(ncbi_geo_id, pubmed_id, time_value) +); +CREATE TABLE gene_expression.expression ( + gene_id VARCHAR, + taxon_id VARCHAR, + FOREIGN KEY (gene_id, taxon_id) REFERENCES gene_expression.gene(gene_id, taxon_id), + -- ncbi_geo_id VARCHAR, + -- pubmed_id VARCHAR, + sort_index INT, + sample_id VARCHAR, + expression FLOAT, + time_point FLOAT, + dataset VARCHAR, + PRIMARY KEY(gene_id, sample_id) + -- FOREIGN KEY (ncbi_geo_id, pubmed_id, time_point) REFERENCES gene_expression.expression_metadata(ncbi_geo_id, pubmed_id, time_value) +); +CREATE TABLE gene_expression.degradation_rate ( + gene_id VARCHAR, + taxon_id VARCHAR, + FOREIGN KEY (gene_id, taxon_id) REFERENCES gene_expression.gene(gene_id, taxon_id), + ncbi_geo_id VARCHAR, + pubmed_id VARCHAR, + FOREIGN KEY (ncbi_geo_id, pubmed_id) REFERENCES gene_expression.ref(ncbi_geo_id, pubmed_id), + PRIMARY KEY(gene_id, ncbi_geo_id, pubmed_id), + degradation_rate FLOAT +); + +CREATE TABLE gene_expression.production_rate ( + gene_id VARCHAR, + taxon_id VARCHAR, + FOREIGN KEY (gene_id, taxon_id) REFERENCES gene_expression.gene(gene_id, taxon_id), + ncbi_geo_id VARCHAR, + pubmed_id VARCHAR, + FOREIGN KEY (ncbi_geo_id, pubmed_id) REFERENCES gene_expression.ref(ncbi_geo_id, pubmed_id), + PRIMARY KEY(gene_id, ncbi_geo_id, pubmed_id), + production_rate FLOAT + -- FOREIGN KEY (gene_id, ncbi_geo_id, pubmed_id) REFERENCES gene_expression.degradation_rate(gene_id, ncbi_geo_id, pubmed_id) -- not sure if we want to link the generated production rate to it's original degradation rate +); \ No newline at end of file diff --git a/database2/schema/gene_regulatory_network_schema.sql b/database2/schema/gene_regulatory_network_schema.sql new file mode 100644 index 00000000..26cd5719 --- /dev/null +++ b/database2/schema/gene_regulatory_network_schema.sql @@ -0,0 +1,33 @@ +CREATE SCHEMA gene_regulatory_network_new; + +CREATE TABLE gene_regulatory_network_new.source ( + time_stamp TIMESTAMP WITH TIME ZONE, + source VARCHAR, + display_name VARCHAR, + PRIMARY KEY(time_stamp, source) +); + +CREATE TABLE gene_regulatory_network_new.gene ( + gene_id VARCHAR, -- systematic like name + display_gene_id VARCHAR, -- standard like name + species VARCHAR, + taxon_id VARCHAR, + regulator BOOLEAN, + time_stamp TIMESTAMP WITH TIME ZONE, + source VARCHAR, + PRIMARY KEY(gene_id, taxon_id), + FOREIGN KEY (time_stamp, source) REFERENCES 
gene_regulatory_network_new.source(time_stamp, source) +); + +CREATE TABLE gene_regulatory_network_new.network ( + regulator_gene_id VARCHAR, + target_gene_id VARCHAR, + taxon_id VARCHAR, + annotation_type VARCHAR, + time_stamp TIMESTAMP WITH TIME ZONE, + source VARCHAR, + FOREIGN KEY (regulator_gene_id, taxon_id) REFERENCES gene_regulatory_network_new.gene(gene_id, taxon_id), + FOREIGN KEY (target_gene_id, taxon_id) REFERENCES gene_regulatory_network_new.gene(gene_id, taxon_id), + FOREIGN KEY (time_stamp, source) REFERENCES gene_regulatory_network_new.source(time_stamp, source), + CONSTRAINT unique_network UNIQUE (regulator_gene_id, target_gene_id, taxon_id, time_stamp, source, annotation_type) +); \ No newline at end of file diff --git a/database2/schema/grnsettings_schema.sql b/database2/schema/grnsettings_schema.sql new file mode 100644 index 00000000..32f3e772 --- /dev/null +++ b/database2/schema/grnsettings_schema.sql @@ -0,0 +1,5 @@ +CREATE SCHEMA settings; + +CREATE TABLE settings.grnsettings ( + expression_dataset VARCHAR PRIMARY KEY +); diff --git a/database2/schema/protein_protein_interactions_schema.sql b/database2/schema/protein_protein_interactions_schema.sql new file mode 100644 index 00000000..cc8207a1 --- /dev/null +++ b/database2/schema/protein_protein_interactions_schema.sql @@ -0,0 +1,48 @@ +CREATE SCHEMA protein_protein_interactions_new; + +CREATE TABLE protein_protein_interactions_new.source ( + time_stamp TIMESTAMP WITH TIME ZONE, + source VARCHAR, + display_name VARCHAR, + PRIMARY KEY(time_stamp, source) +); + +CREATE TABLE protein_protein_interactions_new.gene ( + gene_id VARCHAR, -- systematic like name + display_gene_id VARCHAR, -- standard like name + species VARCHAR, + taxon_id VARCHAR, + time_stamp TIMESTAMP WITH TIME ZONE, + source VARCHAR, + PRIMARY KEY(gene_id, taxon_id), + FOREIGN KEY (time_stamp, source) REFERENCES protein_protein_interactions_new.source(time_stamp, source) +); + +CREATE TABLE protein_protein_interactions_new.protein ( + standard_name VARCHAR PRIMARY KEY, + gene_systematic_name VARCHAR, + length FLOAT, + molecular_weight FLOAT, + PI FLOAT, + taxon_id VARCHAR, + time_stamp TIMESTAMP WITH TIME ZONE, + source VARCHAR, + FOREIGN KEY (gene_systematic_name, taxon_id) REFERENCES protein_protein_interactions_new.gene(gene_id, taxon_id), + FOREIGN KEY (time_stamp, source) REFERENCES protein_protein_interactions_new.source(time_stamp, source) +); + + CREATE TABLE protein_protein_interactions_new.physical_interactions ( + protein1 VARCHAR, + protein2 VARCHAR, + gene_systematic_name1 VARCHAR, + gene_systematic_name2 VARCHAR, + interaction_detection_methods_identifier VARCHAR, + annotation_type VARCHAR, + experiment_name VARCHAR, + time_stamp TIMESTAMP WITH TIME ZONE, + source VARCHAR, + FOREIGN KEY (protein1) REFERENCES protein_protein_interactions_new.protein(standard_name), + FOREIGN KEY (protein2) REFERENCES protein_protein_interactions_new.protein(standard_name), + FOREIGN KEY (time_stamp, source) REFERENCES protein_protein_interactions_new.source(time_stamp, source), + CONSTRAINT unique_physical_interaction UNIQUE (protein1, protein2, interaction_detection_methods_identifier, annotation_type, experiment_name, time_stamp, source) + );
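A quick way to confirm that the schema files above ran correctly is to list the tables each one created. The snippet below is an editor's sketch rather than part of the change set; it assumes a local `postgresql://localhost/postgres` database and queries the standard `information_schema` catalog.

```python
# Sketch: verify that the GRNsight schemas defined above exist and list their tables.
import psycopg2

EXPECTED_SCHEMAS = [
    "gene_expression",
    "gene_regulatory_network_new",
    "protein_protein_interactions_new",
    "settings",
]

conn = psycopg2.connect("postgresql://localhost/postgres")
cur = conn.cursor()
for schema in EXPECTED_SCHEMAS:
    cur.execute(
        "SELECT table_name FROM information_schema.tables WHERE table_schema = %s ORDER BY table_name;",
        (schema,),
    )
    tables = [row[0] for row in cur.fetchall()]
    print(f"{schema}: {', '.join(tables) or '(missing)'}")
cur.close()
conn.close()
```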