@@ -278,43 +278,60 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
278278
279279
def map_proteomics(proteomics_data, improve_id_data, entrez_data):
    """Reshape a wide proteomics table to long form and attach Entrez / improve ids.

    Each argument may be either an already-loaded ``pandas.DataFrame`` or
    something ``pandas.read_csv`` can open. DataFrames passed in by the
    caller are never modified; all cleaning happens on copies.

    Args:
        proteomics_data: Wide table with one gene-symbol column and one
            column per sample. The gene column may be called
            ``gene_symbol`` or ``"Sample \\n Gene symbol"`` (matched
            whitespace-insensitively), and may also sit in the index.
            When read from a file, the second line is used as the header
            and the first column as the index.
        improve_id_data: Table with ``other_id`` (sample name) and
            ``improve_sample_id`` columns.
        entrez_data: Table with ``other_id`` (gene symbol) and
            ``entrez_id`` columns.

    Returns:
        DataFrame with columns ``entrez_id``, ``proteomics``,
        ``improve_sample_id``, ``source`` and ``study``. Rows whose gene or
        sample has no mapping are dropped; rows whose proteomics value is
        missing are kept.

    Raises:
        KeyError: If no gene-symbol column can be located in
            ``proteomics_data``.
    """
    if not isinstance(proteomics_data, pd.DataFrame):
        # header=1: the second line of the file holds the column names.
        proteomics_data = pd.read_csv(proteomics_data, header=1, index_col=0)
    if not isinstance(improve_id_data, pd.DataFrame):
        improve_id_data = pd.read_csv(improve_id_data)
    if not isinstance(entrez_data, pd.DataFrame):
        entrez_data = pd.read_csv(entrez_data)

    # Copy before touching the column labels so a caller-owned frame
    # keeps its original headers.
    proteomics_data = proteomics_data.copy()
    proteomics_data.columns = proteomics_data.columns.astype(str).str.strip()
    proteomics_data = proteomics_data.rename(
        columns={
            col: "gene_symbol"
            for col in proteomics_data.columns
            if _is_gene_symbol_header(col)
        }
    )

    if "gene_symbol" not in proteomics_data.columns:
        # index_col=0 (or the caller) may have moved the gene column into
        # the index; recover it from there when the index name matches.
        index_name = proteomics_data.index.name
        if index_name == "gene_symbol" or _is_gene_symbol_header(index_name):
            proteomics_data = proteomics_data.rename_axis("gene_symbol").reset_index()
        else:
            raise KeyError(
                "proteomics_data has no 'gene_symbol' column "
                "(also looked for 'Sample \\n Gene symbol' in columns and index)"
            )

    # Drop rows without a gene symbol, then normalise the rest to stripped strings.
    proteomics_data = proteomics_data.dropna(subset=["gene_symbol"])
    proteomics_data = proteomics_data.assign(
        gene_symbol=proteomics_data["gene_symbol"].astype(str).str.strip()
    )

    # One row per (gene, sample) pair.
    value_cols = [c for c in proteomics_data.columns if c != "gene_symbol"]
    long_prot_df = proteomics_data.melt(
        id_vars=["gene_symbol"],
        value_vars=value_cols,
        var_name="sample_name",
        value_name="proteomics",
    )

    # Build the two lookup tables on copies of just the needed columns,
    # with keys normalised the same way as the proteomics side.
    entrez_map = entrez_data[["other_id", "entrez_id"]].copy()
    entrez_map["other_id"] = entrez_map["other_id"].astype(str).str.strip()
    entrez_map = entrez_map.drop_duplicates()

    sample_map = improve_id_data[["other_id", "improve_sample_id"]].copy()
    sample_map["other_id"] = sample_map["other_id"].astype(str).str.strip()
    sample_map = sample_map.drop_duplicates()

    # gene symbol -> entrez id
    mapped_proteomics_df = pd.merge(
        long_prot_df,
        entrez_map,
        how="inner",
        left_on="gene_symbol",
        right_on="other_id",
    )
    # sample name -> improve sample id; both sides now carry "other_id",
    # so pandas suffixes them with _x / _y.
    mapped_proteomics_df = pd.merge(
        mapped_proteomics_df,
        sample_map,
        how="inner",
        left_on="sample_name",
        right_on="other_id",
    )

    mapped_proteomics_df = mapped_proteomics_df.drop(
        columns=["other_id_x", "other_id_y", "gene_symbol"]
    )
    mapped_proteomics_df["source"] = "Synapse"
    mapped_proteomics_df["study"] = "liver"
    mapped_proteomics_df = mapped_proteomics_df.dropna(
        subset=["entrez_id", "improve_sample_id"]
    )
    mapped_proteomics_df = mapped_proteomics_df.astype(
        {"entrez_id": "int", "improve_sample_id": "int"}
    )
    return mapped_proteomics_df[
        ["entrez_id", "proteomics", "improve_sample_id", "source", "study"]
    ]


def _is_gene_symbol_header(label):
    """Return True if *label* is the 'Sample / Gene symbol' header, ignoring whitespace."""
    return isinstance(label, str) and " ".join(label.split()) == "Sample Gene symbol"
318335
319336
320337if __name__ == "__main__" :
0 commit comments