@@ -65,7 +65,6 @@ def main():
6565 "The name of the Recount3 project for which samples will " \
6666 f"be retrieved. The available projects are: { ip_choices_str } ."
6767 parser .add_argument ("-ip" , "--input-project-name" ,
68- type = str ,
6968 required = True ,
7069 choices = ip_choices ,
7170 help = ip_help )
@@ -80,7 +79,6 @@ def main():
8079 "associated with." \
8180 "For SRA data, this is the code associated with the project."
8281 parser .add_argument ("-is" , "--input-samples-category" ,
83- type = str ,
8482 required = True ,
8583 help = is_help )
8684
@@ -92,7 +90,6 @@ def main():
9290 "written in the working directory. The default file name is " \
9391 "'{input_project_name}_{input_samples_category}.csv'."
9492 parser .add_argument ("-o" , "--output-csv" ,
95- type = str ,
9693 default = None ,
9794 help = o_help )
9895
@@ -102,7 +99,6 @@ def main():
10299 "The working directory. The default is the current " \
103100 "working directory."
104101 parser .add_argument ("-d" , "--work-dir" ,
105- type = str ,
106102 default = os .getcwd (),
107103 help = d_help )
108104
@@ -139,19 +135,44 @@ def main():
139135 "accepts a plain text file containing the string " \
140136 "since it can be long for complex queries."
141137 parser .add_argument ("-qs" , "--query-string" ,
142- type = str ,
143138 default = None ,
144139 help = qs_help )
145140
146141 #-----------------------------------------------------------------#
147142
143+ mk_help = \
144+ "A vertical line (|)-separated list of names of metadata " \
145+ "columns to keep in the final data frame. All the other " \
146+ "metadata columns will be dropped from the data frame. If " \
147+ "neither this option nor the '-md', '--metadata-to-drop' " \
148+ "is passed, all metadata columns are kept in the final data " \
149+ "frame."
150+ parser .add_argument ("-mk" , "--metadata-to-keep" ,
151+ default = None ,
152+ help = mk_help )
153+
154+ #-----------------------------------------------------------------#
155+
156+ md_help = \
157+ "A vertical line (|)-separated list of names of metadata " \
158+ "columns to drop in the final data frame. All the other " \
159+ "metadata columns will be kept in the final data frame. Use " \
160+ "the '_all_' reserved keyword to drop all metadata columns " \
161+ "from the data frame. If neither this option nor the '-mk', " \
162+ "'--metadata-to-keep' option is passed, all metadata " \
163+ "columns are kept in the final data frame."
164+ parser .add_argument ("-md" , "--metadata-to-drop" ,
165+ default = None ,
166+ help = md_help )
167+
168+ #-----------------------------------------------------------------#
169+
148170 lf_default = "_dgd_get_recount3_data_single_batch.log"
149171 lf_help = \
150172 "The name of the log file. The file will be written " \
151173 "in the working directory. The default file name is " \
152174 f"'{ lf_default } '."
153175 parser .add_argument ("-lf" , "--log-file" ,
154- type = str ,
155176 default = lf_default ,
156177 help = lf_help )
157178
@@ -187,6 +208,8 @@ def main():
187208 output_csv = args .output_csv
188209 wd = os .path .abspath (args .work_dir )
189210 query_string = args .query_string
211+ metadata_to_keep = args .metadata_to_keep
212+ metadata_to_drop = args .metadata_to_drop
190213 save_gene_sums = args .save_gene_sums
191214 save_metadata = args .save_metadata
192215 log_file = args .log_file
@@ -197,7 +220,7 @@ def main():
197220 #-----------------------------------------------------------------#
198221
199222 # Get the module's logger.
200- logger = log .getLogger (__name__ )
223+ logger = log .getLogger ("dgd_get_recount3_data" )
201224
202225 # Set WARNING logging level by default.
203226 log_level = log .WARNING
@@ -219,9 +242,11 @@ def main():
219242 handlers = \
220243 util .get_handlers (\
221244 log_console = log_console ,
245+ log_console_level = log .ERROR ,
222246 log_file_class = log .FileHandler ,
223- log_file_options = {"filename" : log_file },
224- log_level = log_level )
247+ log_file_options = {"filename" : log_file ,
248+ "mode" : "w" },
249+ log_file_level = log_level )
225250
226251 # Set the logging configuration.
227252 log .basicConfig (level = log_level ,
@@ -247,10 +272,8 @@ def main():
247272
248273 # Log it and exit.
249274 errstr = \
250- "It was not possible to get the RNA-seq data for " \
251- f"project '{ input_project_name } ', samples " \
252- f"'{ input_samples_category } ' from Recount3. " \
253- f"Error: { e } "
275+ "It was not possible to get the RNA-seq data from " \
276+ f"Recount3. Error: { e } "
254277 logger .exception (errstr )
255278 sys .exit (errstr )
256279
@@ -271,14 +294,32 @@ def main():
271294
272295 # Log it and exit.
273296 errstr = \
274- "It was not possible to get the metadata for " \
275- f"project '{ input_project_name } ', samples " \
276- f"'{ input_samples_category } ' from Recount3. " \
297+ "It was not possible to get the metadata from Recount3. " \
277298 f"Error: { e } "
278299 logger .exception (errstr )
279300 sys .exit (errstr )
280301
281302 #-----------------------------------------------------------------#
303+
304+ # Try to merge the RNA-seq data frame and the metadata data frame.
305+ try :
306+
307+ df_final = \
308+ recount3 .merge_gene_sums_and_metadata (\
309+ df_gene_sums = df_gene_sums ,
310+ df_metadata = df_metadata )
311+
312+ # If something went wrong
313+ except Exception as e :
314+
315+ # Log it and exit.
316+ errstr = \
317+ "It was not possible to combine the RNA-seq data " \
318+ f"with the metadata. Error: { e } "
319+ logger .exception (errstr )
320+ sys .exit (errstr )
321+
322+ #-----------------------------------------------------------------#
282323
283324 # If the user has passed a query string or a file containing the
284325 # query string
@@ -303,20 +344,12 @@ def main():
303344
304345 # Try to filter the samples using the query string.
305346 try :
306-
307- # Merge the RNA-seq data frame and the metadata data frame.
308- df_merged = \
309- recount3 .merge_gene_sums_and_metadata (\
310- df_gene_sums = df_gene_sums ,
311- df_metadata = df_metadata ,
312- project_name = input_project_name )
313347
314348 # Filter the samples.
315349 df_final = \
316350 recount3 .filter_by_metadata (\
317- df = df_merged ,
318- query_string = query_string ,
319- project_name = input_project_name )
351+ df = df_final ,
352+ query_string = query_string )
320353
321354 # If something went wrong
322355 except Exception as e :
@@ -328,12 +361,95 @@ def main():
328361 logger .exception (errstr )
329362 sys .exit (errstr )
330363
331- # Otherwise
332- else :
364+ #-----------------------------------------------------------------#
365+
366+ # If the user passed a list of metadata columns to keep in the
367+ # final data frame
368+ if metadata_to_keep is not None :
333369
334- # The final data frame will be the one containing the gene
335- # expression data.
336- df_final = df_gene_sums
370+ # Get the list of metadata columns to keep.
371+ metadata_to_keep = \
372+ [m .lstrip ("'" ).rstrip ("'" ) for m \
373+ in metadata_to_keep .rstrip ().split ("|" )]
374+
375+ # Get the columns to keep in the final data frame.
376+ columns_to_keep = \
377+ [col for col in df_final .columns \
378+ if col .startswith ("ENSG" )] + \
379+ [col for col in df_final .columns \
380+ if not col .startswith ("ENSG" ) \
381+ and col in metadata_to_keep ]
382+
383+ # Try to keep only the selected columns
384+ try :
385+
386+ df_final = df_final .loc [:,columns_to_keep ]
387+
388+ # If something went wrong
389+ except Exception as e :
390+
391+ # Get a string representing the metadata columns to keep.
392+ metadata_to_keep_str = \
393+ ", " .join ([f"'{ m } '" for m in metadata_to_keep ])
394+
395+ # Log it and exit.
396+ errstr = \
397+ "It was not possible to keep only the following " \
398+ "metadata columns in the final data frame: " \
399+ f"{ metadata_to_keep_str } . Error: { e } "
400+ logger .exception (errstr )
401+ sys .exit (errstr )
402+
403+ #-----------------------------------------------------------------#
404+
405+ # If the user passed a list of metadata columns to drop in the
406+ # final data frame
407+ if metadata_to_drop is not None :
408+
409+ # If the user wants to drop all metadata columns
410+ if metadata_to_drop == "_all_" :
411+
412+ # Get the columns to keep in the final data frame.
413+ columns_to_keep = \
414+ [col for col in df_final .columns \
415+ if col .startswith ("ENSG" )]
416+
417+ # Otherwise
418+ else :
419+
420+ # Get the list of metadata columns.
421+ metadata_to_drop = \
422+ [m .lstrip ("'" ).rstrip ("'" ) for m \
423+ in metadata_to_drop .rstrip ().split ("|" )]
424+
425+ # Get the columns to keep in the final data frame.
426+ columns_to_keep = \
427+ [col for col in df_final .columns \
428+ if col .startswith ("ENSG" )] + \
429+ [col for col in df_final .columns \
430+ if not col .startswith ("ENSG" ) \
431+ and col not in metadata_to_drop ]
432+
433+ # Try to keep only the selected columns.
434+ try :
435+
436+ df_final = df_final .loc [:, columns_to_keep ]
437+
438+ # If something went wrong
439+ except Exception as e :
440+
441+ # Get a string representing the metadata columns to
442+ # drop.
443+ metadata_to_drop_str = \
444+ ", " .join ([f"'{ m } '" for m in metadata_to_drop ])
445+
446+ # Log it and exit.
447+ errstr = \
448+ "It was not possible to drop the following " \
449+ "metadata columns from the final data frame: " \
450+ f"{ metadata_to_drop_str } . Error: { e } "
451+ logger .exception (errstr )
452+ sys .exit (errstr )
337453
338454 #-----------------------------------------------------------------#
339455
@@ -364,7 +480,7 @@ def main():
364480
365481 # Log it and exit.
366482 errstr = \
367- "It was not possible to save the RNA-seq data " \
368- f"in '{ output_csv_path } '. Error: { e } "
483+ "It was not possible to save the final data frame in " \
484+ f"'{ output_csv_path } '. Error: { e } "
369485 logger .exception (errstr )
370486 sys .exit (errstr )
0 commit comments