Skip to content

Commit 8cd0187

Browse files
committed
recount3 sub-package updated
API-breaking changes: * The output data frames produced by the 'dgd_get_recount3_data' executable now contain both gene expression data and metadata, unless otherwise filtered (see below). Other changes: * Now the 'experiment_attributes' column, if present in the metadata columns of an SRA study, will be split into its constituent components when writing the output data frames for the 'dgd_get_recount3_data' executable (as is already the case with the 'sample_attributes' column). * The user can now pass a YAML file to 'dgd_get_recount3_data' to download data from the Recount3 platform in bulk and filter them. * The user can now pass 'metadata_to_keep' and 'metadata_to_drop' lists of metadata columns in the input file to 'dgd_get_recount3_data' to keep or drop specific metadata columns in the output data frames. These can be passed either as columns, if the input file is a CSV file, or as specific keywords, if the input file is a YAML file. * The 'recount3.util.get_metadata' function now returns the metadata data frame with the 'recount3_project_name' and 'recount3_samples_category' columns added. * The 'model_untrained.yaml' configuration file was added to the examples of configuration files available within the package. Internal changes (for contributors): * Two new internal functions in the 'bulkDGD.recount3.util' module ('_load_samples_batches_csv' and 'load_samples_batches_yaml') were introduced to parse the input files to 'dgd_get_recount3_data'. The public function 'load_samples_batches' simply calls one of them depending on the file's extension. * The 'bulkDGD.util.get_handlers' function now accepts two new arguments 'log_console_level' and 'log_file_level' instead of the old 'log_level' to have a more fine-grained control over the log level of the handlers. 
* The log level of the console handler for the '_dgd_get_recount3_data_single_batch' executable was changed to ERROR so as not to clutter the console too much with all the INFO messages from the subprocesses (which get logged to their own log files anyway if the overall log level is INFO or below). * The header of the 'bulkDGD/recount3/data/sra_metadata_fields.txt' file was changed to better describe the metadata fields included in it. Documentation: * The documentation was updated to reflect the user-facing changes. * The readme files for the configurations were removed because of the redundancy with the content of the documentation and of the configuration files themselves. * The 'model_config_options.rst' file was removed from the documentation because it was empty and not referenced anywhere.
1 parent 49848d4 commit 8cd0187

15 files changed

+752
-512
lines changed

bulkDGD/execs/_dgd_get_recount3_data_single_batch.py

Lines changed: 149 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,6 @@ def main():
6565
"The name of the Recount3 project for which samples will " \
6666
f"be retrieved. The available projects are: {ip_choices_str}."
6767
parser.add_argument("-ip", "--input-project-name",
68-
type = str,
6968
required = True,
7069
choices = ip_choices,
7170
help = ip_help)
@@ -80,7 +79,6 @@ def main():
8079
"associated with." \
8180
"For SRA data, this is the code associated with the project."
8281
parser.add_argument("-is", "--input-samples-category",
83-
type = str,
8482
required = True,
8583
help = is_help)
8684

@@ -92,7 +90,6 @@ def main():
9290
"written in the working directory. The default file name is " \
9391
"'{input_project_name}_{input_samples_category}.csv'."
9492
parser.add_argument("-o", "--output-csv",
95-
type = str,
9693
default = None,
9794
help = o_help)
9895

@@ -102,7 +99,6 @@ def main():
10299
"The working directory. The default is the current " \
103100
"working directory."
104101
parser.add_argument("-d", "--work-dir",
105-
type = str,
106102
default = os.getcwd(),
107103
help = d_help)
108104

@@ -139,19 +135,44 @@ def main():
139135
"accepts a plain text file containing the string " \
140136
"since it can be long for complex queries."
141137
parser.add_argument("-qs", "--query-string",
142-
type = str,
143138
default = None,
144139
help = qs_help)
145140

146141
#-----------------------------------------------------------------#
147142

143+
mk_help = \
144+
"A vertical line (|)-separated list of names of metadata " \
145+
"columns to keep in the final data frame. All the other " \
146+
"metadata columns will be dropped from the data frame. If " \
147+
"neither this option nor the '-md', '--metadata-to-drop' " \
148+
"is passed, all metadata columns are kept in the final data " \
149+
"frame."
150+
parser.add_argument("-mk", "--metadata-to-keep",
151+
default = None,
152+
help = mk_help)
153+
154+
#-----------------------------------------------------------------#
155+
156+
md_help = \
157+
"A vertical line (|)-separated list of names of metadata " \
158+
"columns to drop in the final data frame. All the other " \
159+
"metadata columns will be kept in the final data frame. Use " \
160+
"the '_all_' reserved keyword to drop all metadata columns " \
161+
"from the data frame. If neither this option nor the '-mk', " \
162+
"'--metadata-to-keep' option is passed, all metadata " \
163+
"columns are kept in the final data frame."
164+
parser.add_argument("-md", "--metadata-to-drop",
165+
default = None,
166+
help = md_help)
167+
168+
#-----------------------------------------------------------------#
169+
148170
lf_default = "_dgd_get_recount3_data_single_batch.log"
149171
lf_help = \
150172
"The name of the log file. The file will be written " \
151173
"in the working directory. The default file name is " \
152174
f"'{lf_default}'."
153175
parser.add_argument("-lf", "--log-file",
154-
type = str,
155176
default = lf_default,
156177
help = lf_help)
157178

@@ -187,6 +208,8 @@ def main():
187208
output_csv = args.output_csv
188209
wd = os.path.abspath(args.work_dir)
189210
query_string = args.query_string
211+
metadata_to_keep = args.metadata_to_keep
212+
metadata_to_drop = args.metadata_to_drop
190213
save_gene_sums = args.save_gene_sums
191214
save_metadata = args.save_metadata
192215
log_file = args.log_file
@@ -197,7 +220,7 @@ def main():
197220
#-----------------------------------------------------------------#
198221

199222
# Get the module's logger.
200-
logger = log.getLogger(__name__)
223+
logger = log.getLogger("dgd_get_recount3_data")
201224

202225
# Set WARNING logging level by default.
203226
log_level = log.WARNING
@@ -219,9 +242,11 @@ def main():
219242
handlers = \
220243
util.get_handlers(\
221244
log_console = log_console,
245+
log_console_level = log.ERROR,
222246
log_file_class = log.FileHandler,
223-
log_file_options = {"filename" : log_file},
224-
log_level = log_level)
247+
log_file_options = {"filename" : log_file,
248+
"mode" : "w"},
249+
log_file_level = log_level)
225250

226251
# Set the logging configuration.
227252
log.basicConfig(level = log_level,
@@ -247,10 +272,8 @@ def main():
247272

248273
# Log it and exit.
249274
errstr = \
250-
"It was not possible to get the RNA-seq data for " \
251-
f"project '{input_project_name}', samples " \
252-
f"'{input_samples_category}' from Recount3. " \
253-
f"Error: {e}"
275+
"It was not possible to get the RNA-seq data from " \
276+
f"Recount3. Error: {e}"
254277
logger.exception(errstr)
255278
sys.exit(errstr)
256279

@@ -271,14 +294,32 @@ def main():
271294

272295
# Log it and exit.
273296
errstr = \
274-
"It was not possible to get the metadata for " \
275-
f"project '{input_project_name}', samples " \
276-
f"'{input_samples_category}' from Recount3. " \
297+
"It was not possible to get the metadata from Recount3. " \
277298
f"Error: {e}"
278299
logger.exception(errstr)
279300
sys.exit(errstr)
280301

281302
#-----------------------------------------------------------------#
303+
304+
# Try to merge the RNA-seq data frame and the metadata data frame.
305+
try:
306+
307+
df_final = \
308+
recount3.merge_gene_sums_and_metadata(\
309+
df_gene_sums = df_gene_sums,
310+
df_metadata = df_metadata)
311+
312+
# If something went wrong
313+
except Exception as e:
314+
315+
# Log it and exit.
316+
errstr = \
317+
"It was not possible to combine the RNA-seq data " \
318+
f"with the metadata. Error: {e}"
319+
logger.exception(errstr)
320+
sys.exit(errstr)
321+
322+
#-----------------------------------------------------------------#
282323

283324
# If the user has passed a query string or a file containing the
284325
# query string
@@ -303,20 +344,12 @@ def main():
303344

304345
# Try to add the metadata to the RNA-seq data frame.
305346
try:
306-
307-
# Merge the RNA-seq data frame and the metadata data frame.
308-
df_merged = \
309-
recount3.merge_gene_sums_and_metadata(\
310-
df_gene_sums = df_gene_sums,
311-
df_metadata = df_metadata,
312-
project_name = input_project_name)
313347

314348
# Filter the samples.
315349
df_final = \
316350
recount3.filter_by_metadata(\
317-
df = df_merged,
318-
query_string = query_string,
319-
project_name = input_project_name)
351+
df = df_final,
352+
query_string = query_string)
320353

321354
# If something went wrong
322355
except Exception as e:
@@ -328,12 +361,95 @@ def main():
328361
logger.exception(errstr)
329362
sys.exit(errstr)
330363

331-
# Otherwise
332-
else:
364+
#-----------------------------------------------------------------#
365+
366+
# If the user passed a list of metadata columns to keep in the
367+
# final data frame
368+
if metadata_to_keep is not None:
333369

334-
# The final data frame will be the one containing the gene
335-
# expression data.
336-
df_final = df_gene_sums
370+
# Get the list of metadata columns to keep.
371+
metadata_to_keep = \
372+
[m.lstrip("'").rstrip("'") for m \
373+
in metadata_to_keep.rstrip().split("|")]
374+
375+
# Get the columns to keep in the final data frame.
376+
columns_to_keep = \
377+
[col for col in df_final.columns \
378+
if col.startswith("ENSG")] + \
379+
[col for col in df_final.columns \
380+
if not col.startswith("ENSG") \
381+
and col in metadata_to_keep]
382+
383+
# Try to keep only the selected columns
384+
try:
385+
386+
df_final = df_final.loc[:,columns_to_keep]
387+
388+
# If something went wrong
389+
except Exception as e:
390+
391+
# Get a string representing the metadata columns to keep.
392+
metadata_to_keep_str = \
393+
", ".join([f"'{m}'" for m in metadata_to_keep])
394+
395+
# Log it and exit.
396+
errstr = \
397+
"It was not possible to keep only the following " \
398+
"metadata columns in the final data frame: " \
399+
f"{metadata_to_keep_str}. Error: {e}"
400+
logger.exception(errstr)
401+
sys.exit(errstr)
402+
403+
#-----------------------------------------------------------------#
404+
405+
# If the user passed a list of metadata columns to drop in the
406+
# final data frame
407+
if metadata_to_drop is not None:
408+
409+
# If the user wants to drop all metadata columns
410+
if metadata_to_drop == "_all_":
411+
412+
# Get the columns to keep in the final data frame.
413+
columns_to_keep = \
414+
[col for col in df_final.columns \
415+
if col.startswith("ENSG")]
416+
417+
# Otherwise
418+
else:
419+
420+
# Get the list of metadata columns.
421+
metadata_to_drop = \
422+
[m.lstrip("'").rstrip("'") for m \
423+
in metadata_to_drop.rstrip().split("|")]
424+
425+
# Get the columns to keep in the final data frame.
426+
columns_to_keep = \
427+
[col for col in df_final.columns \
428+
if col.startswith("ENSG")] + \
429+
[col for col in df_final.columns \
430+
if not col.startswith("ENSG") \
431+
and col not in metadata_to_drop]
432+
433+
# Try to keep only the selected columns.
434+
try:
435+
436+
df_final = df_final.loc[:, columns_to_keep]
437+
438+
# If something went wrong
439+
except Exception as e:
440+
441+
# Get a string representing the metadata columns to
442+
# drop.
443+
metadata_to_drop_str = \
444+
", ".join([f"'{m}'" for m in metadata_to_drop])
445+
446+
# Log it and exit.
447+
errstr = \
448+
"It was not possible to drop the following " \
449+
"metadata columns from the final data frame: " \
450+
f"{metadata_to_drop_str}. Error: {e}"
451+
logger.exception(errstr)
452+
sys.exit(errstr)
337453

338454
#-----------------------------------------------------------------#
339455

@@ -364,7 +480,7 @@ def main():
364480

365481
# Log it and exit.
366482
errstr = \
367-
"It was not possible to save the RNA-seq data " \
368-
f"in '{output_csv_path}'. Error: {e}"
483+
"It was not possible to save the final data frame in " \
484+
f"'{output_csv_path}'. Error: {e}"
369485
logger.exception(errstr)
370486
sys.exit(errstr)

bulkDGD/execs/dgd_get_probability_density.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,10 +198,11 @@ def main():
198198
handlers = \
199199
util.get_handlers(\
200200
log_console = log_console,
201+
log_console_level = log_level,
201202
log_file_class = log.FileHandler,
202203
log_file_options = {"filename" : log_file,
203204
"mode" : "w"},
204-
log_level = log_level)
205+
log_file_level = log_level)
205206

206207
# Set the logging configuration.
207208
log.basicConfig(level = log_level,

0 commit comments

Comments
 (0)