Skip to content

Commit 82289d2

Browse files
authored
Merge pull request #12 from Center-for-Health-Data-Science/fix11_experiment_attributes
recount3 sub-package updated
2 parents 49848d4 + 8cd0187 commit 82289d2

15 files changed

+752
-512
lines changed

bulkDGD/execs/_dgd_get_recount3_data_single_batch.py

Lines changed: 149 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,6 @@ def main():
6565
"The name of the Recount3 project for which samples will " \
6666
f"be retrieved. The available projects are: {ip_choices_str}."
6767
parser.add_argument("-ip", "--input-project-name",
68-
type = str,
6968
required = True,
7069
choices = ip_choices,
7170
help = ip_help)
@@ -80,7 +79,6 @@ def main():
8079
"associated with." \
8180
"For SRA data, this is the code associated with the project."
8281
parser.add_argument("-is", "--input-samples-category",
83-
type = str,
8482
required = True,
8583
help = is_help)
8684

@@ -92,7 +90,6 @@ def main():
9290
"written in the working directory. The default file name is " \
9391
"'{input_project_name}_{input_samples_category}.csv'."
9492
parser.add_argument("-o", "--output-csv",
95-
type = str,
9693
default = None,
9794
help = o_help)
9895

@@ -102,7 +99,6 @@ def main():
10299
"The working directory. The default is the current " \
103100
"working directory."
104101
parser.add_argument("-d", "--work-dir",
105-
type = str,
106102
default = os.getcwd(),
107103
help = d_help)
108104

@@ -139,19 +135,44 @@ def main():
139135
"accepts a plain text file containing the string " \
140136
"since it can be long for complex queries."
141137
parser.add_argument("-qs", "--query-string",
142-
type = str,
143138
default = None,
144139
help = qs_help)
145140

146141
#-----------------------------------------------------------------#
147142

143+
mk_help = \
144+
"A vertical line (|)-separated list of names of metadata " \
145+
"columns to keep in the final data frame. All the other " \
146+
"metadata columns will be dropped from the data frame. If " \
147+
"neither this option nor the '-md', '--metadata-to-drop' " \
148+
"is passed, all metadata columns are kept in the final data " \
149+
"frame."
150+
parser.add_argument("-mk", "--metadata-to-keep",
151+
default = None,
152+
help = mk_help)
153+
154+
#-----------------------------------------------------------------#
155+
156+
md_help = \
157+
"A vertical line (|)-separated list of names of metadata " \
158+
"columns to drop in the final data frame. All the other " \
159+
"metadata columns will be kept in the final data frame. Use " \
160+
"the '_all_' reserved keyword to drop all metadata columns " \
161+
"from the data frame. If neither this option nor the '-mk', " \
162+
"'--metadata-to-keep' option is passed, all metadata " \
163+
"columns are kept in the final data frame."
164+
parser.add_argument("-md", "--metadata-to-drop",
165+
default = None,
166+
help = md_help)
167+
168+
#-----------------------------------------------------------------#
169+
148170
lf_default = "_dgd_get_recount3_data_single_batch.log"
149171
lf_help = \
150172
"The name of the log file. The file will be written " \
151173
"in the working directory. The default file name is " \
152174
f"'{lf_default}'."
153175
parser.add_argument("-lf", "--log-file",
154-
type = str,
155176
default = lf_default,
156177
help = lf_help)
157178

@@ -187,6 +208,8 @@ def main():
187208
output_csv = args.output_csv
188209
wd = os.path.abspath(args.work_dir)
189210
query_string = args.query_string
211+
metadata_to_keep = args.metadata_to_keep
212+
metadata_to_drop = args.metadata_to_drop
190213
save_gene_sums = args.save_gene_sums
191214
save_metadata = args.save_metadata
192215
log_file = args.log_file
@@ -197,7 +220,7 @@ def main():
197220
#-----------------------------------------------------------------#
198221

199222
# Get the module's logger.
200-
logger = log.getLogger(__name__)
223+
logger = log.getLogger("dgd_get_recount3_data")
201224

202225
# Set WARNING logging level by default.
203226
log_level = log.WARNING
@@ -219,9 +242,11 @@ def main():
219242
handlers = \
220243
util.get_handlers(\
221244
log_console = log_console,
245+
log_console_level = log.ERROR,
222246
log_file_class = log.FileHandler,
223-
log_file_options = {"filename" : log_file},
224-
log_level = log_level)
247+
log_file_options = {"filename" : log_file,
248+
"mode" : "w"},
249+
log_file_level = log_level)
225250

226251
# Set the logging configuration.
227252
log.basicConfig(level = log_level,
@@ -247,10 +272,8 @@ def main():
247272

248273
# Log it and exit.
249274
errstr = \
250-
"It was not possible to get the RNA-seq data for " \
251-
f"project '{input_project_name}', samples " \
252-
f"'{input_samples_category}' from Recount3. " \
253-
f"Error: {e}"
275+
"It was not possible to get the RNA-seq data from " \
276+
f"Recount3. Error: {e}"
254277
logger.exception(errstr)
255278
sys.exit(errstr)
256279

@@ -271,14 +294,32 @@ def main():
271294

272295
# Log it and exit.
273296
errstr = \
274-
"It was not possible to get the metadata for " \
275-
f"project '{input_project_name}', samples " \
276-
f"'{input_samples_category}' from Recount3. " \
297+
"It was not possible to get the metadata from Recount3. " \
277298
f"Error: {e}"
278299
logger.exception(errstr)
279300
sys.exit(errstr)
280301

281302
#-----------------------------------------------------------------#
303+
304+
# Try to merge the RNA-seq data frame and the metadata data frame.
305+
try:
306+
307+
df_final = \
308+
recount3.merge_gene_sums_and_metadata(\
309+
df_gene_sums = df_gene_sums,
310+
df_metadata = df_metadata)
311+
312+
# If something went wrong
313+
except Exception as e:
314+
315+
# Log it and exit.
316+
errstr = \
317+
"It was not possible to combine the RNA-seq data " \
318+
f"with the metadata. Error: {e}"
319+
logger.exception(errstr)
320+
sys.exit(errstr)
321+
322+
#-----------------------------------------------------------------#
282323

283324
# If the user has passed a query string or a file containing the
284325
# query string
@@ -303,20 +344,12 @@ def main():
303344

304345
# Try to add the metadata to the RNA-seq data frame.
305346
try:
306-
307-
# Merge the RNA-seq data frame and the metadata data frame.
308-
df_merged = \
309-
recount3.merge_gene_sums_and_metadata(\
310-
df_gene_sums = df_gene_sums,
311-
df_metadata = df_metadata,
312-
project_name = input_project_name)
313347

314348
# Filter the samples.
315349
df_final = \
316350
recount3.filter_by_metadata(\
317-
df = df_merged,
318-
query_string = query_string,
319-
project_name = input_project_name)
351+
df = df_final,
352+
query_string = query_string)
320353

321354
# If something went wrong
322355
except Exception as e:
@@ -328,12 +361,95 @@ def main():
328361
logger.exception(errstr)
329362
sys.exit(errstr)
330363

331-
# Otherwise
332-
else:
364+
#-----------------------------------------------------------------#
365+
366+
# If the user passed a list of metadata columns to keep in the
367+
# final data frame
368+
if metadata_to_keep is not None:
333369

334-
# The final data frame will be the one containing the gene
335-
# expression data.
336-
df_final = df_gene_sums
370+
# Get the list of metadata columns to keep.
371+
metadata_to_keep = \
372+
[m.lstrip("'").rstrip("'") for m \
373+
in metadata_to_keep.rstrip().split("|")]
374+
375+
# Get the columns to keep in the final data frame.
376+
columns_to_keep = \
377+
[col for col in df_final.columns \
378+
if col.startswith("ENSG")] + \
379+
[col for col in df_final.columns \
380+
if not col.startswith("ENSG") \
381+
and col in metadata_to_keep]
382+
383+
# Try to keep only the selected columns
384+
try:
385+
386+
df_final = df_final.loc[:,columns_to_keep]
387+
388+
# If something went wrong
389+
except Exception as e:
390+
391+
# Get a string representing the metadata columns to keep.
392+
metadata_to_keep_str = \
393+
", ".join([f"'{m}'" for m in metadata_to_keep])
394+
395+
# Log it and exit.
396+
errstr = \
397+
"It was not possible to keep only the following " \
398+
"metadata columns in the final data frame: " \
399+
f"{metadata_to_keep_str}. Error: {e}"
400+
logger.exception(errstr)
401+
sys.exit(errstr)
402+
403+
#-----------------------------------------------------------------#
404+
405+
# If the user passed a list of metadata columns to drop in the
406+
# final data frame
407+
if metadata_to_drop is not None:
408+
409+
# If the user wants to drop all metadata columns
410+
if metadata_to_drop == "_all_":
411+
412+
# Get the columns to keep in the final data frame.
413+
columns_to_keep = \
414+
[col for col in df_final.columns \
415+
if col.startswith("ENSG")]
416+
417+
# Otherwise
418+
else:
419+
420+
# Get the list of metadata columns.
421+
metadata_to_drop = \
422+
[m.lstrip("'").rstrip("'") for m \
423+
in metadata_to_drop.rstrip().split("|")]
424+
425+
# Get the columns to keep in the final data frame.
426+
columns_to_keep = \
427+
[col for col in df_final.columns \
428+
if col.startswith("ENSG")] + \
429+
[col for col in df_final.columns \
430+
if not col.startswith("ENSG") \
431+
and col not in metadata_to_drop]
432+
433+
# Try to keep only the selected columns.
434+
try:
435+
436+
df_final = df_final.loc[:, columns_to_keep]
437+
438+
# If something went wrong
439+
except Exception as e:
440+
441+
# Get a string representing the metadata columns to
442+
# drop.
443+
metadata_to_drop_str = \
444+
", ".join([f"'{m}'" for m in metadata_to_drop])
445+
446+
# Log it and exit.
447+
errstr = \
448+
"It was not possible to drop the following " \
449+
"metadata columns from the final data frame: " \
450+
f"{metadata_to_drop_str}. Error: {e}"
451+
logger.exception(errstr)
452+
sys.exit(errstr)
337453

338454
#-----------------------------------------------------------------#
339455

@@ -364,7 +480,7 @@ def main():
364480

365481
# Log it and exit.
366482
errstr = \
367-
"It was not possible to save the RNA-seq data " \
368-
f"in '{output_csv_path}'. Error: {e}"
483+
"It was not possible to save the final data frame in " \
484+
f"'{output_csv_path}'. Error: {e}"
369485
logger.exception(errstr)
370486
sys.exit(errstr)

bulkDGD/execs/dgd_get_probability_density.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,10 +198,11 @@ def main():
198198
handlers = \
199199
util.get_handlers(\
200200
log_console = log_console,
201+
log_console_level = log_level,
201202
log_file_class = log.FileHandler,
202203
log_file_options = {"filename" : log_file,
203204
"mode" : "w"},
204-
log_level = log_level)
205+
log_file_level = log_level)
205206

206207
# Set the logging configuration.
207208
log.basicConfig(level = log_level,

0 commit comments

Comments
 (0)