From 3192bd01bf5784c7b487135ffb4c30eeeea9a7be Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Tue, 22 Jun 2021 09:34:22 +0200 Subject: [PATCH 1/2] Replace genomes.config with igenomes.config. Remove unnecessary parameters. Boolean flag params --read_transcriptome_fasta_host_from_file and --read_transcriptome_fasta_pathogen_from_file were not needed, as the --transcriptome_host and --transcriptome_pathogen parameters already act as boolean flags if not specified by the user. Replaced genomes.config file with the standard nf-core igenomes.config and updated the documentation. --- conf/genomes.config | 30 -------------- docs/usage.md | 94 ++++++++++++++++++++++++-------------------- main.nf | 32 +++++++-------- nextflow.config | 18 +++------ nextflow_schema.json | 25 +++--------- 5 files changed, 76 insertions(+), 123 deletions(-) delete mode 100644 conf/genomes.config diff --git a/conf/genomes.config b/conf/genomes.config deleted file mode 100644 index 2c029714..00000000 --- a/conf/genomes.config +++ /dev/null @@ -1,30 +0,0 @@ -/* - * ------------------------------------------------- - * Nextflow config file for host/pathogen reference paths - * ------------------------------------------------- - * This file allows you to define host and pathogen references and create a permanent link to the files. - * Below, we show the general style that should be used. This file can be populated with a large number of different hosts and pathogens. 
- * For an example usage: to use the human and salmonella references here, add --genome_host GRCh38 --genome_pathogen SL1344 to the command line - * Annotation files are required to be in the GFF3 format - not GTF - */ - -params { - genomes { - 'GRCh38' { - fasta_host = "path_to_references/human/GRCh38.genome.fa" - gff_host = "path_to_references/human/GRCh38.annotation.gff3" - gff_host_tRNA = "path_to_references/human/GRCh38.gencode.tRNAs.gff3" - transcriptome_host = "path_to_references/human/GRCh38.gencode.transcripts.fa" - } - - 'SL1344' { - fasta_pathogen = "path_to_references/Salmonella/SL1344.fasta" - gff_pathogen = "path_to_references/Salmonella/SL1344.gff" - } - - 'My_Bacteria' { - fasta_pathogen = "path_to_references/My_Bacteria/My_Bacteria.fasta" - gff_pathogen = "path_to_references/My_Bacteria/My_Bacteria.gff" - } -} -} diff --git a/docs/usage.md b/docs/usage.md index 9bd6da9f..5ad4a7f3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -240,15 +240,26 @@ The main goal of Dual RNA-seq is simultaneous profiling of host and pathogen gen These parameters can be used in two ways: -#### A) Using a configuration file +#### A) Using configuration files -You can create your own configuration file with sets of reference files and save it to this file which is read each time the pipeline is run: `...conf/genomes.config` - -If using a custom genome configuration file, you will need to enable genomes_ignore by passing `genomes_ignore = true` +Most nf-core pipelines, including this one, come with a pre-built set of reference genome indices that work out of the box. +They are hosted in the cloud (see <https://ewels.github.io/AWS-iGenomes/>) and will be downloaded on demand. +For this pipeline however, it is very likely that you will need to specify additional custom genomes to work with. > See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) and [Reference genomes](https://nf-co.re/usage/reference_genomes) for instructions. 
-The syntax for this reference file follows this syntax: +In short, the best way to do this is to extend the genomes config that comes with the pipeline, using the same configuration structure. +You can then save this file in a number of different locations, according to your preference. +See <https://www.nextflow.io/docs/latest/config.html> for details on this, but in short the common places are: + +* In the Nextflow home directory as a file called `config` (no file extensions). Typically this is in your user's home directory: `~/.nextflow/config` +* As a file called `nextflow.config` in the current working directory when you run the pipeline. +* Any custom path, which you will then need to specify on the command line with `-c`. eg: `-c path/to/config` (multiple files can be given like this) + +Depending on which location you use, you may want to disable the bundled igenomes to avoid conflicts. +You can do this by setting `igenomes_ignore = true` in the config, or `--igenomes_ignore` on the command line. + +A custom reference file uses the following syntax: ```nextflow params { @@ -258,69 +269,66 @@ params { gff_host = '' gff_host_tRNA = '' // Optional transcriptome_host = '' // Optional - } + } 'SL1344' { fasta_pathogen = '' gff_pathogen = '' transcriptome_pathogen = '' // Optional - } - } - // Default genomes (optional). Ignored if --genome_host 'OTHER-GENOME' and --genome_pathogen 'OTHER-GENOME' specified on command line - genome_host = 'GRCh38' - genome_pathogen = 'SL1344' - } + } + } + // Default genomes (optional). + // Ignored if --genome_host 'OTHER-GENOME' and --genome_pathogen 'OTHER-GENOME' specified on command line + genome_host = 'GRCh38' + genome_pathogen = 'SL1344' +} ``` Defining default genomes in your configuration file is optional. 
You can specify the references using the following flags on command line: -`--genome_pathogen SL1344` +* `--genome_pathogen SL1344` +* `--genome_host GRCh38` -`--genome_host GRCh38` - -> Any number of additional genomes can be added to this file and specified through either `--genome_host` or `--genome_pathogen`. - -If using your own custom genome file, you will also need to include either the following line, or something similar in your `nextflow.config` file to make sure the information is being read when the pipeline runs. - -```bash -includeConfig 'conf/custom_genomes.config' -``` +Any number of additional genome references can be added to this file and specified using the given genome key, through either `--genome_host` or `--genome_pathogen`. -Note: - -* The transcriptome fasta file is created by default in the pipeline using the provided genome and annotation files. If you already have one, you can specify it here as shown above, and through the parameter ```--read_transcriptome_fasta_host_from_file``` or -```--read_transcriptome_fasta_pathogen_from_file``` +Please note that: +* If `--transcriptome_host` or `--transcriptome_pathogen` is not given, the transcriptome fasta will be created by the pipeline using the provided genome and annotation files. * If `gff_host_tRNA` file is provided, the pipeline combines the files from `gff_host` and `gff_host_tRNA` to create a single host gff file. -* You don't have to specify the path to the host and pathogen transcriptomes in your conf/genomes.config file, as all transcriptome-based files are created automatically if needed. - #### B) Using pipeline-specific parameters -If preferred, you can specify each parameter manually and link to appropriate files. Reference and annotation files (`fasta` and `GFF3`) can be compressed (`.gz` or `.zip`) or uncompressed. +If you prefer, you can specify each parameter on the command line when you run the pipeline. 
+Reference and annotation files (`fasta` and `GFF3`) can be compressed (`.gz` or `.zip`) or uncompressed. Host: -`--fasta_host` "path to file" - -`--gff_host` "path to file" - -`--gff_host_tRNA` "path to file" - -`--read_transcriptome_fasta_host_from_file` - -`--transcriptome_host` "path to file" +* `--fasta_host /path/to/file` +* `--gff_host /path/to/file` +* `--gff_host_tRNA /path/to/file` +* `--transcriptome_host /path/to/file` Pathogen: -`--fasta_pathogen` "path to file" +* `--fasta_pathogen /path/to/file` +* `--gff_pathogen /path/to/file` +* `--transcriptome_pathogen /path/to/file` + +These parameters can also be set in a nextflow config, or supplied in a JSON / YAML file with `-params-file`. -`--gff_pathogen` "path to file" +For example, with a file `my-config.yml`: -`--read_transcriptome_fasta_pathogen_from_file` +```yaml +fasta_host: /path/to/file +gff_host: /path/to/file +gff_host_tRNA: /path/to/file +transcriptome_host: /path/to/file +``` -`--transcriptome_pathogen` "path to file" +You can run the pipeline with: -> Note: Since many dual RNA-seq experiments are likely to use pathogen-based references that have to be manually downloaded. We recommend adding a new entry to the `genomes.conf` file as depicted [above](#4-reference-genomes-and-annotation), or through specific parameters of `--fasta_pathogen` and `--gff_pathogen`. +```bash +nextflow run nf-core/dualrnaseq -params-file my-config.yml +``` ##### Host tRNA diff --git a/main.nf b/main.nf index 9109303b..13b2428b 100644 --- a/main.nf +++ b/main.nf @@ -78,15 +78,11 @@ if (params.gff_host_genome) { ch_gff_host_genome = file(params.gff_host_genome, params.gff_pathogen = params.genome_pathogen ? params.genomes[ params.genome_pathogen ].gff_pathogen ?: false : false if (params.gff_pathogen) { ch_gff_pathogen = file(params.gff_pathogen, checkIfExists: true) } -if(params.read_transcriptome_fasta_host_from_file){ - params.transcriptome_host = params.genome_host ? 
params.genomes[ params.genome_host ].transcriptome_host ?: false : false - if (params.transcriptome_host) { ch_transcriptome_host = file(params.transcriptome_host, checkIfExists: true) } -} +params.transcriptome_host = params.genome_host ? params.genomes[ params.genome_host ].transcriptome_host ?: false : false +if (params.transcriptome_host) { ch_transcriptome_host = file(params.transcriptome_host, checkIfExists: true) } -if(params.read_transcriptome_fasta_pathogen_from_file){ - params.transcriptome_pathogen = params.genome_pathogen ? params.genomes[ params.genome_pathogen ].transcriptome_pathogen ?: false : false - if (params.transcriptome_pathogen) { ch_transcriptome_pathogen = file(params.transcriptome_pathogen, checkIfExists: true) } -} +params.transcriptome_pathogen = params.genome_pathogen ? params.genomes[ params.genome_pathogen ].transcriptome_pathogen ?: false : false +if (params.transcriptome_pathogen) { ch_transcriptome_pathogen = file(params.transcriptome_pathogen, checkIfExists: true) } //---------- @@ -232,16 +228,16 @@ Channel //---------- // Channel for host and pathogen transcriptomes //---------- -if(params.read_transcriptome_fasta_host_from_file){ -Channel - .value(ch_transcriptome_host) - .into {host_transcriptome_to_combine; transcriptome_host_to_split_q_table_salmon; transcriptome_host_to_split_table_salmon; transcriptome_host_to_split_q_table_salmon_alignment_based; transcriptome_host_to_split_table_salmon_alignment; transcriptome_fasta_host_ref_names} +if(params.transcriptome_host){ + Channel + .value(ch_transcriptome_host) + .into {host_transcriptome_to_combine; transcriptome_host_to_split_q_table_salmon; transcriptome_host_to_split_table_salmon; transcriptome_host_to_split_q_table_salmon_alignment_based; transcriptome_host_to_split_table_salmon_alignment; transcriptome_fasta_host_ref_names} } -if(params.read_transcriptome_fasta_pathogen_from_file){ -Channel - .value(ch_transcriptome_pathogen) - .into {pathogen_transcriptome_to_combine; 
transcriptome_pathogen_to_split_table_salmon; transcriptome_pathogen_to_split_table_salmon_alignment; transcriptome_pathogen_to_split_q_table_salmon; transcriptome_pathogen_to_split_q_table_salmon_alignment_based;transcriptome_fasta_pathogen_ref_names} +if(params.transcriptome_pathogen){ + Channel + .value(ch_transcriptome_pathogen) + .into {pathogen_transcriptome_to_combine; transcriptome_pathogen_to_split_table_salmon; transcriptome_pathogen_to_split_table_salmon_alignment; transcriptome_pathogen_to_split_q_table_salmon; transcriptome_pathogen_to_split_q_table_salmon_alignment_based;transcriptome_fasta_pathogen_ref_names} } @@ -1301,7 +1297,7 @@ if(params.run_salmon_selective_alignment | params.run_salmon_alignment_based_mod - if(!params.read_transcriptome_fasta_host_from_file){ + if(!params.transcriptome_host){ /* * create host transcriptome fasta file @@ -1404,7 +1400,7 @@ if(params.run_salmon_selective_alignment | params.run_salmon_alignment_based_mod - if(!params.read_transcriptome_fasta_pathogen_from_file){ + if(!params.transcriptome_pathogen){ /* * create pathogen transcriptome fasta file diff --git a/nextflow.config b/nextflow.config index 07ee51ab..312b837c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -59,8 +59,6 @@ params { gene_feature_gff_to_create_transcriptome_host = "exon,tRNA" gene_attribute_gff_to_create_transcriptome_pathogen = "locus_tag" gene_feature_gff_to_create_transcriptome_pathogen = "gene,sRNA,tRNA,rRNA" - read_transcriptome_fasta_host_from_file = false - read_transcriptome_fasta_pathogen_from_file = false //-------- // Salmon Selective Alignment @@ -179,17 +177,6 @@ process.container = 'nfcore/dualrnaseq:dev' // Load base.config by default for all pipelines includeConfig 'conf/base.config' - -// Option to use a custom configuration file (which is included in conf/genomes.conf) -// false is default and thus the config file will be used. 
-// To not use, you can simply pass --genomes_ignore on the command line -genomes_ignore = false - -// Load genomes.config if required -if (!params.genomes_ignore) { - includeConfig 'conf/genomes.config' -} - // Load nf-core custom profiles from different Institutions try { includeConfig "${params.custom_config_base}/nfcore_custom.config" @@ -257,6 +244,11 @@ profiles { test_full { includeConfig 'conf/test_full.config' } } +// Load igenomes.config if required +if (!params.igenomes_ignore) { + includeConfig 'conf/igenomes.config' +} + // Export these variables to prevent local Python/R libraries from conflicting with those in the container env { diff --git a/nextflow_schema.json b/nextflow_schema.json index 843191e4..545a17dd 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -109,30 +109,17 @@ "description": "Pathogen transcriptome file", "fa_icon": "fas fa-file-alt" }, - "read_transcriptome_fasta_host_from_file": { - "type": "boolean", - "description": "If supplying custom transcriptome files", - "fa_icon": "far fa-file-alt" - }, - "read_transcriptome_fasta_pathogen_from_file": { - "type": "boolean", - "description": "If supplying custom transcriptome files", - "fa_icon": "fas fa-file-alt" - }, "genome_host": { "type": "string", - "default": "GRCh38", - "description": "Name of host genome in the genomes.conf file", - "fa_icon": "far fa-file" + "description": "Key for the host genome in iGenomes / your custom genomes config file", + "fa_icon": "far fa-file", + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required host reference genome files e.g. `--genome_host GRCh38`.\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." 
}, "genome_pathogen": { "type": "string", - "default": "SL1344", - "description": "Name of host genome in the genomes.conf file", - "fa_icon": "fas fa-file" - }, - "genomes_ignore": { - "type": "boolean" + "description": "Key for the pathogen genome in iGenomes / your custom genomes config file", + "fa_icon": "fas fa-file", + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required pathogen reference genome files e.g. `--genome_pathogen SL1344`.\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." }, "igenomes_base": { "type": "string", From ac024c69ce4a78f76510e263ae9e39b36f4bc574 Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Tue, 22 Jun 2021 09:38:32 +0200 Subject: [PATCH 2/2] Changelog --- CHANGELOG.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b4b5b74..c50d0203 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,12 @@ # nf-core/dualrnaseq: Changelog -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 1.1.0dev + +### Pipeline Updates + +* Update to the nf-core/tools template version 1.14 +* Removed the custom `genomes.config` in favour of the default `igenomes.config` with available pre-built reference genomes +* Removed the `--read_transcriptome_fasta_host_from_file` and `--read_transcriptome_fasta_pathogen_from_file` parameters, which were not needed ## 1.0.0 - Tarnica