diff --git a/modules/nf-core/gcta/reml/environment.yml b/modules/nf-core/gcta/reml/environment.yml
new file mode 100644
index 00000000000..3e22ea7b9f2
--- /dev/null
+++ b/modules/nf-core/gcta/reml/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::gcta=1.94.1
diff --git a/modules/nf-core/gcta/reml/main.nf b/modules/nf-core/gcta/reml/main.nf
new file mode 100644
index 00000000000..b75bc23c301
--- /dev/null
+++ b/modules/nf-core/gcta/reml/main.nf
@@ -0,0 +1,54 @@
+// Runs `gcta --reml` on a dense GRM with a phenotype file (plus optional covariates) and emits the `.hsq` result.
+process GCTA_REML {
+    tag "${meta.id}_${meta2.id}"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container
+        ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/46/46b0d05f0daa47561d87d2a9cac5e51edc2c78e26f1bbab439c688386241a274/data'
+        : 'community.wave.seqera.io/library/gcta:1.94.1--9bc35dc424fcf6e9'}"
+
+    input:
+    tuple val(meta), path(grm_files)
+    tuple val(meta2), path(phenotypes_file)
+    tuple val(meta3), path(quant_covariates_file)
+    tuple val(meta4), path(cat_covariates_file)
+
+    output:
+    tuple val(meta), path("*.hsq"), emit: reml_results
+    tuple val("${task.process}"), val("gcta"), eval("gcta --version | sed -En 's/^[*] version v([0-9.]*).*/\\1/p'"), emit: versions_gcta, topic: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // Take the `--grm` basename from the staged `*.grm.id` file instead of `meta.id`,
+    // so a GRM written under a different prefix upstream is still located.
+    def grm_id_file = (grm_files instanceof List ? grm_files : [grm_files]).find { grm_file -> grm_file.name.endsWith('.grm.id') }
+    if (!grm_id_file) {
+        error("GCTA_REML: no '*.grm.id' file found among the staged GRM files")
+    }
+    def grm_prefix = grm_id_file.name.replaceFirst(/\.grm\.id$/, '')
+    // Covariate inputs are optional: an empty value (`[]`) drops the corresponding flag.
+    def qcovar_param = quant_covariates_file ? "--qcovar ${quant_covariates_file}" : ''
+    def covar_param = cat_covariates_file ? "--covar ${cat_covariates_file}" : ''
+    """
+    gcta \\
+        --reml \\
+        --grm "${grm_prefix}" \\
+        --pheno ${phenotypes_file} \\
+        ${qcovar_param} \\
+        ${covar_param} \\
+        --out "${prefix}" \\
+        --thread-num ${task.cpus} \\
+        ${args}
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch "${prefix}.hsq"
+    """
+}
diff --git a/modules/nf-core/gcta/reml/meta.yml b/modules/nf-core/gcta/reml/meta.yml
new file mode 100644
index 00000000000..9b35bdda10d
--- /dev/null
+++ b/modules/nf-core/gcta/reml/meta.yml
@@ -0,0 +1,104 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "gcta_reml"
+description: Run univariate REML heritability estimation with a dense GRM
+keywords:
+  - gcta
+  - genome-wide complex trait analysis
+  - reml
+  - restricted maximum likelihood
+  - grm
+  - genetic relationship matrix
+  - genetics
+tools:
+  - "gcta":
+      description: "Genome-wide Complex Trait Analysis (GCTA) estimates genetic relationships, variance components, and association statistics from genome-wide data."
+      homepage: "https://yanglab.westlake.edu.cn/software/gcta/"
+      documentation: "https://yanglab.westlake.edu.cn/software/gcta/static/gcta_doc_latest.pdf"
+      tool_dev_url: "https://yanglab.westlake.edu.cn/software/gcta/"
+      licence: ["GPL-3.0-only"]
+      identifier: "biotools:gcta"
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy map containing dense GRM metadata, e.g. `[ id:'plink_simulated_dense' ]`
+          The basename passed to `--grm` is derived from the staged `*.grm.id` file
+          (for example `plink_simulated_dense` for `plink_simulated_dense.grm.{id,bin,N.bin}`)
+    - grm_files:
+        type: file
+        description: Dense GRM bundle containing sample identifier, binary matrix,
+          and sample-count matrix files
+        pattern: "*.grm.*"
+        ontologies: []
+  - - meta2:
+        type: map
+        description: |
+          Groovy map containing phenotype metadata; should carry the same sample
+          identity as `meta`, for example `[ id:'plink_simulated_dense' ]`
+    - phenotypes_file:
+        type: file
+        description: Phenotype file passed to `--pheno`
+        pattern: "*.{phe,pheno,txt,tsv}"
+        ontologies:
+          - edam: "http://edamontology.org/format_3475"
+  - - meta3:
+        type: map
+        description: |
+          Groovy map containing quantitative covariate metadata; should carry the
+          same sample identity as `meta`, for example `[ id:'plink_simulated_dense' ]`
+    - quant_covariates_file:
+        type: file
+        description: Quantitative covariates file, pass `[]` when absent
+        pattern: "*.{covar,cov,txt,tsv}"
+        ontologies:
+          - edam: "http://edamontology.org/format_3475"
+  - - meta4:
+        type: map
+        description: |
+          Groovy map containing categorical covariate metadata; should carry the
+          same sample identity as `meta`, for example `[ id:'plink_simulated_dense' ]`
+    - cat_covariates_file:
+        type: file
+        description: Categorical covariates file, pass `[]` when absent
+        pattern: "*.{covar,cov,txt,tsv}"
+        ontologies:
+          - edam: "http://edamontology.org/format_3475"
+output:
+  reml_results:
+    - - meta:
+          type: map
+          description: |
+            Groovy map containing dense GRM metadata, e.g. `[ id:'plink_simulated_dense' ]`
+            The basename passed to `--grm` is derived from the staged `*.grm.id` file
+            (for example `plink_simulated_dense` for `plink_simulated_dense.grm.{id,bin,N.bin}`)
+      - "*.hsq":
+          type: file
+          description: REML result file
+          pattern: "*.hsq"
+          ontologies:
+            - edam: "http://edamontology.org/format_2330"
+  versions_gcta:
+    - - "${task.process}":
+          type: string
+          description: The process the version was collected from
+      - "gcta":
+          type: string
+          description: The tool name
+      - "gcta --version | sed -En 's/^[*] version v([0-9.]*).*/\\1/p'":
+          type: eval
+          description: The command used to retrieve the GCTA version
+topics:
+  versions:
+    - - ${task.process}:
+          type: string
+          description: The process the version was collected from
+      - gcta:
+          type: string
+          description: The tool name
+      - "gcta --version | sed -En 's/^[*] version v([0-9.]*).*/\\1/p'":
+          type: eval
+          description: The command used to retrieve the GCTA version
+authors:
+  - "@lyh970817"
+maintainers:
+  - "@lyh970817"
diff --git a/modules/nf-core/gcta/reml/tests/main.nf.test b/modules/nf-core/gcta/reml/tests/main.nf.test
new file mode 100644
index 00000000000..ccb41167d80
--- /dev/null
+++ b/modules/nf-core/gcta/reml/tests/main.nf.test
@@ -0,0 +1,272 @@
+nextflow_process {
+
+    name "Test Process GCTA_REML"
+    script "../main.nf"
+    process "GCTA_REML"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "gcta"
+    tag "gcta/reml"
+    tag "gcta/makegrm"
+    tag "gawk"
+    config "./nextflow.config"
+
+    setup {
+        run("GAWK", alias: "GAWK_QUANTITATIVE_PHENOTYPE") {
+            script "../../../gawk/main.nf"
+            process {
+                """
+                input[0] = [
+                    [ id:'QuantitativeTrait' ],
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_quantitative_phenoname.phe', checkIfExists: true)
+                ]
+                input[1] = Channel.of('{ print \$1, \$2, \$3 }').collectFile(name:'quantitative_phenotypes.awk')
+                input[2] = false
+                """
+            }
+        }
+
+        run("GAWK", alias: "GAWK_MULTI_PHENOTYPES") {
+            script "../../../gawk/main.nf"
+            process {
+                """
+                input[0] = [
+                    [ id:'QuantitativeTraitMpheno' ],
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_quantitative_phenoname.phe', checkIfExists: true)
+                ]
+                input[1] = Channel.of('{ print \$1, \$2, \$3, (\$3 * 0.5) + ((NR % 7) / 10.0) }').collectFile(name:'multi_phenotypes.awk')
+                input[2] = false
+                """
+            }
+        }
+
+        run("GAWK", alias: "GAWK_BINARY_PHENOTYPE") {
+            script "../../../gawk/main.nf"
+            process {
+                """
+                input[0] = [
+                    [ id:'BinaryTrait' ],
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_binary_phenoname.phe', checkIfExists: true)
+                ]
+                input[1] = Channel.of('FNR == 1 { next } { print \$1, \$2, \$3 + 1 }').collectFile(name:'binary_phenotypes.awk')
+                input[2] = false
+                """
+            }
+        }
+
+        run("GAWK", alias: "GAWK_QUANTITATIVE_COVARIATES") {
+            script "../../../gawk/main.nf"
+            process {
+                """
+                input[0] = [
+                    [ id:'covariates_quant' ],
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_covariates.txt', checkIfExists: true)
+                ]
+                input[1] = Channel.of('FNR == 1 { next } { print \$1, \$2, \$4, \$5 }').collectFile(name:'quantitative_covariates.awk')
+                input[2] = false
+                """
+            }
+        }
+
+        run("GAWK", alias: "GAWK_CATEGORICAL_COVARIATES") {
+            script "../../../gawk/main.nf"
+            process {
+                """
+                input[0] = [
+                    [ id:'covariates_cat' ],
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_covariates.txt', checkIfExists: true)
+                ]
+                input[1] = Channel.of('FNR == 1 { next } { print \$1, \$2, \$3 }').collectFile(name:'categorical_covariates.awk')
+                input[2] = false
+                """
+            }
+        }
+
+        run("GCTA_MAKEGRM", alias: "GCTA_MAKEGRM_DENSE") {
+            script "../../makegrm/main.nf"
+            process {
+                """
+                file('plink_simulated.mbfile').text = 'plink_simulated\\n'
+
+                input[0] = [
+                    [ id:'plink_simulated_dense' ],
+                    file('plink_simulated.mbfile'),
+                    [
+                        file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bed', checkIfExists: true)
+                    ],
+                    [
+                        file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true)
+                    ],
+                    [
+                        file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.fam', checkIfExists: true)
+                    ]
+                ]
+                """
+            }
+        }
+    }
+
+    test("homo_sapiens popgen - dense GRM with mpheno selection") {
+        when {
+            params {
+                module_args = '--mpheno 2'
+            }
+            process {
+                """
+                input[0] = GCTA_MAKEGRM_DENSE.out.grm_files
+                input[1] = GAWK_MULTI_PHENOTYPES.out.output.map { meta, pheno -> [[ id:'plink_simulated_dense' ], pheno] }
+                input[2] = GAWK_QUANTITATIVE_COVARIATES.out.output.map { meta, covar -> [[ id:'plink_simulated_dense' ], covar] }
+                input[3] = GAWK_CATEGORICAL_COVARIATES.out.output.map { meta, covar -> [[ id:'plink_simulated_dense' ], covar] }
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert process.out.reml_results.size() == 1 },
+                { assert process.out.reml_results.get(0).get(0).id == "plink_simulated_dense" },
+                { assert snapshot(sanitizeOutput(process.out)).match() }
+            )
+        }
+    }
+
+    test("homo_sapiens popgen - quantitative phenotype with dense GRM and covariates") {
+        when {
+            process {
+                """
+                input[0] = GCTA_MAKEGRM_DENSE.out.grm_files
+                input[1] = GAWK_QUANTITATIVE_PHENOTYPE.out.output.map { meta, pheno -> [[ id:'plink_simulated_dense' ], pheno] }
+                input[2] = GAWK_QUANTITATIVE_COVARIATES.out.output.map { meta, covar -> [[ id:'plink_simulated_dense' ], covar] }
+                input[3] = GAWK_CATEGORICAL_COVARIATES.out.output.map { meta, covar -> [[ id:'plink_simulated_dense' ], covar] }
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert process.out.reml_results.size() == 1 },
+                { assert process.out.reml_results.get(0).get(0).id == "plink_simulated_dense" },
+                { assert snapshot(sanitizeOutput(process.out)).match() }
+            )
+        }
+    }
+
+    test("homo_sapiens popgen - quantitative phenotype with dense GRM and no covariates") {
+        when {
+            process {
+                """
+                input[0] = GCTA_MAKEGRM_DENSE.out.grm_files
+                input[1] = GAWK_QUANTITATIVE_PHENOTYPE.out.output.map { meta, pheno -> [[ id:'plink_simulated_dense' ], pheno] }
+                input[2] = [[ id:'plink_simulated_dense' ], []]
+                input[3] = [[ id:'plink_simulated_dense' ], []]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert process.out.reml_results.size() == 1 },
+                { assert process.out.reml_results.get(0).get(0).id == "plink_simulated_dense" },
+                { assert snapshot(sanitizeOutput(process.out)).match() }
+            )
+        }
+    }
+
+    test("homo_sapiens popgen - binary phenotype with dense GRM and prevalence") {
+        when {
+            params {
+                module_args = '--prevalence 0.1'
+            }
+            process {
+                """
+                input[0] = GCTA_MAKEGRM_DENSE.out.grm_files
+                input[1] = GAWK_BINARY_PHENOTYPE.out.output.map { meta, pheno -> [[ id:'plink_simulated_dense' ], pheno] }
+                input[2] = [[ id:'plink_simulated_dense' ], []]
+                input[3] = [[ id:'plink_simulated_dense' ], []]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert process.out.reml_results.size() == 1 },
+                { assert process.out.reml_results.get(0).get(0).id == "plink_simulated_dense" },
+                { assert snapshot(sanitizeOutput(process.out)).match() }
+            )
+        }
+    }
+
+    test("homo_sapiens popgen - dense GRM defaults to first phenotype") {
+        when {
+            process {
+                """
+                input[0] = GCTA_MAKEGRM_DENSE.out.grm_files
+                input[1] = GAWK_MULTI_PHENOTYPES.out.output.map { meta, pheno -> [[ id:'plink_simulated_dense' ], pheno] }
+                input[2] = GAWK_QUANTITATIVE_COVARIATES.out.output.map { meta, covar -> [[ id:'plink_simulated_dense' ], covar] }
+                input[3] = GAWK_CATEGORICAL_COVARIATES.out.output.map { meta, covar -> [[ id:'plink_simulated_dense' ], covar] }
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert process.out.reml_results.size() == 1 },
+                { assert process.out.reml_results.get(0).get(0).id == "plink_simulated_dense" },
+                { assert snapshot(sanitizeOutput(process.out)).match() }
+            )
+        }
+    }
+
+    test("homo_sapiens popgen - fails when mpheno index is out of range") {
+        when {
+            params {
+                module_args = '--mpheno 3'
+            }
+            process {
+                """
+                input[0] = GCTA_MAKEGRM_DENSE.out.grm_files
+                input[1] = GAWK_MULTI_PHENOTYPES.out.output.map { meta, pheno -> [[ id:'plink_simulated_dense' ], pheno] }
+                input[2] = GAWK_QUANTITATIVE_COVARIATES.out.output.map { meta, covar -> [[ id:'plink_simulated_dense' ], covar] }
+                input[3] = GAWK_CATEGORICAL_COVARIATES.out.output.map { meta, covar -> [[ id:'plink_simulated_dense' ], covar] }
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert !process.success },
+                { assert process.exitStatus != 0 }
+            )
+        }
+    }
+
+    test("homo_sapiens popgen - quantitative phenotype with dense GRM - stub") {
+        options "-stub"
+
+        when {
+            process {
+                """
+                input[0] = GCTA_MAKEGRM_DENSE.out.grm_files
+                input[1] = GAWK_QUANTITATIVE_PHENOTYPE.out.output.map { meta, pheno -> [[ id:'plink_simulated_dense' ], pheno] }
+                input[2] = [[ id:'plink_simulated_dense' ], []]
+                input[3] = [[ id:'plink_simulated_dense' ], []]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert process.out.reml_results.size() == 1 },
+                { assert process.out.reml_results.get(0).get(0).id == "plink_simulated_dense" },
+                { assert snapshot(sanitizeOutput(process.out)).match() }
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/gcta/reml/tests/main.nf.test.snap b/modules/nf-core/gcta/reml/tests/main.nf.test.snap
new file mode 100644
index 00000000000..2f546b1a224
--- /dev/null
+++ b/modules/nf-core/gcta/reml/tests/main.nf.test.snap
@@ -0,0 +1,158 @@
+{
+    "homo_sapiens popgen - quantitative phenotype with dense GRM - stub": {
+        "content": [
+            {
+                "reml_results": [
+                    [
+                        {
+                            "id": "plink_simulated_dense"
+                        },
+                        "plink_simulated_dense.hsq:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "versions_gcta": [
+                    [
+                        "GCTA_REML",
+                        "gcta",
+                        "1.94.1"
+                    ]
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.4"
+        },
+        "timestamp": "2026-05-26T21:47:06.863951978"
+    },
+    "homo_sapiens popgen - dense GRM defaults to first phenotype": {
+        "content": [
+            {
+                "reml_results": [
+                    [
+                        {
+                            "id": "plink_simulated_dense"
+                        },
+                        "plink_simulated_dense.hsq:md5,17d5e79e461b582b0aaba2a40666c8f7"
+                    ]
+                ],
+                "versions_gcta": [
+                    [
+                        "GCTA_REML",
+                        "gcta",
+                        "1.94.1"
+                    ]
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.4"
+        },
+        "timestamp": "2026-05-26T21:46:19.771868442"
+    },
+    "homo_sapiens popgen - quantitative phenotype with dense GRM and no covariates": {
+        "content": [
+            {
+                "reml_results": [
+                    [
+                        {
+                            "id": "plink_simulated_dense"
+                        },
+                        "plink_simulated_dense.hsq:md5,51c1328c8feb6d53f3984cd58324fed7"
+                    ]
+                ],
+                "versions_gcta": [
+                    [
+                        "GCTA_REML",
+                        "gcta",
+                        "1.94.1"
+                    ]
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.4"
+        },
+        "timestamp": "2026-05-26T21:45:27.688042039"
+    },
+    "homo_sapiens popgen - quantitative phenotype with dense GRM and covariates": {
+        "content": [
+            {
+                "reml_results": [
+                    [
+                        {
+                            "id": "plink_simulated_dense"
+                        },
+                        "plink_simulated_dense.hsq:md5,17d5e79e461b582b0aaba2a40666c8f7"
+                    ]
+                ],
+                "versions_gcta": [
+                    [
+                        "GCTA_REML",
+                        "gcta",
+                        "1.94.1"
+                    ]
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.4"
+        },
+        "timestamp": "2026-05-26T21:45:01.211801644"
+    },
+    "homo_sapiens popgen - dense GRM with mpheno selection": {
+        "content": [
+            {
+                "reml_results": [
+                    [
+                        {
+                            "id": "plink_simulated_dense"
+                        },
+                        "plink_simulated_dense.hsq:md5,0a29048e72305f462889481b2dfb94db"
+                    ]
+                ],
+                "versions_gcta": [
+                    [
+                        "GCTA_REML",
+                        "gcta",
+                        "1.94.1"
+                    ]
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.4"
+        },
+        "timestamp": "2026-05-26T21:44:38.283352845"
+    },
+    "homo_sapiens popgen - binary phenotype with dense GRM and prevalence": {
+        "content": [
+            {
+                "reml_results": [
+                    [
+                        {
+                            "id": "plink_simulated_dense"
+                        },
+                        "plink_simulated_dense.hsq:md5,86672f6a0b3c49b2347d402cb03b8606"
+                    ]
+                ],
+                "versions_gcta": [
+                    [
+                        "GCTA_REML",
+                        "gcta",
+                        "1.94.1"
+                    ]
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.4"
+        },
+        "timestamp": "2026-05-26T21:45:51.759861217"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/gcta/reml/tests/nextflow.config b/modules/nf-core/gcta/reml/tests/nextflow.config
new file mode 100644
index 00000000000..b77f0e7d528
--- /dev/null
+++ b/modules/nf-core/gcta/reml/tests/nextflow.config
@@ -0,0 +1,9 @@
+params {
+    module_args = ""
+}
+
+process {
+    withName: "GCTA_REML" {
+        ext.args = { params.module_args ?: "" }
+    }
+}