Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions branchwater-web_MAGtest/multispecies_notes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
*Workflow for testing species in branchwater-web. Summary outputs are tracked in multispecies_tracking.xlsx.*

# branchwater-web

- sequentially uploaded refseq genomes and downloaded default output

- Opted for a 0.4 containment (~0.97 cANI) cutoff for Aca, Asp, Can, Crys

- Folsomia candida returned few matches and has a large genome, so a 0.34 containment (0.95 cANI) threshold was chosen to capture at least 3 SRA accessions

- basic output/metadata pasted into multispecies_tracking.xlsx

## nf-core/fetchngs (12/11/23)

1. WD: /project/90daydata/gbru_fy23_branchwater/multispecies/fetch.

2. `nano multispeciessra.csv` - pasted SRA accession list from multispecies_tracking.xlsx

3. ``nextflow run nf-core/fetchngs -profile cluster --cpu_max --input multispeciessra.csv --outdir multi_fastq --force_sratools_download -bg --email suzanne.fleishman@usda.gov --nf_core_pipeline rnaseq -resume``

## nf-core/mag (12/11/23)

- Removed the sample that caused an unresolvable error in the preliminary test: ERX4889432_ERR5083269.fastq.gz

- Downloaded 'samplesheet.csv'

- Edited to SE and PE

- uploaded new SE and PE csv lists

## -- SE

`nextflow run nf-core/mag --input /project/90daydata/gbru_fy23_branchwater/multispecies/SE/paths_SE.csv --outdir SE_out_2 --email suzannemfleishman@gmail.com --binning_map_mode own --single_end -bg`

## -- PE

`nextflow run nf-core/mag --input /project/90daydata/gbru_fy23_branchwater/multispecies/PE/paths_PE.csv --outdir PE_out --email suzannemfleishman@gmail.com --binning_map_mode own -bg`



### mag output

    1258 bins total from all accessions

## Taxonomic Match with bbmap/sendsketch (12/13/23)

1. copied SE and PE bins to local: ./sendsketch/multi/bins

2. created mapping excel sheet from bin names, columns for accession, genus, and species

3. bash script:

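   1. Run sendsketch on a bin

   2. If the report matches a mapping row (accession and genus), save the report to `multi/<column 2 value>.txt` and record `<bin>,yes` in `multi/sort.txt`

   3. If it does not, save the report to `multi/other.txt` and record `<bin>,no` in `multi/sort.txt`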

4. saved output

5. added match level to multispecies_tracking.xlsx

WD:./sendsketch

multi_ss.sh

```bash
#!/bin/bash
# multi_ss.sh - taxonomic check of MAG bins with BBMap sendsketch.
#
# For every bin matched by $FILES, query the RefSeq sketch server and compare
# the report against the rows of the mapping CSV. Only columns 1 and 2 of each
# row are used (presumably accession and genus - confirm against the sheet).
# The first row whose two values both occur in the report counts as a match:
#   match    -> report appended to multi/<column 2>.txt, "<bin>,yes" to multi/sort.txt
#   no match -> report appended to multi/other.txt,      "<bin>,no"  to multi/sort.txt
#
# NOTE(review): all paths are relative to the launch directory. The notes give
# the working directory as ./sendsketch while FILES/MAP begin with
# ./sendsketch/ - confirm which directory the script is meant to run from.

FILES=./sendsketch/multi/bins/*
MAP=./sendsketch/multi/multi_mapping.csv

# Without the mapping file every bin would silently be labelled "no".
if [[ ! -r "$MAP" ]]; then
    echo "multi_ss.sh: mapping file not readable: $MAP" >&2
    exit 1
fi
mkdir -p multi

# $FILES is intentionally unquoted so the glob expands here.
for f in $FILES; do
    # An unmatched glob stays as the literal pattern; skip anything that is not a file.
    [[ -f "$f" ]] || continue

    if ! output=$(bbmap/sendsketch.sh in="$f" refseq); then
        echo "multi_ss.sh: sendsketch failed for $f; its classification may be unreliable" >&2
    fi
    match_found=0

    # Stop at the first matching row. `|| [[ -n "$line" ]]` keeps a final row
    # that has no trailing newline.
    while [[ $match_found -eq 0 ]] && { read -r line || [[ -n "$line" ]]; }; do
        line=${line%$'\r'}                      # tolerate CRLF rows from a spreadsheet export
        IFS=',' read -r col1 col2 _ <<< "$line"

        # An empty field would make grep match every report (false "yes").
        [[ -n "$col1" && -n "$col2" ]] || continue

        output_file="multi/$col2.txt"
        touch "$output_file"                    # file exists for every row visited, matched or not

        # -F: mapping values are literal text, not regular expressions.
        if grep -qF -- "$col1" <<< "$output" && grep -qF -- "$col2" <<< "$output"; then
            printf '%s\n' "$output" >> "$output_file"
            printf '%s\n' "$f,yes" >> multi/sort.txt
            match_found=1
        fi
    done < "$MAP"

    if [[ $match_found -eq 0 ]]; then
        printf '%s\n' "$output" >> multi/other.txt
        printf '%s\n' "$f,no" >> multi/sort.txt
    fi
done
```

`nohup bash multi_ss.sh > output.log 2>&1 &`
Binary file not shown.
102 changes: 102 additions & 0 deletions branchwater-web_MAGtest/nfcore/MAG/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// Executor settings: jobs are submitted through SLURM, with the queue size
// set to 100 and submissions limited to one per second.
executor {
name = 'slurm'
queueSize = 100
submitRateLimit = '1/sec'
}

// Defaults for every process: charge jobs to the project account, use lenient
// task caching, ignore task failures, and choose the queue per task from its
// memory request (more than 384 GB goes to "bigmem", everything else to "atlas").
process {
    clusterOptions = '--account=gbru_fy23_branchwater'
    cache          = 'lenient'
    errorStrategy  = 'ignore'
    queue          = { task.memory > 384.GB ? "bigmem" : "atlas" }
}


// Pipeline parameters for the nf-core/mag run: the skip_* flags switch off the
// named steps, and the max_* values are presumably the resource ceilings the
// pipeline caps requests at (confirm against the nf-core/mag parameter docs).
params {
skip_prokka = true
// Prodigal is left enabled; uncomment to skip it.
//skip_prodigal = true
skip_maxbin2 = true
skip_concoct = true
skip_spades = true
skip_spadeshybrid = true
max_memory = 1536.GB
max_cpus = 48
max_time = 14.d
}

// Run processes in Singularity containers, passing the proxy variables into
// the container environment and caching images in a shared directory.
// NOTE(review): cacheDir starts with /90daydata/ while the run notes use
// /project/90daydata/ - confirm both resolve to the same location.
singularity {
enabled = true
envWhitelist = "http_proxy, https_proxy"
autoMounts = true
cacheDir='/90daydata/gbru_fy23_branchwater/Oct10_pe/singularity_cache_dir'
}

// Per-process resource overrides, selected by process name.
// NOTE(review): cpus/memory/time are written as quoted strings here
// ('24', '192.GB', '14d') while the params scope uses unquoted literals
// (1536.GB, 14.d) - confirm the strings are parsed as intended.
// NOTE(review): the global process scope in this file already sets
// errorStrategy = 'ignore', so the per-process settings below repeat it and
// the commented-out ones do not re-enable failures.
process {
withName: ARIA2_UNTAR{
memory = '192.GB'
cpus = '24'
}

withName: BOWTIE2_PHIX_REMOVAL_ALIGN{
memory = '512.GB'
time = '14d'
cpus = '16'
errorStrategy = 'ignore'
}

withName: MEGAHIT{
memory = '512.GB'
time = '14d'

cpus = '16'
// errorStrategy = 'ignore'
}

withName: QUAST{
memory = '64.GB'
cpus = '8'
time = '14d'
// errorStrategy = 'ignore'
}
withName: BOWTIE2_ASSEMBLY_ALIGN{
cpus = '8'
memory = '64.GB'
time = '14d'
// errorStrategy = 'ignore'
}
withName: BOWTIE2_ASSEMBLY_BUILD{
memory = '64.GB'
cpus = '8'
time = '14d'
errorStrategy = 'ignore'
}
withName: PRODIGAL{
memory = '64.GB'
cpus = '8'
time = '14d'
errorStrategy = 'ignore'
}
withName: METABAT2_METABAT2{
memory = '64.GB'
cpus = '8'
time = '14d'
// errorStrategy = 'ignore'
}
withName: BUSCO{
memory = '64.GB'
cpus = '8'
time = '14d'
}
// NOTE(review): params sets skip_prokka = true, so this selector presumably
// matches no running process - confirm whether it is still needed.
withName: PROKKA{
memory = '64.GB'
cpus = '8'
time = '14d'
errorStrategy = 'ignore'
}
}
35 changes: 35 additions & 0 deletions branchwater-web_MAGtest/nfcore/MAG/paths_PE.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
sample,group,short_reads_1,short_reads_2,long_reads
ERR2185279,ERR2185279,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/ERX2241398_ERR2185279_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/ERX2241398_ERR2185279_2.fastq.gz,
ERR2819892,ERR2819892,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/ERX2826699_ERR2819892_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/ERX2826699_ERR2819892_2.fastq.gz,
ERR3333611,ERR3333611,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/ERX3358326_ERR3333611_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/ERX3358326_ERR3333611_2.fastq.gz,
SRR14842381,SRR14842381,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX11165740_SRR14842381_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX11165740_SRR14842381_2.fastq.gz,
SRR2243572,SRR2243572,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX1184284_SRR2243572_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX1184284_SRR2243572_2.fastq.gz,
SRR16797981,SRR16797981,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX12996950_SRR16797981_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX12996950_SRR16797981_2.fastq.gz,
SRR17231405,SRR17231405,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX13410501_SRR17231405_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX13410501_SRR17231405_2.fastq.gz,
SRR17238487,SRR17238487,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX13417576_SRR17238487_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX13417576_SRR17238487_2.fastq.gz,
SRR18691161,SRR18691161,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX14792121_SRR18691161_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX14792121_SRR18691161_2.fastq.gz,
SRR18691155,SRR18691155,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX14792127_SRR18691155_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX14792127_SRR18691155_2.fastq.gz,
SRR18691152,SRR18691152,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX14792130_SRR18691152_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX14792130_SRR18691152_2.fastq.gz,
SRR18691151,SRR18691151,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX14792131_SRR18691151_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX14792131_SRR18691151_2.fastq.gz,
SRR18691112,SRR18691112,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX14792170_SRR18691112_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX14792170_SRR18691112_2.fastq.gz,
SRR20950657,SRR20950657,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX16969232_SRR20950657_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX16969232_SRR20950657_2.fastq.gz,
SRR20950787,SRR20950787,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX16969463_SRR20950787_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX16969463_SRR20950787_2.fastq.gz,
SRR3458562,SRR3458562,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX1734330_SRR3458562_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX1734330_SRR3458562_2.fastq.gz,
SRR3458563,SRR3458563,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX1734331_SRR3458563_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX1734331_SRR3458563_2.fastq.gz,
SRR22535178,SRR22535178,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX18499221_SRR22535178_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX18499221_SRR22535178_2.fastq.gz,
SRR5190219,SRR5190219,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX2506039_SRR5190219_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX2506039_SRR5190219_2.fastq.gz,
SRR5190220,SRR5190220,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX2506040_SRR5190220_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX2506040_SRR5190220_2.fastq.gz,
SRR5190255,SRR5190255,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX2506075_SRR5190255_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX2506075_SRR5190255_2.fastq.gz,
SRR5190256,SRR5190256,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX2506076_SRR5190256_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX2506076_SRR5190256_2.fastq.gz,
SRR5831603,SRR5831603,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX3008909_SRR5831603_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX3008909_SRR5831603_2.fastq.gz,
SRR6144754,SRR6144754,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX3256856_SRR6144754_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX3256856_SRR6144754_2.fastq.gz,
SRR6144753,SRR6144753,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX3256857_SRR6144753_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX3256857_SRR6144753_2.fastq.gz,
SRR9016984,SRR9016984,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX5795138_SRR9016984_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX5795138_SRR9016984_2.fastq.gz,
SRR9016985,SRR9016985,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX5795139_SRR9016985_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX5795139_SRR9016985_2.fastq.gz,
SRR11734791,SRR11734791,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX8293690_SRR11734791_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX8293690_SRR11734791_2.fastq.gz,
SRR11734785,SRR11734785,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX8293696_SRR11734785_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX8293696_SRR11734785_2.fastq.gz,
SRR11734780,SRR11734780,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX8293701_SRR11734780_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX8293701_SRR11734780_2.fastq.gz,
SRR11734772,SRR11734772,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX8293709_SRR11734772_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX8293709_SRR11734772_2.fastq.gz,
SRR12217748,SRR12217748,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX8728196_SRR12217748_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX8728196_SRR12217748_2.fastq.gz,
SRR12217734,SRR12217734,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX8728210_SRR12217734_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX8728210_SRR12217734_2.fastq.gz,
SRR12959777,SRR12959777,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX9412560_SRR12959777_1.fastq.gz,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX9412560_SRR12959777_2.fastq.gz,
8 changes: 8 additions & 0 deletions branchwater-web_MAGtest/nfcore/MAG/paths_SE.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
sample,group,short_reads_1,short_reads_2,long_reads
ERR10162411,ERR10162411,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/ERX9699035_ERR10162411.fastq.gz,,
SRR14141927,SRR14141927,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX10511357_SRR14141927_1.fastq.gz,,
SRR2105903,SRR2105903,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX1099828_SRR2105903.fastq.gz,,
SRR20285055,SRR20285055,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX16318535_SRR20285055_1.fastq.gz,,
SRR20285028,SRR20285028,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX16318562_SRR20285028_1.fastq.gz,,
SRR5098299,SRR5098299,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX2415084_SRR5098299.fastq.gz,,
SRR5098319,SRR5098319,/90daydata/gbru_fy23_branchwater/multispecies/fetch/multi_fastq/fastq/SRX2415104_SRR5098319.fastq.gz,,
51 changes: 51 additions & 0 deletions branchwater-web_MAGtest/nfcore/MAG/software_versions.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
BOWTIE2_ASSEMBLY_ALIGN:
bowtie2: 2.4.2
pigz: 2.3.4
samtools: '1.11'
BOWTIE2_PHIX_REMOVAL_ALIGN:
bowtie2: 2.4.5
BUSCO:
R: 4.1.3
busco: 5.4.3
python: 3.9.13
CUSTOM_DUMPSOFTWAREVERSIONS:
python: 3.10.6
yaml: '6.0'
FASTP:
fastp: 0.23.2
FASTQC_RAW:
fastqc: 0.11.9
FASTQC_TRIMMED:
fastqc: 0.11.9
GTDBTK_CLASSIFY:
gtdbtk: 1.5.0
GUNZIP_BINS:
gunzip: '1.10'
MAG_DEPTHS:
pandas: 1.1.5
python: 3.6.7
MAG_DEPTHS_SUMMARY:
pandas: 1.4.3
python: 3.10.6
MEGAHIT:
megahit: 1.2.9
METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS:
metabat2: '2.15'
METABAT2_METABAT2:
metabat2: '2.15'
PRODIGAL:
pigz: '2.6'
prodigal: 2.6.3
QUAST:
metaquast: 5.0.2
python: 3.7.6
QUAST_BINS:
metaquast: 5.0.2
python: 3.7.6
SPLIT_FASTA:
biopython: 1.7.4
pandas: 1.1.5
python: 3.6.7
Workflow:
Nextflow: 23.04.0
nf-core/mag: 2.3.2
43 changes: 43 additions & 0 deletions branchwater-web_MAGtest/nfcore/fetchNGS/multispeciessra.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
SRR3458563
SRR3458562
ERR1992808
ERR2185279
SRR20950657
SRR20950787
SRR20285055
SRR14141927
ERR10162411
ERR2819892
SRR14842381
SRR6144754
SRR2243572
ERR5083269
SRR20285028
SRR6144753
SRR17231405
SRR17238487
SRR5190255
SRR12217748
SRR12217734
SRR18691161
SRR5190220
SRR18691155
SRR18691151
SRR5190256
ERR3333611
SRR18691152
SRR5190219
SRR22535178
SRR18691112
SRR12959777
SRR2105903
SRR5831603
SRR16797981
SRR5098299
SRR5098319
SRR9016985
SRR11734772
SRR11734785
SRR11734780
SRR9016984
SRR11734791
57 changes: 57 additions & 0 deletions branchwater-web_MAGtest/nfcore/fetchNGS/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Descriptive metadata for this configuration (home page, purpose, version).
manifest {
homePage = 'https://github.com/USDA-ARS-GBRU'
description = 'Config to pull SRA metagenomes found with branchwater'
version = '2.0'
}


// Execution profiles, selected on the command line with `-profile <name>`.
profiles {
    // Run every process on the local machine.
    standard {
        process.executor = 'local'
        process.cache    = 'lenient'
    }
    // Submit processes to SLURM under the project account on the 'atlas' queue.
    cluster {
        process.executor       = 'slurm'
        process.clusterOptions = '--account=gbru_fy23_branchwater'
        process.cache          = 'lenient'
        // `queue` is a process directive: it must be set as process.queue.
        // A bare `queue = ...` here only defines an unrelated top-level
        // setting, so submitted jobs would not carry the queue.
        process.queue          = 'atlas'
    }
}
// Executor settings: SLURM, queue size 100, at most ten submissions per second.
// NOTE(review): this scope sits outside `profiles`, so it is read with either
// profile - confirm that is intended when running with `standard`.
executor {
name = 'slurm'
queueSize = 100
submitRateLimit = '10/sec'
}
// Pipeline parameters: presumably the resource ceilings the pipeline caps
// requests at (confirm against the nf-core/fetchngs parameter docs).
params {
max_memory = 384.GB
max_cpus = 48
max_time = 14.d
}

// Run processes in Singularity containers, passing the proxy variables into
// the container environment and caching images in a shared directory.
// NOTE(review): cacheDir starts with /90daydata/ while the run notes use
// /project/90daydata/ - confirm both resolve to the same location.
singularity {
enabled = true
autoMounts = true
envWhitelist = "http_proxy, https_proxy"
cacheDir = '/90daydata/gbru_fy23_branchwater/bpm50/singularity_cache_dir'
}

// Per-process overrides, selected by process name or label.
process {
// Extra arguments for the SRATOOLS_PREFETCH process; presumably raises the
// prefetch download size limit to 300 GB - confirm against the sra-tools docs.
withName: SRATOOLS_PREFETCH {
ext.args = '--max-size 300g'
}

// Resource tiers keyed by process label.
// NOTE(review): confirm that the pipeline's processes actually carry the
// labels cpu_med / cpu_high / cpu_max; a selector that matches no process
// has no effect.
withLabel: cpu_med{
cpus = 4
memory = 64.GB
}
withLabel:cpu_high{
cpus = 8
memory = 64.GB
}
withLabel:cpu_max{
cpus = 24
memory = 192.GB
}
}


Loading