Load data | obi import --quality-sanger file_R1.fastq reads1 obi import --quality-sanger file_R2.fastq reads2 |
Import tags | obi import --ngsfilter-input taglist.txt ngsfile |
Align paired-end reads | obi alignpairedend -R reads2 reads1 aligned_reads |
Grep entries whose mode are alignment | obi grep -a mode:alignment aligned_reads good_sequences |
Assign alignments to individual PCRs | obi ngsfilter -t ngsfile -u unidentified_sequences good_sequences identified_sequences |
Filter out sequences | obi grep -p "sequence[’score']>50" identified_sequences identified_sequences_filtered obi grep -p "sequence[’score_norm']>0.9" identified_sequences_filtered identified_sequences_filtered_adj |
Dereplicate Sequences | obi uniq -m sample identified_sequences_filtered_adj dereplicated_sequences_filtered |
Keep only useful tags | obi annotate -k COUNT -k MERGED_sample dereplicated_sequences_filtered cleaned_metadata_sequences |
Discard sequences that are shorter than 60 bp (based on primer pair) | obi grep -p "len(sequence) ≥ 60 and sequence['COUNT'] ≥ 10" cleaned_metadata_sequences denoised_sequences |
Clean the sequences from PCR/sequencing errors | obi clean -s MERGED_sample -r 0.05 H denoised_sequences cleaned_sequences |
Load database | cp STD_MAM_1.dat.gz ~/edna_LauraB/master/mammalia/database/ |
Import it into DMS | obi import /data/scc/edna/LauraBa/master/mammalia/database/STD_MAM_1.dat.gz database_mam obi import --embl EMBL embl_refs |
Download the taxonomy | wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz |
Import the taxonomy in the DMS | obi import --taxdump /data/scc/edna/LauraBa/master/mammalia/taxdump.tar.gz taxonomy/my_tax |
Cleaning the database with in silico PCR | obi ecopcr -e 3 l 50 L 150 F CGAGAAGACCCTATGGAGCT -R CCGAGGTCRCCCCAACC --taxonomy taxonomy/my_tax embl_refs mam_refs |
Filter sequences | obi grep --require-rank=species --require-rank=genus --require-rank=family --taxonomy taxonomy/my_tax mam_refs mam_refs_clean |
Dereplicate identical sequences | obi uniq --taxonomy taxonomy/my_tax mam_refs_clean mam_refs_uniq |
Add taxid at the family level | obi grep --require-rank=family --taxonomy taxonomy/my_tax mam_refs_uniq |
Build the reference database | obi build_ref_db -t 0.97 --taxonomy taxonomy/my_tax mam_refs_uniq_clean mam_db_97 |
Assign each sequence to a taxon | obi ecotag -m 0.97 --taxonomy taxonomy/my_tax -R mam_db_97 cleaned_sequences assigned_sequences |
Align the sequences | obi align -t 0.95 assigned_sequences aligned_assigned_sequences |
Export tables for downstream data analysis | obi grep -A SCIENTIFIC_NAME assigned_sequences assigned_for_metabR |
Output two tables required by metabaR | obi annotate -k MERGED_sample assigned_for_metabR assigned_for_metabR_reads_tableobi export --tab-output --output-na-string 0 assigned_for_metabR_reads_table >mam_reads_01.txtobi annotate --taxonomy taxonomy/my_tax \ --with-taxon-at-rank superkingdom \ --with-taxon-at-rank kingdom \ --with-taxon-at-rank phylum \ --with-taxon-at-rank subphylum \ --with-taxon-at-rank class \ --with-taxon-at-rank subclass \ --with-taxon-at-rank order \ --with-taxon-at-rank suborder \ --with-taxon-at-rank infraorder \ --with-taxon-at-rank superfamily \ --with-taxon-at-rank family \ --with-taxon-at-rank genus \ --with-taxon-at-rank species \ --with-taxon-at-rank subspecies \ assigned_for_metabR assigned_for_metabR_taxInfo obi annotate \ -k BEST_IDENTITY -k TAXID -k SCIENTIFIC_NAME -k COUNT -k seq_length \ -k superkingdom_name \ -k kingdom_name \ -k phylum_name \ -k subphylum_name \ -k class_name \ -k subclass_name \ -k order_name \ -k suborder_name \ -k infraorder_name \ -k superfamily_name \ -k family_name \ -k genus_name \ -k species_name \ assigned_for_metabR_taxInfo assigned_for_metabR_motus obi export --tab-output assigned_for_metabR_motus >mam_motus_01.txt |
Further processing of the data sets was done using RStudio. |
Editing files for metabar | reads<- dt_reads %>% dplyr::select(-c("DEFINITION", "NUC_SEQ"))%>% as.data.frame() %>% janitor::row_to_names(row_number = 897, remove_rows_above = FALSE, remove_row = TRUE) %>% mutate_if(is.integer,as.numeric) |
Assign name to first column | reads <- cbind(rownames(reads),reads) rownames(reads) <- NULL colnames(reads) <- c(names(reads)) colnames(reads)(1) <- "pcr_id" |
Edit the names of the column | reads$pcr_id = strsplit(reads$pcr_id,"[.]") reads$pcr_id = sapply(reads$pcr_id, function(x) x[length(x)]) rownames(reads)<- reads$pcr_id |
Organizing the MOTUs table | motus<- dplyr::select(dt_motus, 'ID', 'NUC_SEQ', 'COUNT','BEST_IDENTITY', 'TAXID', 'SCIENTIFIC_NAME', ’superkingdom_name', ’species_name', 'class_name', 'order_name', 'family_name', 'genus_name', 'kingdom_name', 'phylum_name', ’subphylum_name', ’subclass_name', ’suborder_name') names(motus)[names(motus) == 'NUC_SEQ'] <- ’sequence' |