1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
//! Blunt-end trims using 0-based coordinates
//!
//! # Examples
//!
//! ## Trim five bases from the right side
//! ```bash
//! cat file.fastq | fasten_trim -l -5 > trimmed.fastq
//! ```
//!
//! ## Keep a maximum of 100bp
//! ```bash
//! cat file.fastq | fasten_trim -l 99 > trimmed.fastq
//! ```
//! ## Trim 5bp from the left side
//! ```bash
//! cat file.fastq | fasten_trim -f 4 > trimmed.fastq
//! ```
//!
//! # Usage
//!
//! ```text
//! Usage: fasten_trim [-h] [-n INT] [-p] [-v] [-f INT] [-l INT]
//!
//! Options:
//! -h, --help Print this help menu.
//! -n, --numcpus INT Number of CPUs (default: 1)
//! -p, --paired-end The input reads are interleaved paired-end
//! -v, --verbose Print more status messages
//! -f, --first-base INT
//! The first base to keep (default: 0)
//! -l, --last-base INT The last base to keep. If negative, counts from the
//! right. (default: 0)
//! ```
extern crate fasten;
extern crate statistical;
extern crate getopts;
extern crate threadpool;
use std::fs::File;
use std::io::BufReader;
use std::env;
use std::cmp::min;
use threadpool::ThreadPool;
use std::sync::mpsc::channel;
use fasten::fasten_base_options;
use fasten::logmsg;
use fasten::io::fastq;
use fasten::io::seq::Cleanable;
use fasten::io::seq::Seq;
/// Entry point: parses the command line, reads FASTQ records from stdin in
/// batches, hands each batch to `trim_worker` on a thread pool, and finally
/// prints every trimmed record to stdout.
fn main() {
    // Number of reads handed to a worker per job. It is even so that a full
    // batch never splits an interleaved read pair.
    const BATCH_SIZE: usize = 10_000;

    let args: Vec<String> = env::args().collect();
    let mut opts = fasten_base_options();

    // script-specific options
    opts.optopt("f","first-base","The first base to keep (default: 0)","INT");
    opts.optopt("l","last-base","The last base to keep. If negative, counts from the right. (default: 0)","INT");

    let matches = opts.parse(&args[1..]).expect("ERROR: could not parse parameters");

    if matches.opt_present("help") {
        println!("Blunt-end trims using 0-based coordinates\n{}", opts.usage(&opts.short_usage(&args[0])));
        std::process::exit(0);
    }

    let (tx, rx): (std::sync::mpsc::Sender<String>, std::sync::mpsc::Receiver<String>) = channel();

    // Both coordinates default to 0 when the option is absent.
    let first_base: usize = match matches.opt_str("first-base") {
        Some(value) => value.parse().expect("ERROR: --first-base is not an INT"),
        None => 0,
    };
    // NOTE(review): the help text advertises negative values, but the value is
    // parsed as `usize`, so a negative number is rejected here.
    let last_base: usize = match matches.opt_str("last-base") {
        Some(value) => value.parse().expect("ERROR: --last-base is not an INT"),
        None => 0,
    };

    // --numcpus is accepted but ignored: the pool always gets a single worker.
    if matches.opt_present("numcpus") {
        logmsg("Warning: multithreading this script currently slows it down. Resetting to 1 cpu. Avoid this warning by not using --numcpus");
    }
    let num_cpus: usize = 1;

    // Set up multithreading. Each job receives up to BATCH_SIZE reads.
    let pool = ThreadPool::with_name("worker".into(), num_cpus);

    // Read from stdin
    let my_file = File::open("/dev/stdin").expect("Could not open file");
    let my_buffer = BufReader::new(my_file);
    let fastq_reader = fastq::FastqReader::new(my_buffer);
    let mut fastq_iter = fastq_reader.into_iter();

    while let Some(seq) = fastq_iter.next() {
        let mut seqs: Vec<Seq> = Vec::with_capacity(BATCH_SIZE);
        seqs.push(seq);
        // Fill the rest of the batch straight from the reader. A short final
        // batch simply stays short; no placeholder records are created.
        seqs.extend(fastq_iter.by_ref().take(BATCH_SIZE - 1));

        // Send this batch to the queue; each job owns its own sender.
        let tx2 = tx.clone();
        pool.execute(move || {
            trim_worker(seqs, first_base, last_base, tx2);
        });
    }

    pool.join();
    drop(tx); // disconnects the channel so the receiving loop below terminates

    for entry in rx.iter() {
        println!("{}", entry);
    }
}
/// Trim a set of fastq entries and send each trimmed entry to a channel.
///
/// Every record is cut to the half-open range `first_base..last_base`
/// (0-based, end-exclusive) on both its sequence and its quality string, then
/// formatted as a four-line FASTQ entry (without a trailing newline) and sent
/// through `tx`.
///
/// * `seqs` - the batch of records to trim; records equal to `Seq::blank()`
///   in id, sequence and quality are treated as padding and skipped.
/// * `first_base` - index of the first base to keep.
/// * `last_base` - index one past the last base to keep; `0` means "keep up
///   to the end of the read".
/// * `tx` - channel on which the formatted entries are sent.
///
/// Out-of-range coordinates are clamped to the read, so a read shorter than
/// `first_base` is emitted with an empty sequence instead of panicking.
fn trim_worker(seqs:Vec<Seq>, first_base:usize, last_base:usize, tx:std::sync::mpsc::Sender<String> ){
    let blank_seq = Seq::blank();

    for seq in seqs {
        // Skip the placeholder records used to pad a batch.
        if seq.id == blank_seq.id && seq.seq == blank_seq.seq && seq.qual == blank_seq.qual {
            continue;
        }

        // Bounds are derived from the sequence length, then re-clamped to the
        // quality string so that a short quality line cannot cause a panic.
        let (start, end) = trim_bounds(seq.seq.len(), first_base, last_base);
        let qual_end = min(end, seq.qual.len());
        let qual_start = min(start, qual_end);

        let sequence = &seq.seq[start..end];
        let quality = &seq.qual[qual_start..qual_end];

        let trimmed = format!("{}\n{}\n+\n{}", seq.id, sequence, quality);
        if tx.send(trimmed).is_err() {
            // The receiver has been dropped, so nothing can consume further
            // output; stop instead of formatting the rest of the batch.
            break;
        }
    }
}

/// Compute the clamped `(start, end)` slice bounds for a read of length `len`.
///
/// `last_base == 0` selects the end of the read; otherwise the end is the
/// smaller of `last_base` and `len`. The start never exceeds the end.
fn trim_bounds(len: usize, first_base: usize, last_base: usize) -> (usize, usize) {
    let end = if last_base == 0 { len } else { min(len, last_base) };
    let start = min(first_base, end);
    (start, end)
}
/*
fn trim_worker_old(sub_lines_buffer:&mut Vec<String>, first_base:usize, last_base:usize, tx:std::sync::mpsc::Sender<String> ){
let this_thread = thread::current();
let _tid = this_thread.id(); // for debugging
sub_lines_buffer.reverse();
while sub_lines_buffer.len() > 0 {
//let mut entry_splice = &sub_lines_buffer.splice(0..4, vec![]);
//let entry = vec![entry_splice];
let id = sub_lines_buffer.pop().unwrap();
let mut seq = sub_lines_buffer.pop().unwrap();
let plus = sub_lines_buffer.pop().unwrap();
let mut qual = sub_lines_buffer.pop().unwrap();
let last_base_tmp = min(seq.len(), last_base);
seq = String::from(seq);
qual = String::from(qual);
let entry = format!("{}\n{}\n{}\n{}",
id, &seq[first_base..last_base_tmp], plus, &qual[first_base..last_base_tmp]
);
tx.send(entry).unwrap();
}
//eprintln!("{:?} finished {}", &_tid, &num_lines);
}
*/
