/**
 * Build the cross-fish summary sheet (one header row plus one row per entry in `fishies`)
 * and save it via xlsx.save() as "<output_filename>.xlsx", using `flag` as the third argument.
 *
 * @param {string} flag  Output mode:
 *                       "C" - one column per gene holding the gene's hit count;
 *                       "N" - two columns per gene holding the digits of a numeric genotype code
 *                             ("11", "22", "12", "00"), or "-" when the fish is below `thresh`;
 *                       anything else (e.g. "S") - two columns per gene holding the genotype letters.
 * @param {number} thresh  Minimum fish.pct_typed for a fish to get numeric codes in "N" mode.
 *                         XXX only the "N" mode honours it - why not the others?
 * @param {string} output_filename  Output path without the ".xlsx" extension.
 */
var compile = function(flag, thresh, output_filename) {
    var SEX_MARKER = "Ots_SEXY3-1";
    var NUMERIC_CODES = { A1HOM: "11", A2HOM: "22", HET: "12" };

    // Map a genotype class to its two-digit code; "-" when the fish is not typed well enough,
    // "00" for any class that is not a called genotype (e.g. "NA").
    function numericCode(genoclass, enough_typed) {
        if(!enough_typed) return "-";
        return Object.prototype.hasOwnProperty.call(NUMERIC_CODES, genoclass) ? NUMERIC_CODES[genoclass] : "00";
    }

    // Build the header as an array directly (no join/split round trip), so a gene name
    // containing a comma cannot spill into extra columns.
    var header = ["Sample", "Raw Reads", "On-Target Reads", "%On-Target", "%GT", "IFI", ""];
    if(sex_fp) {
        // Rows only carry the sex cell and its spacer when a sex primer is configured;
        // the header has to follow the same rule or every gene column is shifted by two.
        header.push(SEX_MARKER, "");
    }
    gene_info.forEach(function(g) {
        if(flag === "C") {
            header.push(String(g.name));
        } else {
            header.push(g.name + "_1", g.name + "_2");
        }
    });

    var aa = [header];

    // iterate through all the little fishies
    for(var name in fishies) {
        var fish = fishies[name];
        var enough_typed = fish.pct_typed >= thresh;

        var row = [
            fish.name,
            fish.raw_reads,
            fish.hits,
            fish.hit_pct,
            mk_pct(gene_info.length, fish.num_typed),
            fish.ifi,
            "",
        ];

        if(sex_fp) {
            if(flag === "C") {
                row.push(fish.sex_hits);
            } else if(flag === "N") {
                row.push(numericCode(fish.sex_genoclass, enough_typed));
            } else {
                row.push(fish.sex_genotype);
            }
            row.push("");    // spacer column between the sex marker and the genes
        }

        // one column ("C") or two columns (all other modes) per gene
        gene_info.forEach(function(g) {
            var fg = fish.genes[g.name];
            if(flag === "C") {
                row.push(fg.hits);
            } else if(flag === "N") {
                var code = numericCode(fg.genoclass, enough_typed);
                if(code === "-") {
                    // Repeat the placeholder in both allele columns; indexing "-"[1]
                    // would put an undefined cell in the sheet.
                    row.push(code, code);
                } else {
                    row.push(code[0], code[1]);
                }
            } else if(fg.genotype[0] === "-") {
                // untyped markers such as "-lac-" are repeated whole in both allele columns
                row.push(fg.genotype, fg.genotype);
            } else {
                row.push(fg.genotype[0], fg.genotype[1]);
            }
        });

        aa.push(row);
    }

    xlsx.save(aa, output_filename + ".xlsx", flag);
};
// Decompress one fish's fastq file, tally its reads per unique sequence, count primer/probe hits
// for every gene in gene_info, derive genotypes (and sex, when sex_fp is set), write the
// "<outpath>-hash.json" and "<outpath>-genos.xlsx" reports, then register the fish in `fishies`.
gunzip(inpath, function(data) {
    // data is the uncompressed contents of the entire fastq file.
    // Whitespace-only content holds no reads either, so it is skipped the same way as empty data
    // instead of crashing in the parser below.
    if(!data || !data.trim()) {
        log("Skipping empty input file: "+file);
        finish();
        return;
    }

    // -----------------------------
    // Hash the sequence data
    // -----------------------------

    var lines = data.trim().split("\n");    // break the data into lines

    // Collect the nucleotide sequence (2nd line) of every 4-line record, discarding the rest.
    var reads = [];
    for(var i = 0; i < lines.length; i += 4) {
        var separator = lines[i + 2];
        // sanity check - expect the 3rd line of the record to be just a "+" sign; a truncated
        // record (line missing) fails the same check rather than raising a TypeError
        throwIf( separator === undefined || separator.trim() !== "+" );
        reads.push( lines[i + 1].trim() );
    }
    // 'reads' should now look like [ "ACTG...", "GTCA...", ... ]

    log("Fish: \""+fish.name+"\" ("+reads.length+" sequences)");
    fish.raw_reads = reads.length;    // note # of raw reads found in fastq file (sex line not included)

    // Count how many times each unique sequence appears. The table has no prototype, so a
    // key can never collide with an inherited Object property.
    var tally = Object.create(null);
    reads.forEach(function(read) {
        tally[read] = (tally[read] || 0) + 1;
    });
    // 'tally' now looks like { "ACTG...": 123, "GTCA...": 456, ... }

    // Convert the tally into an array of { sequence, count } objects ...
    var sequences = [];
    for(var key in tally) {
        sequences.push( { sequence: key, count: tally[key] } );
    }
    // ... sorted largest count to smallest count.
    sequences.sort(function(x, y) {
        if(x.count < y.count) return 1;
        if(x.count > y.count) return -1;
        return 0;
    });
    // 'sequences' now looks like [ { sequence: "GTCA", count: 456 }, ... ]

    // Write real JSON: util.inspect() output is not parseable as JSON and, by default,
    // elides every array entry after the first 100.
    fs.writeFileSync(outpath+"-hash.json", JSON.stringify(sequences, null, 2), "utf8");

    // -----------------------------
    // Scan for and count genes and alleles
    // -----------------------------

    fish.genes = {};    // this holds info for this fish related to the genes we're looking for
    fish.hits = 0;      // hits is my name for "on-target reads"

    // traverse the list of genes in the assay/locus data
    gene_info.forEach(function(g) {
        var fwd_prm = g.fwd_prm;    // the fwd primer sequence for this gene

        // probes (and their reverse complements) are treated as regular expressions
        var rx_p1 = new RegExp( g.probe1 );
        var rx_p1rc = new RegExp( g.probe1rc );
        var rx_p2 = new RegExp( g.probe2 );
        var rx_p2rc = new RegExp( g.probe2rc );

        // init counters
        var fp_hits = 0;    // # of times fwd prm seen
        var p1_hits = 0;    // # of times fwd prm AND probe1 seen together
        var p2_hits = 0;    // # of times fwd prm AND probe2 seen together

        sequences.forEach(function(sc) {
            var seq = sc.sequence;    // the nucleotide sequence
            var count = sc.count;     // # time seq seen in fastq data

            if(seq.indexOf(fwd_prm) !== 0) {
                return;    // sequence does not "start" with fwd prm
            }
            fp_hits += count;
            if( rx_p1.test(seq) || rx_p1rc.test(seq) ) {
                // sequence contains either probe1 or its RC (probe1 wins if both probes match)
                p1_hits += count;
            }
            else if( rx_p2.test(seq) || rx_p2rc.test(seq) ) {
                // sequence contains either probe2 or its RC
                p2_hits += count;
            }
        });

        // create gene tracking object for this gene (for this fish)
        var fg = {};
        fish.genes[g.name] = fg;    // attach tracking object to fish using gene name as tag (same as gene_info)
        fg.p1_hits = p1_hits;       // probe1 hits for this gene, this fish
        fg.p2_hits = p2_hits;       // probe2 hits for this gene, this fish
        fg.hits = p1_hits + p2_hits;    // sum of probe1 and probe2 hits (on-target reads for this gene, this fish)
        fg.hit_pct = mk_pct(fp_hits, fg.hits);    // ratio of probe-hits:fwd-prm-hits

        fish.hits += fg.hits;    // add all the hits for this gene to the # of hits for the whole fish
    });

    // compute hit_pct_fish after all genes scanned so fish.hits is valid when I do so
    gene_info.forEach(function(g) {
        var fg = fish.genes[g.name];    // gene tracking object
        fg.hit_pct_fish = mk_pct(fish.hits, fg.hits);
    });

    // -----------------------------
    // Derive genotype info
    // -----------------------------

    // XXX ifi?
    fish.hom_ct = 0;      // corrected reads of the dominant allele over hom / in-between genes
    fish.bkgrd_ct = 0;    // corrected reads of the other allele over the same genes
    fish.ifi = 0;

    fish.num_typed = 0;
    fish.num_typed_hom = 0;
    fish.num_typed_hom_a1 = 0;
    fish.num_typed_hom_a2 = 0;
    fish.num_typed_het = 0;

    gene_info.forEach(function(g) {
        var fg = fish.genes[g.name];    // gene tracking object

        // uncorrected a1:a2 ratio (a zero count is replaced by 0.1 to keep the ratio finite)
        fg.a1a2_ratio_uncorr = toInt(((fg.p1_hits || 0.1) / (fg.p2_hits || 0.1)) * 1000) / 1000;

        // apply correction factors (XXX wtf is this anyway?)
        fg.corr_p1_hits = fg.p1_hits - ((fg.hits / 4) * g.a1_corr);
        fg.corr_p2_hits = fg.p2_hits - ((fg.hits / 4) * g.a2_corr);
        if(fg.corr_p1_hits < 0) fg.corr_p1_hits = 0;
        if(fg.corr_p2_hits < 0) fg.corr_p2_hits = 0;
        fg.corr_p1_hits = toInt(fg.corr_p1_hits);
        fg.corr_p2_hits = toInt(fg.corr_p2_hits);

        // a1:a2 ratio with correction
        fg.a1a2_ratio = toInt(((fg.corr_p1_hits || 0.1) / (fg.corr_p2_hits || 0.1)) * 1000) / 1000;

        if((fg.corr_p1_hits + fg.corr_p2_hits) < 10) {
            // low allele count
            fg.genotype = "-lac-";
            fg.genoclass = "NA";
            return;
        }

        if(fg.a1a2_ratio >= 10) {
            // allele1 homozygotes
            fg.genotype = g.allele1 + g.allele1;
            fg.genoclass = "A1HOM";
            fish.num_typed += 1;
            fish.num_typed_hom += 1;
            fish.num_typed_hom_a1 += 1;
            fish.hom_ct += fg.corr_p1_hits;
            fish.bkgrd_ct += fg.corr_p2_hits;
        }
        else if(fg.a1a2_ratio >= 5) {
            // in-betweeners (not typed, but still counted towards hom_ct / bkgrd_ct)
            fg.genotype = "-ib1-";
            fg.genoclass = "NA";
            fish.hom_ct += fg.corr_p1_hits;
            fish.bkgrd_ct += fg.corr_p2_hits;
        }
        else if(fg.a1a2_ratio >= 0.2) {
            // heterozygotes
            fg.genotype = g.allele1 + g.allele2;
            fg.genoclass = "HET";
            fish.num_typed += 1;
            fish.num_typed_het += 1;
        }
        else if(fg.a1a2_ratio >= 0.1) {
            // in-betweeners (not typed, but still counted towards hom_ct / bkgrd_ct)
            fg.genotype = "-ib2-";
            fg.genoclass = "NA";
            fish.hom_ct += fg.corr_p2_hits;
            fish.bkgrd_ct += fg.corr_p1_hits;
        }
        else {
            // allele2 homozygotes
            fg.genotype = g.allele2 + g.allele2;
            fg.genoclass = "A2HOM";
            fish.num_typed += 1;
            fish.num_typed_hom += 1;
            fish.num_typed_hom_a2 += 1;
            fish.hom_ct += fg.corr_p2_hits;
            fish.bkgrd_ct += fg.corr_p1_hits;
        }
    });

    fish.ifi = mk_pct(fish.hom_ct, fish.bkgrd_ct);
    fish.hit_pct = mk_pct(fish.raw_reads, fish.hits);
    fish.pct_typed = mk_pct(gene_info.length, fish.num_typed);

    // -----------------------------
    // Write out the genos file
    // -----------------------------

    var aa = [];
    aa.push([file]);
    aa.push(["Raw-Reads", fish.raw_reads]);
    aa.push(["On-Target reads", fish.hits]);
    aa.push(["% On-Target", fish.hit_pct]);
    aa.push(["IFI score", fish.ifi]);
    aa.push([
        "Gene",
        "# A1",
        "# A2",
        "A1:A2 ratio",
        "# A1 corr.",
        "# A2 corr.",
        "A1:A2 ratio corr.",
        "Geno type",
        "Geno class",
        "A1 corr.",
        "A2 corr.",
        "# Gene reads",
        "% On-target gene",
        "% On-target fish ",
    ]);
    aa.push([]);

    // determine sex and write out line for it
    var fp_hits = 0;     // reads starting with the sex fwd primer
    var prb_hits = 0;    // ... that also contain the sex probe
    fish.sex_hits = 0;
    if(sex_fp) {
        sequences.forEach(function(sc) {
            var seq = sc.sequence;
            var count = sc.count;
            if(seq.indexOf(sex_fp) === 0) {
                fp_hits += count;
                if(seq.indexOf(sex_prb) !== -1) {
                    prb_hits += count;
                    fish.sex_hits += count;
                }
            }
        });

        if(fp_hits === 0) fp_hits = 1;
        var hit_pct = mk_pct(fp_hits, prb_hits);    // taken before prb_hits is clamped below

        var adj_hits = toInt(fish.hits * 0.004);    // XXX ??
        // both counts are clamped to at least 1 before the ratio is taken,
        // so the "X=" / "Y=" cells below show the clamped values
        if(adj_hits === 0) adj_hits = 1;
        if(prb_hits === 0) prb_hits = 1;
        var ratio = Math.round((adj_hits / prb_hits) * 1000) / 1000;

        if(adj_hits + prb_hits < 10) {
            fish.sex_genotype = "-lac-";
            fish.sex_genoclass = "NA";
        }
        else if(ratio >= 10) {
            fish.sex_genotype = "XX";
            fish.sex_genoclass = "A1HOM";
        }
        else if(ratio >= 5) {
            fish.sex_genotype = "-ib1-";
            fish.sex_genoclass = "NA";
        }
        else if(ratio >= 0.2) {
            fish.sex_genotype = "XY";
            fish.sex_genoclass = "HET";
        }
        else if(ratio >= 0.1) {
            fish.sex_genotype = "-ib2-";
            fish.sex_genoclass = "NA";
        }
        else {
            fish.sex_genotype = "XY";
            fish.sex_genoclass = "A2HOM";
        }

        aa.push( ["Ots_SEXY3-1","X="+adj_hits,"Y="+prb_hits,ratio,"","","",fish.sex_genotype,fish.sex_genoclass,"","",fish.sex_hits,hit_pct]);
        aa.push([]);
    }

    // Write out a line for each gene
    gene_info.forEach(function(g) {
        var fg = fish.genes[g.name];
        aa.push([
            g.name,                             // gene name
            g.allele1 + "="+ fg.p1_hits,        // # of reads for allele 1
            g.allele2 + "="+fg.p2_hits,         // # of reads for allele 2
            fg.a1a2_ratio_uncorr,               // ratio A1:A2
            g.allele1 + "="+ fg.corr_p1_hits,   // # of reads for allele 1 corrected
            g.allele2 + "="+fg.corr_p2_hits,    // # of reads for allele 2 corrected
            fg.a1a2_ratio,                      // ratio A1:A2 corrected
            fg.genotype,                        // genotype
            fg.genoclass,                       // genotype class (HOM vs HET)
            g.a1_corr,                          // A1 correction factor
            g.a2_corr,                          // A2 correction factor
            fg.hits,                            // # of reads for gene
            fg.hit_pct,                         // % on target for gene only
            fg.hit_pct_fish,                    // % on target for gene in total on target reads (total on-target for fish)
            "-",
            "-",
        ]);
    });
    aa.push([]);

    xlsx.save(aa, outpath + "-genos.xlsx", file);

    // All done with this fish; add it to the growing school of processed fish.
    fishies[fish.name] = fish;

    finish();
});