So I have a gene alignment result (I think it's from RNA-seq) in which specific sequences are matched to certain genes (those genes repeat themselves sometimes). I am now using C++ to find the earliest possible position at which most unique genes have been counted and I can confidently use only the first part for further analysis. The problem is that the file I have is sorted (so sequences from the same gene are put together), but I want to calculate that for non-sorted files.
What I am doing now is to manually std::shuffle
my std::vector<string> geneList
before iterating through it. In each iteration I would generate a unique gene list at the same time and compare every coming gene from the geneList to the whole unique list to see if they are unique. Then I would sample count_gene and count_unique_gene with some interval and finally get the x% position. This is very costly...with only 140,000 genes costing me several minutes. Sample code (also included my input code for better understanding):
#include <stdlib.h>
#include <stdint.h>
#include <iostream>
#include <string>
#include <vector>
#include <set>
#include <fstream>
#include <sstream>
#include <random>
#include <algorithm>
class COUNT{
public:
// COUNT() : sampRate(1) {};
// ~COUNT(){};
void initialize();
void readData();
void calcCount();
void writeCount();
private:
int sampRate;
int countRead;
int countUniqGene;
string fname;
vector<string> geneList;
vector<int> sampCountRead, sampCountUniqGene;
};
void COUNT::readData(){
std::cout<<"Type in fname: "<<endl;
// std::cin>>fname;
fname = "testdata.sam";
std::cout<<"Type in sample rate: "<<endl;
std::cin>>sampRate;
std::string line;
std::fstream readFile(fname);
while (getline(readFile, line)) {
if (((char)line.back() == '-') | ((char)line.back() == '+')) {
std::istringstream ss(line);
std::string others, oriGeneName, realGeneName;
ss >> others; ss >> others; ss >> others; ss >> others; ss >> others; ss >> others; ss >> others; ss >> others; ss >> others; ss >> others; ss >> others; ss >> others; ss >> others; ss >> others; ss >> others; ss >> others; ss >> others; ss >> others;
if (ss >> others) {
ss >> oriGeneName;
realGeneName = oriGeneName.substr(5, oriGeneName.size());
geneList.push_back(realGeneName);
}
else continue;
}
else continue;
}
readFile.close();
}
void COUNT::calcCount() {
set<string> uniqGeneList = {"test"};
auto rng = std::default_random_engine{};
std::shuffle(std::begin(geneList), std::end(geneList), rng); // this is fast (0.02s)
vector<string>::iterator uniqIter;
string geneName;
for (vector<string>::iterator iter = geneList.begin(); iter != geneList.end(); ++iter) { //this is very slow
geneName = string(iter[0]);
if (all_of(uniqGeneList.begin(), uniqGeneList.end(), [geneName](const std::string gene) {return geneName != gene; })) {
uniqGeneList.insert(geneName);
countUniqGene++;
}
countRead++;
if (countRead % sampRate < 1) {
sampCountRead.push_back(countRead);
sampCountUniqGene.push_back(countUniqGene);
}
}
}
int main(){
(not important, plotting and intersecting)
}
Also sample data (I extracted genename from the 'GE:Z:<gene_name>' item):
V300067289_HH26L1C001R0010008289 784 7 141178046 3 2S48M * 0 0 CCCCACCTGCTTGCGGACCCTAATGTGACGTTGGCGGATGAGCACACGGG F)BF;E2A3*F<+AFFB-B,FE?FEFFF@EF3BFB;<:FECEF2DFF@CE NH:i:2 HI:i:1 AS:i:47 nM:i:0 CB:Z:53_34_81098_51183 UR:Z:GTTTTATTA UY:Z:E/E@EAG?F
V300067289_HH26L1C001R0010008294 1040 3 34078775 255 50M * 0 0 CCTTGTCTGGGTGATTTAATAGCATAATCCGGCGATGAGCATCCCTGATC FGFBGFFFDEGEBFEFBDFEEFFGCFFFADEEFGFFFFGFGFFDFFEEFF NH:i:1 HI:i:1 AS:i:49 nM:i:0 CB:Z:49_31_75043_46832 UR:Z:CCGGACCCA UY:Z:EEEFDBFEA XF:Z:CODING GE:Z:Dnajc19 GS:Z:-
V300067289_HH26L1C001R0010008295 1040 3 34078777 255 2S48M * 0 0 CGTTGTCTGGGTGATTTAATACCAAAATCCGGCGATGAGCATCCCTGATC E(FC6D>+EDEFECA?@?:C.'D4&03@&;:3?7CFAEF@C7A4.4.@B? NH:i:1 HI:i:1 AS:i:43 nM:i:2 CB:Z:49_31_75043_46832 UR:Z:CCGGACCCA UY:Z:CFFD?EEDF XF:Z:CODING GE:Z:Dnajc19 GS:Z:-
V300067289_HH26L1C001R0010008298 1040 15 82351046 255 50M * 0 0 ACTTTATCCCGTCCTTGTTTCACCGTGATATCCAGCTGCATTAAGTGCAC )EFFF=FE?DC=FABGFFF7F&=FEF9FBEE=BEB9FFF;FFCF9DBF86 NH:i:1 HI:i:1 AS:i:49 nM:i:0 CB:Z:49_34_74093_51329 UR:Z:CAATATAGG UY:Z:DFFFFFFFF XF:Z:CODING GE:Z:Ndufa6 GS:Z:-
I have thought of assuming gene appearance to follow a Poisson distribution and just count the unique gene number and calculate the confidence level of attaining x% at each position. But better use a simulation first. Thanks in advance!
Aucun commentaire:
Enregistrer un commentaire