00001 #include "log.h"
00002 #include "format.h"
00003 #include "bowtieParser.h"
00004 #include "fasta.h"
00005 #include "gfr.h"
00006 #include <stdio.h>
00007 #include <sys/types.h>
00008 #include <unistd.h>
00009 #include "geneFusionsConfig.h"
00010 #include "intervalFind.h"
00011 #include "blastParser.h"
00012 #include "util.h"
00013
00014 #define MAX_FRACTION_HOMOLOGOUS 0.05
00015 #define MAX_OVERLAP_ALLOWED 0.75
00016
00017 static int sortBowtieQueriesBySequenceName (BowtieQuery *a, BowtieQuery *b)
00018 {
00019 return strcmp (a->sequenceName,b->sequenceName);
00020 }
00021
00022
00023
00024 static int sortFastaSequencesByName (Seq *a, Seq *b)
00025 {
00026 return strcmp (a->name,b->name);
00027 }
00028
00029
00030
00031 static char getTranscriptNumber (char *seqName)
00032 {
00033 char *pos;
00034
00035 pos = strchr (seqName,'|');
00036 return *(pos + 1);
00037 }
00038
00039 int main (int argc, char *argv[])
00040 {
00041 GfrEntry *currGE;
00042 int i,j,k,l,index;
00043 Stringa buffer,cmd,fnSequencesToAlign;
00044 FILE *fp;
00045 FILE *fp1;
00046 FILE *fp2;
00047 FILE *freads1;
00048 FILE *freads2;
00049 Array gfrEntries;
00050 BowtieQuery *currBQ,testBQ;
00051 BowtieEntry *currBE;
00052 Texta seqNames;
00053 int readSize1, readSize2;
00054 Array bowtieQueries;
00055 char transcriptNumber;
00056 int isHomologous,homologousCount;
00057 int count;
00058 int countRemoved;
00059 BlatQuery *blQ;
00060
00061
00062 Array transcriptome = arrayCreate( 20000, Seq );
00063 buffer = stringCreate(100);
00064 stringPrintf( buffer, "%s/%s", TRANSCRIPT_COMPOSITE_MODEL_DIR, TRANSCRIPT_COMPOSITE_MODEL_FA_FILENAME);
00065
00066 fasta_initFromFile( string(buffer) );
00067 transcriptome = fasta_readAllSequences( 0 );
00068 fasta_deInit();
00069 arraySort( transcriptome,(ARRAYORDERF)sortFastaSequencesByName );
00070
00071 gfr_init ("-");
00072 gfrEntries = gfr_parse ();
00073 if (arrayMax (gfrEntries) == 0){
00074 puts (gfr_writeHeader ());
00075 gfr_deInit ();
00076 return 0;
00077 }
00078 seqNames = textCreate (10000);
00079 buffer = stringCreate (100);
00080 cmd = stringCreate (100);
00081 fnSequencesToAlign = stringCreate (100);
00082 stringPrintf (fnSequencesToAlign,"sequences_%d.fasta",getpid ());
00083 if (!(fp = fopen (string (fnSequencesToAlign),"w"))) {
00084 die ("Unable to open file: %s",string (fnSequencesToAlign));
00085 }
00086 for (i = 0; i < arrayMax (gfrEntries); i++) {
00087 currGE = arrp (gfrEntries,i,GfrEntry);
00088 for (j = 0; j < arrayMax (currGE->readsTranscript1); j++) {
00089 stringPrintf (buffer,"%s|1|%05d",currGE->id,j + 1);
00090 textAdd (seqNames,string (buffer));
00091 fprintf (fp,">%s\n%s\n",string (buffer),textItem (currGE->readsTranscript1,j));
00092 }
00093 for (j = 0; j < arrayMax (currGE->readsTranscript2); j++) {
00094 stringPrintf (buffer,"%s|2|%05d",currGE->id,j + 1);
00095 textAdd (seqNames,string (buffer));
00096 fprintf (fp,">%s\n%s\n",string (buffer),textItem (currGE->readsTranscript2,j));
00097 }
00098 }
00099 fclose (fp);
00100 stringPrintf (cmd,"bowtie -a -v 3 --quiet -f %s/%s/%s %s", BOWTIE_INDEXES, BOWTIE_COMPOSITE, BOWTIE_COMPOSITE, string (fnSequencesToAlign) );
00101 bowtieParser_initFromPipe (string (cmd));
00102 bowtieQueries = bowtieParser_getAllQueries ();
00103 bowtieParser_deInit ();
00104 arraySort (bowtieQueries,(ARRAYORDERF)sortBowtieQueriesBySequenceName);
00105
00106 count = 0;
00107 countRemoved = 0;
00108 puts (gfr_writeHeader ());
00109 j = 0;
00110 for (i = 0; i < arrayMax (gfrEntries); i++) {
00111 currGE = arrp (gfrEntries,i,GfrEntry);
00112 homologousCount = 0;
00113 while (j < arrayMax (seqNames)) {
00114 if (strStartsWith (textItem (seqNames,j),currGE->id)) {
00115
00116 testBQ.sequenceName = hlr_strdup (textItem (seqNames,j));
00117 if (arrayFind (bowtieQueries,&testBQ,&index,(ARRAYORDERF)sortBowtieQueriesBySequenceName)) {
00118 isHomologous = 0;
00119 transcriptNumber = getTranscriptNumber (textItem (seqNames,j));
00120 currBQ = arrp (bowtieQueries,index,BowtieQuery);
00121 k = 0;
00122 while (k < arrayMax (currBQ->entries)) {
00123 currBE = arrp (currBQ->entries,k,BowtieEntry);
00124 if (transcriptNumber == '1') {
00125 if (strEqual (currBE->chromosome,currGE->nameTranscript2)) {
00126 isHomologous = 1;
00127 break;
00128 }
00129 }
00130 if (transcriptNumber == '2') {
00131 if (strEqual (currBE->chromosome,currGE->nameTranscript1)) {
00132 isHomologous = 1;
00133 break;
00134 }
00135 }
00136 k++;
00137 }
00138
00139 if (isHomologous == 1) {
00140 homologousCount++;
00141 }
00142 }
00143 hlr_free (testBQ.sequenceName);
00144 }
00145 else {
00146 break;
00147 }
00148 j++;
00149 }
00150
00151 if (((double)homologousCount / arrayMax(currGE->readsTranscript1)) <= MAX_FRACTION_HOMOLOGOUS) {
00152
00153
00154
00155 Stringa fa1 = stringCreate( 100 );
00156 Stringa fa2 = stringCreate( 100 );
00157 stringPrintf( fa1, "%s_transcript1.fa", currGE->id);
00158 if (!(fp1 = fopen ( string(fa1) ,"w"))) {
00159 die ("Unable to open file: %s",string (fa1));
00160 }
00161 arrayFind(transcriptome, &currGE->nameTranscript1, &index,(ARRAYORDERF) sortFastaSequencesByName);
00162 fprintf( fp1, ">%s\n%s\n", arrp( transcriptome, index, Seq )->name,arrp( transcriptome, index, Seq )->sequence );
00163 fclose( fp1 );
00164
00165 stringPrintf( fa2, "%s_transcript2.fa", currGE->id);
00166 if (!(fp2 = fopen ( string(fa2) ,"w"))) {
00167 die ("Unable to open file: %s",string (fa2));
00168 }
00169 arrayFind(transcriptome, &currGE->nameTranscript2, &index,(ARRAYORDERF) sortFastaSequencesByName);
00170 fprintf( fp2, ">%s\n%s\n", arrp( transcriptome, index, Seq )->name,arrp( transcriptome, index, Seq )->sequence );
00171 fclose( fp2 );
00172
00173
00174 stringPrintf( fa1, "%s_reads1.fa", currGE->id);
00175 if (!(freads1 = fopen ( string(fa1) ,"w"))) {
00176 die ("Unable to open file: %s",string (fa1));
00177 }
00178
00179 for (l = 0; l < arrayMax (currGE->readsTranscript1); l++) {
00180 char* currRead1 = hlr_strdup( textItem (currGE->readsTranscript1,l));
00181 readSize1 = strlen( currRead1 );
00182 fprintf( freads1, ">%d\n%s\n", l, currRead1 );
00183 hlr_free( currRead1 );
00184 }
00185 fclose( freads1 );
00186
00187 stringPrintf( fa2, "%s_reads2.fa", currGE->id);
00188 if (!(freads2 = fopen ( string(fa2) ,"w"))) {
00189 die ("Unable to open file: %s",string (fa2));
00190 }
00191
00192 for (l = 0; l < arrayMax (currGE->readsTranscript2); l++) {
00193 char* currRead2 = hlr_strdup( textItem (currGE->readsTranscript2,l));
00194 readSize2 = strlen( currRead2 );
00195 fprintf( freads2, ">%d\n%s\n", l, currRead2 );
00196 hlr_free( currRead2 );
00197 }
00198 fclose( freads2 );
00199
00200
00201 stringPrintf( cmd, "blat -t=dna -out=psl -fine -tileSize=15 %s_transcript1.fa %s_reads2.fa stdout", currGE->id, currGE->id );
00202
00203
00204 blatParser_initFromPipe( string(cmd) );
00205 while( blQ = blatParser_nextQuery() ) {
00206 int nucleotideOverlap = getNucleotideOverlap ( blQ );
00207 if ( nucleotideOverlap > ( ((double)readSize2)*MAX_OVERLAP_ALLOWED) )
00208 homologousCount++;
00209 }
00210 blatParser_deInit();
00211
00212
00213 stringPrintf( cmd, "blat -t=dna -out=psl -fine -tileSize=15 %s_transcript2.fa %s_reads1.fa stdout", currGE->id, currGE->id );
00214 blatParser_initFromPipe( string(cmd) );
00215 while( blQ = blatParser_nextQuery() ) {
00216 int nucleotideOverlap = getNucleotideOverlap ( blQ );
00217 if ( nucleotideOverlap > ( ((double)readSize1)*MAX_OVERLAP_ALLOWED) ) {
00218 homologousCount++;
00219 }
00220 }
00221 blatParser_deInit();
00222
00223
00224 if (((double)homologousCount / (double)arrayMax(currGE->readsTranscript1)) <= MAX_FRACTION_HOMOLOGOUS) {
00225
00226 puts (gfr_writeGfrEntry (currGE));
00227 count++;
00228 } else {
00229 countRemoved++;
00230 }
00231
00232 stringPrintf (cmd,"rm -rf %s_reads?.fa %s_transcript?.fa", currGE->id,currGE->id);
00233 hlr_system( string(cmd) , 0);
00234 }
00235 else {
00236 countRemoved++;
00237 }
00238 }
00239 gfr_deInit ();
00240 stringPrintf (cmd,"rm -rf %s",string (fnSequencesToAlign));
00241 hlr_system (string (cmd),0);
00242
00243 stringDestroy (fnSequencesToAlign);
00244 stringDestroy (cmd);
00245 stringDestroy (buffer);
00246 warn ("%s_numRemoved: %d",argv[0],countRemoved);
00247 warn ("%s_numGfrEntries: %d",argv[0],count);
00248 return 0;
00249 }
00250