00001 #include "log.h"
00002 #include "format.h"
00003 #include "bowtieParser.h"
00004 #include "fasta.h"
00005 #include "gfr.h"
00006 #include <stdio.h>
00007 #include <sys/types.h>
00008 #include <unistd.h>
00009 #include "geneFusionsConfig.h"
00010 #include "intervalFind.h"
00011 #include "blatParser.h"
00012 #include "util.h"
00013
00014 #define MAX_FRACTION_HOMOLOGOUS 0.05
00015 #define MAX_OVERLAP_ALLOWED 0.75
00016
00017 int main (int argc, char *argv[])
00018 {
00019 GfrEntry *currGE;
00020 int i,j,l;
00021 Stringa cmd;
00022 FILE *freads;
00023 Array gfrEntries;
00024 int ribosomalCount;
00025 int count;
00026 int countRemoved;
00027 int readSize1,readSize2;
00028 BlatQuery *blQ=NULL;
00029
00030 gfr_init ("-");
00031 gfrEntries = arrayCreate( 100, GfrEntry );
00032 gfrEntries = gfr_parse ();
00033 if (arrayMax (gfrEntries) == 0){
00034 puts (gfr_writeHeader ());
00035 gfr_deInit ();
00036 return 0;
00037 }
00038
00039 cmd = stringCreate (100);
00040 count = 0;
00041 countRemoved = 0;
00042 puts (gfr_writeHeader ());
00043 j = 0;
00044 for (i = 0; i < arrayMax (gfrEntries); i++) {
00045 currGE = arrp (gfrEntries,i,GfrEntry);
00046 ribosomalCount = 0;
00047
00048
00049 Stringa readsFA = stringCreate( 100 );
00050
00051
00052 stringPrintf( readsFA, "%s_reads.fa", currGE->id);
00053 freads = fopen ( string(readsFA) ,"w");
00054 if (freads == NULL) {
00055 die ("Unable to open file: %s",string (readsFA));
00056 }
00057 if( arrayMax( currGE->readsTranscript1 ) != arrayMax( currGE->readsTranscript2 ) )
00058 die("Error: different number of inter-transcript reads %d vs. %d", arrayMax( currGE->readsTranscript1 ),arrayMax( currGE->readsTranscript2 ));
00059
00060
00061 for (l = 0; l < arrayMax (currGE->readsTranscript1); l++) {
00062 char* currRead1 = hlr_strdup( textItem (currGE->readsTranscript1,l));
00063 char* currRead2 = hlr_strdup( textItem (currGE->readsTranscript2,l));
00064 fprintf( freads, ">%d/1\n%s\n>%d/2\n%s\n", l+1, currRead1, l+1, currRead2 );
00065 readSize1 = strlen( currRead1 );
00066 readSize2 = strlen( currRead2 );
00067 if(readSize1 != readSize2 ) die("The two reads have different lengths: 1:%d vs 2:%d", readSize1, readSize2);
00068 hlr_free( currRead1 );
00069 hlr_free( currRead2 );
00070 }
00071 fclose( freads );
00072 freads=NULL;
00073 stringDestroy( readsFA );
00074
00075
00076
00077
00078
00079 stringPrintf( cmd, "blat -t=dna -q=dna -out=psl -fine -repMatch=1000000 -tileSize=15 %s/%s %s_reads.fa stdout", RIBOSOMAL_DIR, RIBOSOMAL_FILENAME, currGE->id );
00080
00081 blatParser_initFromPipe( string(cmd) );
00082 while( blQ = blatParser_nextQuery() ) {
00083 int nucleotideOverlap = getNucleotideOverlap ( blQ );
00084 if ( nucleotideOverlap > ( ((double)readSize1)*MAX_OVERLAP_ALLOWED) ) {
00085 ribosomalCount++;
00086 }
00087 }
00088 blatParser_deInit();
00089 if (( (double)ribosomalCount / ( (double)currGE->numInter * 2.0 ) ) <= MAX_FRACTION_HOMOLOGOUS) {
00090
00091 puts (gfr_writeGfrEntry (currGE));
00092 count++;
00093 } else {
00094 countRemoved++;
00095 }
00096
00097 stringPrintf (cmd,"rm -rf %s_reads.fa", currGE->id );
00098 hlr_system( string(cmd) , 0);
00099 }
00100
00101 gfr_deInit ();
00102 arrayDestroy ( gfrEntries );
00103 stringDestroy( cmd );
00104 warn ("%s_numRemoved: %d",argv[0],countRemoved);
00105 warn ("%s_numGfrEntries: %d",argv[0],count);
00106 return 0;
00107 }
00108