00001 #include "log.h"
00002 #include "format.h"
00003 #include "bowtieParser.h"
00004 #include "fasta.h"
00005 #include "gfr.h"
00006 #include <stdio.h>
00007 #include <sys/types.h>
00008 #include <unistd.h>
00009 #include "geneFusionsConfig.h"
00010 #include "intervalFind.h"
00011 #include "blatParser.h"
00012 #include "util.h"
00013 #include "time.h"
00014
00015 #define MAX_FRACTION_SPLICES 0.05
00016
00017 int sortGfrById( GfrEntry* a, GfrEntry* b) {
00018 return strcmp( a->id, b->id);
00019 }
00020
00021 int sortGfrByDASPER( GfrEntry* a, GfrEntry* b) {
00022 return (int) ( b->DASPER*1e06 - a->DASPER*1e06);
00023 }
00024
00025 int main (int argc, char *argv[])
00026 {
00027 GfrEntry *currGE;
00028 int i,j,l;
00029 Stringa cmd;
00030 FILE *freads;
00031 Array gfrEntries;
00032 int count;
00033 int countRemoved;
00034 int readSize1,readSize2;
00035 BlatQuery *blQ=NULL;
00036
00037 if( argc != 2 ) {
00038 usage("%s <splice_junction_library>\nNB: the splice junction library should be in 2bit format.\nEx: %s ucsc_nh_sj75.2bit", argv[0], argv[0]);
00039 }
00040 char* spliceJunctionLibrary = argv[1];
00041
00042 gfr_init ("-");
00043 gfrEntries = arrayCreate( 100, GfrEntry );
00044 gfrEntries = gfr_parse ();
00045 if (arrayMax (gfrEntries) == 0){
00046 puts (gfr_writeHeader ());
00047 gfr_deInit ();
00048 return 0;
00049 }
00050
00051
00052 Stringa readsFA = stringCreate( 100 );
00053
00054 stringPrintf( readsFA, "%d_reads.fa", (int) getpid());
00055 freads = fopen ( string(readsFA) ,"w");
00056 if (freads == NULL) {
00057 die ("Unable to open file: %s",string (readsFA));
00058 }
00059 cmd = stringCreate (100);
00060 puts (gfr_writeHeader ());
00061 count = 0;
00062 countRemoved = 0;
00063 for (i = 0; i < arrayMax (gfrEntries); i++) {
00064 currGE = arrp (gfrEntries,i,GfrEntry);
00065 if( strEqual( currGE->fusionType, "read-through") ) {
00066 continue;
00067 }
00068
00069 if( arrayMax( currGE->readsTranscript1 ) != arrayMax( currGE->readsTranscript2 ) )
00070 die("Error: different number of inter-transcript reads %d vs. %d", arrayMax( currGE->readsTranscript1 ),arrayMax( currGE->readsTranscript2 ));
00071
00072 for (l = 0; l < arrayMax (currGE->readsTranscript1); l++) {
00073 char* currRead1 = hlr_strdup( textItem (currGE->readsTranscript1,l));
00074 char* currRead2 = hlr_strdup( textItem (currGE->readsTranscript2,l));
00075 fprintf( freads, ">%s_1_%d\n%s\n>%s_2_%d\n%s\n", currGE->id, l+1, currRead1, currGE->id, l+1, currRead2 );
00076 readSize1 = strlen( currRead1 );
00077 readSize2 = strlen( currRead2 );
00078 if(readSize1 != readSize2 ) die("The two reads have different lengths: 1:%d vs 2:%d", readSize1, readSize2);
00079 hlr_free( currRead1 );
00080 hlr_free( currRead2 );
00081 }
00082 }
00083 fclose( freads );
00084 freads=NULL;
00085
00086
00087
00088 stringPrintf( cmd, "blat -t=dna %s %s stdout", spliceJunctionLibrary, string (readsFA) );
00089
00090 arraySort( gfrEntries, (ARRAYORDERF) sortGfrById );
00091 blatParser_initFromPipe( string(cmd) );
00092 Texta toRemove=textCreate( 10 );
00093 while( blQ = blatParser_nextQuery() ) {
00094 Texta tok = textFieldtokP( blQ->qName, "_");
00095 Stringa readID = stringCreate( 20 );
00096 for( i=0; i< (arrayMax(tok)-2); i++) {
00097 stringAppendf( readID, "%s%s", textItem( tok, i), i < (arrayMax(tok)-3) ? "_" : "" );
00098 }
00099 while( l<arrayMax( toRemove) ) {
00100 if( strEqual( string(readID), textItem( toRemove, l ) ) )
00101 break;
00102 l++;
00103 }
00104 if( l<arrayMax( toRemove ) )
00105 continue;
00106
00107 GfrEntry* gfrTest;
00108 AllocVar( gfrTest );
00109 gfrTest->id = hlr_strdup( string(readID) );
00110 int index=-1;
00111 arrayFind( gfrEntries, gfrTest, &index, (ARRAYORDERF) sortGfrById);
00112 GfrEntry* currGE = arrp( gfrEntries, index, GfrEntry );
00113 int numHits=0;
00114 for( j=0; j<arrayMax( blQ->entries ); j++ ) {
00115 PslEntry *currE = arrp( blQ->entries, j, PslEntry );
00116 Texta readPos = textFieldtokP( currE->tName, "|");
00117 int readNum = atoi( textItem( tok, arrayMax(tok)-2 ) );
00118 if( readNum == 1 ) {
00119 if( strEqual( currGE->chromosomeTranscript2, textItem( readPos, 0 )) &&
00120 currGE->startTranscript2 <= atoi( textItem( readPos, 1 )) &&
00121 currGE->endTranscript2 >= atoi( textItem( readPos, 2 )) ) {
00122 numHits++;
00123 }
00124 }
00125 if( readNum == 2 ) {
00126 if( strEqual( currGE->chromosomeTranscript1, textItem( readPos, 0 )) &&
00127 currGE->startTranscript1 <= atoi( textItem(readPos, 1 )) &&
00128 currGE->endTranscript1 >= atoi(textItem( readPos, 2 )) ) {
00129 numHits++;
00130 }
00131 }
00132 textDestroy( readPos);
00133 }
00134 if( numHits > arrayMax( currGE->interReads ) * MAX_FRACTION_SPLICES ) {
00135 textAdd( toRemove, currGE->id);
00136 }
00137
00138 hlr_free( gfrTest->id );
00139 freeMem( gfrTest );
00140 textDestroy( tok );
00141 stringDestroy( readID );
00142 }
00143 blatParser_deInit();
00144 arraySort( toRemove, (ARRAYORDERF) arrayStrcmp );
00145 arraySort( gfrEntries, (ARRAYORDERF) sortGfrByDASPER );
00146 for( i=0; i<arrayMax( gfrEntries ); i++ ) {
00147 GfrEntry *currGE = arrp( gfrEntries, i, GfrEntry);
00148 if( arrayFind( toRemove, &currGE->id, NULL, (ARRAYORDERF) arrayStrcmp ) ) {
00149 countRemoved++;
00150 } else {
00151 puts( gfr_writeGfrEntry( currGE ) );
00152 count++;
00153 }
00154 }
00155
00156
00157 stringPrintf (cmd,"rm -rf %s", string(readsFA) );
00158 hlr_system( string(cmd) , 0);
00159
00160 gfr_deInit ();
00161 arrayDestroy ( gfrEntries );
00162 arrayDestroy ( toRemove );
00163 stringDestroy( cmd );
00164 stringDestroy( readsFA );
00165 warn ("%s_spliceJunctionLibrary: %s",argv[0],spliceJunctionLibrary);
00166 warn ("%s_numRemoved: %d",argv[0],countRemoved);
00167 warn ("%s_numGfrEntries: %d",argv[0],count);
00168 return 0;
00169 }
00170