00001 #include "log.h"
00002 #include "format.h"
00003 #include "numUtil.h"
00004 #include "fasta.h"
00005 #include "gfr.h"
00006 #include "geneFusionsConfig.h"
00007 #include <sys/types.h>
00008 #include <unistd.h>
00009 #include <stdio.h>
00010
00011
00012
00013 #define MAX_NUMBER_OF_JUNCTIONS_PER_FILE 2000000
00014
00015 typedef struct {
00016 int start;
00017 int end;
00018 } Region;
00019
00020
00021 static int sortTilesByName (Seq *a, Seq *b)
00022 {
00023 return strcmp (a->name,b->name);
00024 }
00025
00026 static void updateCounts (Array positions, int start, int end)
00027 {
00028 int i;
00029 for (i = start; i <= end; i++) {
00030 array (positions,i,int)++;
00031 }
00032 }
00033
00034 void createSignal( Array regions, Array positions ) {
00035 Region *currRegion;
00036 int j = 0;
00037 arrayClear (positions);
00038 while (j < arrayMax (regions)) {
00039 currRegion = arrp (regions,j,Region);
00040 updateCounts (positions,currRegion->start,currRegion->end);
00041 j++;
00042 }
00043
00044 }
00045
00046 static void getInterEndPoints (Array interReads, int *min1, int *max1, int *min2, int *max2)
00047 {
00048 GfrInterRead *currGIR;
00049 int i;
00050 Array regions1, regions2;
00051 Region *currRegion1, *currRegion2;
00052
00053 regions1 = arrayCreate( arrayMax( interReads), Region);
00054 regions2 = arrayCreate( arrayMax( interReads), Region);
00055
00056 for (i = 0; i < arrayMax (interReads); i++) {
00057 currGIR = arrp (interReads,i,GfrInterRead);
00058 currRegion1 = arrayp( regions1, arrayMax( regions1 ), Region);
00059 currRegion2 = arrayp( regions2, arrayMax( regions2 ), Region);
00060 currRegion1->start = currGIR->readStart1;
00061 currRegion1->end = currGIR->readEnd1;
00062 currRegion2->start = currGIR->readStart2;
00063 currRegion2->end = currGIR->readEnd2;
00064 }
00065 Array positions = arrayCreate (10000000,int);
00066 createSignal( regions1, positions );
00067 *min1=-1;*max1=-1;
00068 for( i=0; i<arrayMax( positions ); i++) {
00069 if( arru( positions, i, int) > 1 & (*min1)==-1 ) {
00070 *min1 = i+1;
00071 break;
00072 }
00073 }
00074 for( i=arrayMax(positions)-1; i>0; i--) {
00075 if( arru( positions, i, int) > 1 & (*max1)==-1 ) {
00076 *max1 = i+1;
00077 break;
00078 }
00079 }
00080 createSignal( regions2, positions );
00081 *min2=-1;*max2=-1;
00082 for( i=0; i<arrayMax( positions ); i++) {
00083 if( arru( positions, i, int) > 1 & (*min2)==-1 ) {
00084 *min2 = i+1;
00085 break;
00086 }
00087 }
00088 for( i=arrayMax(positions)-1; i>0; i--) {
00089 if( arru( positions, i, int) > 1 & (*max2)==-1 ) {
00090 *max2 = i+1;
00091 break;
00092 }
00093 }
00094 arrayDestroy( regions1 );
00095 arrayDestroy( regions2 );
00096 arrayDestroy( positions);
00097 }
00098
00099
00100
00101 static Array getTilesForRegion (int tileSize, char *chromosome, int start, int end, int junctionIsToTheRightOfTile)
00102 {
00103 Stringa buffer;
00104 Stringa targetsFile;
00105 FILE *fp;
00106 Array targetSeqs;
00107 int i;
00108
00109 buffer = stringCreate (100);
00110 targetsFile = stringCreate (100);
00111 stringPrintf (targetsFile,"targets_%d.txt",getpid ());
00112 if (!(fp = fopen (string (targetsFile),"w")) ){
00113 die ("Unable to open target file: %s",string (targetsFile));
00114 }
00115 if (junctionIsToTheRightOfTile == 1) {
00116 for (i = start - tileSize; i <= end - tileSize; i++) {
00117 fprintf (fp,"%s:%d-%d\n",chromosome,i,i + tileSize);
00118 }
00119 }
00120 else {
00121 for (i = start; i <= end; i++) {
00122 fprintf (fp,"%s:%d-%d\n",chromosome,i,i + tileSize);
00123 }
00124 }
00125 fclose (fp);
00126 stringPrintf (buffer,"%s %s/%s stdout -noMask -seqList=%s",
00127 BLAT_TWO_BIT_TO_FA,BLAT_DATA_DIR,BLAT_TWO_BIT_DATA_FILENAME,string (targetsFile));
00128 fasta_initFromPipe (string (buffer));
00129 targetSeqs = fasta_readAllSequences (0);
00130 fasta_deInit ();
00131 stringPrintf (buffer,"rm -rf %s",string (targetsFile));
00132 hlr_system (string (buffer),0);
00133 stringDestroy (targetsFile);
00134 stringDestroy (buffer);
00135 return targetSeqs;
00136 }
00137
00138
00139
00140 static void collectTiles (Array totalTiles, Array regionalTiles)
00141 {
00142 int i;
00143 Seq *currSeq1,*currSeq2;
00144
00145 for (i = 0; i < arrayMax (regionalTiles); i++) {
00146 currSeq1 = arrp (regionalTiles,i,Seq);
00147 currSeq2 = arrayp (totalTiles,arrayMax (totalTiles),Seq);
00148 currSeq2->name = hlr_strdup (currSeq1->name);
00149 currSeq2->sequence = hlr_strdup (currSeq1->sequence);
00150 currSeq2->size = currSeq1->size;
00151 hlr_free (currSeq1->name);
00152 hlr_free (currSeq1->sequence);
00153 }
00154 arrayDestroy (regionalTiles);
00155 }
00156
00157
00158
00159 static Array processRegion (Array exonCoordinates, char *chromosome, int regionStart, int regionEnd, int junctionIsToTheRightOfTile, int tileSize, int sizeFlankingRegion)
00160 {
00161 GfrExonCoordinate *currEC;
00162 int i;
00163 Array totalTiles;
00164 int overlap;
00165 int exonOverlap=0;
00166 totalTiles = arrayCreate (10000,Seq);
00167 for (i = 0; i < arrayMax (exonCoordinates); i++) {
00168 currEC = arrp (exonCoordinates,i,GfrExonCoordinate);
00169 overlap = rangeIntersection (currEC->start,currEC->end,regionStart,regionEnd);
00170 if (overlap > 0) {
00171 exonOverlap++;
00172 if (regionStart < currEC->start && currEC->end < regionEnd) {
00173
00174 collectTiles (totalTiles,getTilesForRegion (tileSize,chromosome,currEC->start - sizeFlankingRegion,currEC->end + sizeFlankingRegion,junctionIsToTheRightOfTile));
00175 }
00176 else if (regionStart >= currEC->start && currEC->end <= regionEnd) {
00177
00178 collectTiles (totalTiles,getTilesForRegion (tileSize,chromosome,regionStart,currEC->end + sizeFlankingRegion,junctionIsToTheRightOfTile));
00179 }
00180 else if (regionStart <= currEC->start && currEC->end >= regionEnd) {
00181
00182 collectTiles (totalTiles,getTilesForRegion (tileSize,chromosome,currEC->start - sizeFlankingRegion,regionEnd,junctionIsToTheRightOfTile));
00183 }
00184 }
00185 }
00186 if( exonOverlap == 0 ) {
00187 warn( "Only intronic");
00188 }
00189 return totalTiles;
00190 }
00191
00192
00193
00194 static void freeTiles (Array tiles)
00195 {
00196 Seq *currSeq;
00197 int i;
00198
00199 for (i = 0; i < arrayMax (tiles); i++) {
00200 currSeq = arrp (tiles,i,Seq);
00201 hlr_free (currSeq->name);
00202 hlr_free (currSeq->sequence);
00203 }
00204 arrayDestroy (tiles);
00205 }
00206
00207
00208
00209 static void printCommands (char *id, char *orientation, int fileCount, FILE *fpJobList1, FILE *fpJobList2, char *gfrPrefix, int isFirst, int isLast)
00210 {
00211 long size;
00212 char *buf;
00213 static char *currentDirectory = NULL;
00214 static int first = 1;
00215
00216 if (first == 1) {
00217 size = pathconf(".", _PC_PATH_MAX);
00218 if ((buf = (char *)malloc((size_t)size)) != NULL) {
00219 currentDirectory = getcwd(buf, (size_t)size);
00220 }
00221 first = 0;
00222 }
00223 fprintf (fpJobList1,"cd %s; bowtie-build -q -f %s_%s_%d.fa %s/%s_%s_%d; zcat %s_allReads.txt.gz | bowtie --quiet -p 4 -r %s/%s_%s_%d - %s_%s_%d.bowtie; rm -rf %s_%s_%d.fa %s/%s_%s_%d.*.ebwt\n",
00224 currentDirectory,id,orientation,fileCount,BOWTIE_INDEXES,id,orientation,fileCount,gfrPrefix,BOWTIE_INDEXES,id,orientation,fileCount,id,orientation,fileCount,id,orientation,fileCount,BOWTIE_INDEXES,id,orientation,fileCount);
00225 if (isFirst == 1) {
00226 fprintf (fpJobList2,"cd %s; cat ",currentDirectory);
00227 }
00228 fprintf (fpJobList2,"%s_%s_%d.bowtie ",id,orientation,fileCount);
00229 if (isLast == 1) {
00230 fprintf (fpJobList2,"| sort -n | bowtie2bp > %s_%s.bp\n",id,orientation);
00231 }
00232 }
00233
00234
00235
00236 static void writeTiles (Array tiles1, Array tiles2, char *id, char *orientation, FILE *fpJobList1, FILE *fpJobList2, char *gfrPrefix)
00237 {
00238 static Stringa buffer = NULL;
00239 int fileCount;
00240 int numJunctions;
00241 FILE *fp;
00242 int i,j;
00243 Seq *firstSeq,*secondSeq;
00244
00245 arraySort (tiles1,(ARRAYORDERF)sortTilesByName);
00246 arrayUniq (tiles1,NULL,(ARRAYORDERF)sortTilesByName);
00247 arraySort (tiles2,(ARRAYORDERF)sortTilesByName);
00248 arrayUniq (tiles2,NULL,(ARRAYORDERF)sortTilesByName);
00249 warn ("%s, %s, num_tiles_transcript_1: %d, num_tiles_transcript_2: %d",id,orientation,arrayMax (tiles1),arrayMax (tiles2));
00250 fileCount = 1;
00251 stringCreateClear (buffer,100);
00252 stringPrintf (buffer,"%s_%s_%d.fa",id,orientation,fileCount);
00253 fp = fopen (string (buffer),"w");
00254 if (fp == NULL) {
00255 die ("Unable to open file: %s",string (buffer));
00256 }
00257 numJunctions = 0;
00258 for (i = 0; i < arrayMax (tiles1); i++) {
00259 firstSeq = arrp (tiles1,i,Seq);
00260 for (j = 0; j < arrayMax (tiles2); j++) {
00261 secondSeq = arrp (tiles2,j,Seq);
00262 numJunctions++;
00263 if (numJunctions == MAX_NUMBER_OF_JUNCTIONS_PER_FILE) {
00264 printCommands (id,orientation,fileCount,fpJobList1,fpJobList2,gfrPrefix,fileCount == 1 ? 1 : 0,0);
00265 fclose (fp);
00266 numJunctions = 0;
00267 fileCount++;
00268 stringPrintf (buffer,"%s_%s_%d.fa",id,orientation,fileCount);
00269 fp = fopen (string (buffer),"w");
00270 if (fp == NULL) {
00271 die ("Unable to open file: %s",string (buffer));
00272 }
00273 }
00274 fprintf (fp,">%s|%s\n",firstSeq->name,secondSeq->name);
00275 fprintf (fp,"%s%s\n",firstSeq->sequence,secondSeq->sequence);
00276 }
00277 }
00278 printCommands (id,orientation,fileCount,fpJobList1,fpJobList2,gfrPrefix,fileCount == 1 ? 1 : 0,1);
00279 fclose (fp);
00280 freeTiles (tiles1);
00281 freeTiles (tiles2);
00282 }
00283
00284
00285
00286 int main (int argc, char *argv[])
00287 {
00288 GfrEntry *currGE;
00289 int min1,max1,min2,max2;
00290 int tileSize;
00291 int sizeFlankingRegion;
00292 Array tiles1;
00293 Array tiles2;
00294 char *gfrPrefix;
00295 char *pos;
00296 FILE *fpJobList1,*fpJobList2;
00297 Stringa buffer;
00298 double minDASPER;
00299
00300 if (argc <= 4) {
00301 usage ("%s <file.gfr> <tileSize> <sizeFlankingRegion> [minDASPER]",argv[0]);
00302 }
00303 gfr_init (argv[1]);
00304 tileSize = atoi (argv[2]);
00305 sizeFlankingRegion = atoi (argv[3]);
00306 if(argc==5) {
00307 minDASPER = atof( argv[4] );
00308 } else {
00309 minDASPER = -100000000;
00310 }
00311 gfrPrefix = hlr_strdup (argv[1]);
00312 pos = strchr (gfrPrefix,'.');
00313 if (pos == NULL) {
00314 die ("Expected a '.gfr' extension: %s",argv[1]);
00315 }
00316 *pos = '\0';
00317 buffer = stringCreate (100);
00318 stringPrintf (buffer,"%s_jobList1.txt",gfrPrefix);
00319 fpJobList1 = fopen (string (buffer),"w");
00320 stringPrintf (buffer,"%s_jobList2.txt",gfrPrefix);
00321 fpJobList2 = fopen (string (buffer),"w");
00322 if (fpJobList1 == NULL || fpJobList2 == NULL) {
00323 die ("Unable to open job list files!");
00324 }
00325 while (currGE = gfr_nextEntry ()){
00326 if( currGE->DASPER < minDASPER ) continue;
00327 getInterEndPoints (currGE->interReads,&min1,&max1,&min2,&max2);
00328 tiles1 = processRegion (currGE->exonCoordinatesTranscript1,currGE->chromosomeTranscript1,min1,currGE->endTranscript1,1,tileSize,sizeFlankingRegion);
00329 tiles2 = processRegion (currGE->exonCoordinatesTranscript2,currGE->chromosomeTranscript2,currGE->startTranscript2,max2,0,tileSize,sizeFlankingRegion);
00330 writeTiles (tiles1,tiles2,currGE->id,"AB",fpJobList1,fpJobList2,gfrPrefix);
00331 tiles1 = processRegion (currGE->exonCoordinatesTranscript2,currGE->chromosomeTranscript2,min2,currGE->endTranscript2,1,tileSize,sizeFlankingRegion);
00332 tiles2 = processRegion (currGE->exonCoordinatesTranscript1,currGE->chromosomeTranscript1,currGE->startTranscript1,max1,0,tileSize,sizeFlankingRegion);
00333 writeTiles (tiles1,tiles2,currGE->id,"BA",fpJobList1,fpJobList2,gfrPrefix);
00334 }
00335 gfr_deInit ();
00336 fclose (fpJobList1);
00337 fclose (fpJobList2);
00338 hlr_free (gfrPrefix);
00339 stringDestroy (buffer);
00340 return 0;
00341 }