00001 #include "log.h"
00002 #include "format.h"
00003 #include "intervalFind.h"
00004 #include <stdlib.h>
00005 #include <time.h>
00006 #include "geneFusionsConfig.h"
00007 #include "gfr.h"
00008 #include "mrf.h"
00009 #include <math.h>
00010
00011
00012 #define SAMPLING_ITERATIONS 100000
00013
00014
00015
00016 typedef struct {
00017 Interval *transcript;
00018 char* read1;
00019 char* read2;
00020 int readStart1;
00021 int readStart2;
00022 int readEnd1;
00023 int readEnd2;
00024 } Intra;
00025
00026
00027
00028 typedef struct {
00029 Interval *transcript1;
00030 Interval *transcript2;
00031 char* read1;
00032 char* read2;
00033 int readStart1;
00034 int readStart2;
00035 int readEnd1;
00036 int readEnd2;
00037 int pairType;
00038 int number1;
00039 int number2;
00040 } Inter;
00041
00042
00043
00044 typedef struct {
00045 Interval *transcript;
00046 Array intras;
00047 } SuperIntra;
00048
00049
00050
00051 typedef struct {
00052 Interval *transcript1;
00053 Interval *transcript2;
00054 Array inters;
00055 } SuperInter;
00056
00057
00058
00059 typedef struct {
00060 char* chromosome;
00061 int genomic;
00062 int transcript;
00063 } Coordinate;
00064
00065
00066 int getNumberOfIntras( SuperIntra* a ) {
00067 int i;
00068 double numberOfIntras=0.0;
00069 Intra* currIntra;
00070 for( i=0; i<arrayMax( a->intras); i++) {
00071 currIntra = arru( a->intras, i, Intra* );
00072 if( currIntra->read1==NULL | currIntra->read2==NULL |
00073 (currIntra->readEnd1 - currIntra->readStart1 + 1) == 0 |
00074 (currIntra->readEnd2 - currIntra->readStart2 + 1) == 0 )
00075 die("Something is wrong with the intra pairs: read1[%s](e%d-s%d +1 )=%d read2[%s](e%d-s%d + 1 )=%d",
00076 currIntra->read1, currIntra->readEnd1, currIntra->readStart1, (currIntra->readEnd1 - currIntra->readStart1 + 1),
00077 currIntra->read2, currIntra->readEnd2, currIntra->readStart2, (currIntra->readEnd2 - currIntra->readStart2 + 1) );
00078 if( (currIntra->readEnd1 - currIntra->readStart1 + 1) != strlen(currIntra->read1) &
00079 (currIntra->readEnd2 - currIntra->readStart2 + 1) != strlen(currIntra->read2) ) {
00080 numberOfIntras += 0.25;
00081 } else if ( (currIntra->readEnd1 - currIntra->readStart1 + 1) != strlen(currIntra->read1) |
00082 (currIntra->readEnd2 - currIntra->readStart2 + 1) != strlen(currIntra->read2) ) {
00083 numberOfIntras += 0.5;
00084 } else {
00085 numberOfIntras += 1.0;
00086 }
00087 }
00088 return (int) rint( numberOfIntras );
00089 }
00090
00091 int getNumberOfInters( SuperInter* a ) {
00092 int i;
00093 double numberOfInters=0.0;
00094 Inter* currInter;
00095 for( i=0; i<arrayMax( a->inters); i++) {
00096 currInter = arru( a->inters, i, Inter* );
00097 if( currInter->read1==NULL | currInter->read2==NULL |
00098 (currInter->readEnd1 - currInter->readStart1 + 1) == 0 |
00099 (currInter->readEnd2 - currInter->readStart2 + 1) == 0 )
00100 die("Something is wrong with the inter pairs: read1[%s](e%d-s%d +1 )=%d read2[%s](e%d-s%d + 1 )=%d",
00101 currInter->read1, currInter->readEnd1, currInter->readStart1, (currInter->readEnd1 - currInter->readStart1 + 1),
00102 currInter->read2, currInter->readEnd2, currInter->readStart2, (currInter->readEnd2 - currInter->readStart2 + 1) );
00103 if( (currInter->readEnd1 - currInter->readStart1 + 1) != strlen(currInter->read1) &
00104 (currInter->readEnd2 - currInter->readStart2 + 1) != strlen(currInter->read2) ) {
00105 numberOfInters += 0.25;
00106 } else if ( (currInter->readEnd1 - currInter->readStart1 + 1) != strlen(currInter->read1) |
00107 (currInter->readEnd2 - currInter->readStart2 + 1) != strlen(currInter->read2) ) {
00108 numberOfInters += 0.5;
00109 } else {
00110 numberOfInters += 1.0;
00111 }
00112 }
00113 return (int) rint( numberOfInters );
00114 }
00115
00116
00117 static int sortIntersByTranscript (Inter *a, Inter *b)
00118 {
00119 int diff;
00120
00121 diff = a->transcript1 - b->transcript1;
00122 if (diff != 0) {
00123 return diff;
00124 }
00125 return a->transcript2 - b->transcript2;
00126 }
00127
00128
00129
00130 static int sortIntrasByTranscript (Intra *a, Intra *b)
00131 {
00132 return a->transcript - b->transcript;
00133 }
00134
00135
00136
00137 static int sortSuperInters (SuperInter *a, SuperInter *b)
00138 {
00139
00140 return getNumberOfInters(b) - getNumberOfInters(a);
00141 }
00142
00143
00144
00145 static int sortSuperIntras (SuperIntra *a, SuperIntra *b)
00146 {
00147 return a->transcript - b->transcript;
00148 }
00149
00150
00151
00152 static int getExonNumber (Interval* transcript, int start, int end)
00153 {
00154 SubInterval *currExon;
00155 int i;
00156
00157 for (i = 0; i < arrayMax (transcript->subIntervals); i++) {
00158 currExon = arrp (transcript->subIntervals,i,SubInterval);
00159 if (start >= currExon->start && end <= currExon->end) {
00160 return i + 1;
00161 }
00162 }
00163 return 0;
00164 }
00165
00166
00167
00168 static int getIntronNumber (Interval* transcript, int start, int end)
00169 {
00170 SubInterval *currExon,*prevExon;
00171 int i;
00172
00173 for (i = 1; i < arrayMax (transcript->subIntervals); i++) {
00174 prevExon = arrp (transcript->subIntervals,i - 1,SubInterval);
00175 currExon = arrp (transcript->subIntervals,i,SubInterval);
00176 if (start > prevExon->end && end < currExon->start) {
00177 return i;
00178 }
00179 }
00180 return 0;
00181 }
00182
00183
00184
00185 static int getJunctionNumber (Interval* transcript, int start, int end, int exonNumber, int intronNumber)
00186 {
00187 SubInterval *currExon;
00188 int i;
00189
00190 if (exonNumber > 0 || intronNumber > 0) {
00191 return 0;
00192 }
00193 for (i = 0; i < arrayMax (transcript->subIntervals); i++) {
00194 currExon = arrp (transcript->subIntervals,i,SubInterval);
00195 if (start <= currExon->start && end >= currExon->start) {
00196 return i * 2 + 1;
00197 }
00198 if (start <= currExon->end && end >= currExon->end) {
00199 return i * 2 + 2;
00200 }
00201 }
00202 return 0;
00203 }
00204
00205
00206
00207 static void assignPairType (Inter *currInter, int exon1, int intron1, int junction1, int exon2, int intron2, int junction2)
00208 {
00209 if (exon1 > 0 && exon2 > 0) {
00210 currInter->pairType = GFR_PAIR_TYPE_EXONIC_EXONIC;
00211 currInter->number1 = exon1;
00212 currInter->number2 = exon2;
00213 }
00214 else if (exon1 > 0 && intron2 > 0) {
00215 currInter->pairType = GFR_PAIR_TYPE_EXONIC_INTRONIC;
00216 currInter->number1 = exon1;
00217 currInter->number2 = intron2;
00218 }
00219 else if (exon1 > 0 && junction2 > 0) {
00220 currInter->pairType = GFR_PAIR_TYPE_EXONIC_JUNCTION;
00221 currInter->number1 = exon1;
00222 currInter->number2 = junction2;
00223 }
00224 else if (intron1 > 0 && exon2 > 0) {
00225 currInter->pairType = GFR_PAIR_TYPE_INTRONIC_EXONIC;
00226 currInter->number1 = intron1;
00227 currInter->number2 = exon2;
00228 }
00229 else if (intron1 > 0 && intron2 > 0) {
00230 currInter->pairType = GFR_PAIR_TYPE_INTRONIC_INTRONIC;
00231 currInter->number1 = intron1;
00232 currInter->number2 = intron2;
00233 }
00234 else if (intron1 > 0 && junction2 > 0) {
00235 currInter->pairType = GFR_PAIR_TYPE_INTRONIC_JUNCTION;
00236 currInter->number1 = intron1;
00237 currInter->number2 = junction2;
00238 }
00239 else if (junction1 > 0 && junction2 > 0) {
00240 currInter->pairType = GFR_PAIR_TYPE_JUNCTION_JUNCTION;
00241 currInter->number1 = junction1;
00242 currInter->number2 = junction2;
00243 }
00244 else if (junction1 > 0 && exon2 > 0) {
00245 currInter->pairType = GFR_PAIR_TYPE_JUNCTION_EXONIC;
00246 currInter->number1 = junction1;
00247 currInter->number2 = exon2;
00248 }
00249 else if (junction1 > 0 && intron2 > 0) {
00250 currInter->pairType = GFR_PAIR_TYPE_JUNCTION_INTRONIC;
00251 currInter->number1 = junction1;
00252 currInter->number2 = intron2;
00253 }
00254 else {
00255 die ("Unexpected pair type: %d %d %d %d %d %d",exon1,intron1,junction1,exon2,intron2,junction2);
00256 }
00257 }
00258
00259
00260
00261 static int sortCoordinates (Coordinate *a, Coordinate *b)
00262 {
00263 int diff;
00264
00265 diff = strcmp (a->chromosome,b->chromosome);
00266 if (diff != 0) {
00267 return diff;
00268 }
00269 return a->genomic - b->genomic;
00270 }
00271
00272
00273
00274 static Array convertIntraCoordinates (Interval *transcript)
00275 {
00276 int i,j,k;
00277 SubInterval *currExon;
00278 Coordinate *currCoordinate;
00279 Array coordinates;
00280
00281 coordinates = arrayCreate (10000,Coordinate);
00282 k = 1;
00283 for (i = 0; i < arrayMax (transcript->subIntervals); i++) {
00284 currExon = arrp (transcript->subIntervals,i,SubInterval);
00285 for (j = currExon->start; j <= currExon->end; j++) {
00286 currCoordinate = arrayp (coordinates,arrayMax (coordinates),Coordinate);
00287 currCoordinate->genomic = j;
00288 currCoordinate->transcript = k;
00289 currCoordinate->chromosome = hlr_strdup (transcript->chromosome);
00290 k++;
00291 }
00292 }
00293 return coordinates;
00294
00295 }
00296
00297
00298
00299 static void addInterCoordinates (Interval *transcript, Array coordinates, int *transcriptIndex,
00300 int startFusionTranscript, int endFusionTranscript)
00301 {
00302 SubInterval *currExon;
00303 Coordinate *currCoordinate;
00304 int i,j;
00305
00306 for (i = 0; i < arrayMax (transcript->subIntervals); i++) {
00307 currExon = arrp (transcript->subIntervals,i,SubInterval);
00308 for (j = currExon->start; j <= currExon->end; j++) {
00309 if (j >= startFusionTranscript && j <= endFusionTranscript) {
00310 currCoordinate = arrayp (coordinates,arrayMax (coordinates),Coordinate);
00311 currCoordinate->genomic = j;
00312 currCoordinate->transcript = *transcriptIndex;
00313 currCoordinate->chromosome = hlr_strdup (transcript->chromosome);
00314 (*transcriptIndex)++;
00315 }
00316 }
00317 }
00318 }
00319
00320
00321
00322 static Array convertInterCoordinates (SuperInter *currSuperInter, int isAB)
00323 {
00324 int i,start;
00325 int startFusionTranscript1,endFusionTranscript1;
00326 int startFusionTranscript2,endFusionTranscript2;
00327 Inter *currInter;
00328 Array coordinates;
00329 int transcriptIndex;
00330
00331 start = 0;
00332 while (start < arrayMax (currSuperInter->inters)) {
00333 currInter = arru (currSuperInter->inters,start,Inter*);
00334 if (currInter->pairType == GFR_PAIR_TYPE_EXONIC_EXONIC) {
00335 break;
00336 }
00337 start++;
00338 }
00339 startFusionTranscript1 = currInter->readStart1;
00340 endFusionTranscript1 = currInter->readEnd1;
00341 startFusionTranscript2 = currInter->readStart2;
00342 endFusionTranscript2 = currInter->readEnd2;
00343 for (i = start + 1; i < arrayMax (currSuperInter->inters); i++) {
00344 currInter = arru (currSuperInter->inters,i,Inter*);
00345 if (currInter->pairType != GFR_PAIR_TYPE_EXONIC_EXONIC) {
00346 continue;
00347 }
00348 if (currInter->readStart1 < startFusionTranscript1) {
00349 startFusionTranscript1 = currInter->readStart1;
00350 }
00351 if (currInter->readEnd1 > endFusionTranscript1) {
00352 endFusionTranscript1 = currInter->readEnd1;
00353 }
00354 if (currInter->readStart2 < startFusionTranscript2) {
00355 startFusionTranscript2 = currInter->readStart2;
00356 }
00357 if (currInter->readEnd2 > endFusionTranscript2) {
00358 endFusionTranscript2 = currInter->readEnd2;
00359 }
00360 }
00361 coordinates = arrayCreate (1000,Coordinate);
00362 transcriptIndex = 1;
00363 if (isAB == 1) {
00364 addInterCoordinates (currSuperInter->transcript1,coordinates,&transcriptIndex,startFusionTranscript1,endFusionTranscript1);
00365 addInterCoordinates (currSuperInter->transcript2,coordinates,&transcriptIndex,startFusionTranscript2,endFusionTranscript2);
00366 }
00367 else if (isAB == 0) {
00368 addInterCoordinates (currSuperInter->transcript2,coordinates,&transcriptIndex,startFusionTranscript2,endFusionTranscript2);
00369 addInterCoordinates (currSuperInter->transcript1,coordinates,&transcriptIndex,startFusionTranscript1,endFusionTranscript1);
00370 }
00371 else {
00372 die ("Unknown mode: %d",isAB);
00373 }
00374 arraySort (coordinates,(ARRAYORDERF)sortCoordinates);
00375 return coordinates;
00376 }
00377
00378
00379
00380 static int getTranscriptCoordinate (Array coordinates, int genomicCoordinate, char* chromosome)
00381 {
00382 Coordinate testCoordinate;
00383 int index;
00384
00385 testCoordinate.genomic = genomicCoordinate;
00386 testCoordinate.chromosome = hlr_strdup (chromosome);
00387 if (!arrayFind (coordinates,&testCoordinate,&index,(ARRAYORDERF)sortCoordinates)) {
00388 die ("Expected to find coordinate: %s %d",chromosome,genomicCoordinate);
00389 }
00390 hlr_free (testCoordinate.chromosome);
00391 return arrp (coordinates,index,Coordinate)->transcript;
00392 }
00393
00394
00395
00396 static void calculateIntraOffsets (Array coordinatesTanscript, SuperIntra *currSuperIntra, Array intraOffsets)
00397 {
00398 int i;
00399 Intra *currIntra;
00400 int index1,index2;
00401
00402 for (i = 0; i < arrayMax (currSuperIntra->intras); i++) {
00403 currIntra = arru (currSuperIntra->intras,i,Intra*);
00404 index2 = getTranscriptCoordinate (coordinatesTanscript,currIntra->readEnd2,currIntra->transcript->chromosome);
00405 index1 = getTranscriptCoordinate (coordinatesTanscript,currIntra->readStart1,currIntra->transcript->chromosome);
00406 array (intraOffsets,arrayMax (intraOffsets),int) = index2 - index1 + 1;
00407 }
00408 }
00409
00410
00411
00412 static void calculateInterOffsets (Array interCoordinates, SuperInter *currSuperInter, Array interOffsets, int isAB)
00413 {
00414 int i;
00415 Inter *currInter;
00416 int index1,index2;
00417
00418 for (i = 0; i < arrayMax (currSuperInter->inters); i++) {
00419 currInter = arru (currSuperInter->inters,i,Inter*);
00420 if (currInter->pairType != GFR_PAIR_TYPE_EXONIC_EXONIC) {
00421 continue;
00422 }
00423 if (isAB == 1) {
00424 index2 = getTranscriptCoordinate (interCoordinates,currInter->readEnd2,currInter->transcript2->chromosome);
00425 index1 = getTranscriptCoordinate (interCoordinates,currInter->readStart1,currInter->transcript1->chromosome);
00426 array (interOffsets,arrayMax (interOffsets),int) = index2 - index1 + 1;
00427 }
00428 else if (isAB == 0) {
00429 index1 = getTranscriptCoordinate (interCoordinates,currInter->readEnd1,currInter->transcript1->chromosome);
00430 index2 = getTranscriptCoordinate (interCoordinates,currInter->readStart2,currInter->transcript2->chromosome);
00431 array (interOffsets,arrayMax (interOffsets),int) = index1 - index2 + 1;
00432 }
00433 else {
00434 die ("Unknown mode: %d",isAB);
00435 }
00436 }
00437 }
00438
00439
00440
00441 static void destroyCoordinates (Array coordinates)
00442 {
00443 Coordinate *currCoordinate;
00444 int i;
00445
00446 for (i = 0; i < arrayMax (coordinates); i++) {
00447 currCoordinate = arrp (coordinates,i,Coordinate);
00448 hlr_free (currCoordinate->chromosome);
00449 }
00450 arrayDestroy (coordinates);
00451 }
00452
00453
00454
00455 static double calculateMean (Array integers)
00456 {
00457 int i;
00458 int count;
00459
00460 count = 0;
00461 for (i = 0; i < arrayMax (integers); i++) {
00462 count += arru (integers,i,int);
00463 }
00464 return 1.0 * count / arrayMax (integers);
00465 }
00466 int sortIntegers(int *a, int *b) {
00467 return *b - *a;
00468 }
00469 static double calculateMedian (Array integers)
00470 {
00471 arraySort( integers, (ARRAYORDERF) sortIntegers);
00472 return arru (integers, arrayMax( integers )/2,int);
00473 }
00474
00475
00476 static double compareDistributions (Array intraOffsets, Array interOffsets)
00477 {
00478 double meanIntra, medianInter;
00479 int i,j;
00480 static Array sampledIntraOffsets = NULL;
00481 int count;
00482
00483 if (sampledIntraOffsets == NULL) {
00484 sampledIntraOffsets = arrayCreate (1000,int);
00485 }
00486 count = 0;
00487
00488 medianInter = calculateMedian( interOffsets );
00489 for (i = 0; i < SAMPLING_ITERATIONS; i++) {
00490 arrayClear (sampledIntraOffsets);
00491 for (j = 0; j < arrayMax (interOffsets); j++) {
00492 array (sampledIntraOffsets,arrayMax (sampledIntraOffsets),int) = arru (intraOffsets,rand () % arrayMax (intraOffsets),int);
00493 }
00494 meanIntra = calculateMean (sampledIntraOffsets);
00495 if (medianInter > meanIntra) {
00496 count++;
00497 }
00498 }
00499 return 1 - (1.0 * count / SAMPLING_ITERATIONS);
00500 }
00501
00502
00503
00504 static SuperIntra* getSuperIntra (Array superIntras, Interval *testInterval)
00505 {
00506 SuperIntra testSuperIntra;
00507 int index;
00508
00509 testSuperIntra.transcript = testInterval;
00510 if (arrayFind (superIntras,&testSuperIntra,&index,(ARRAYORDERF)sortSuperIntras)) {
00511 return arrp (superIntras,index,SuperIntra);
00512 }
00513 return NULL;
00514 }
00515
00516
00517
00518 static char* exonCoordinates2string (Array exons)
00519 {
00520 int i;
00521 static Stringa buffer = NULL;
00522 SubInterval *currExon;
00523
00524 stringCreateClear (buffer,100);
00525 for (i = 0; i < arrayMax (exons); i++) {
00526 currExon = arrp (exons,i,SubInterval);
00527 stringAppendf (buffer,"%d,%d%s",currExon->start,currExon->end,i < arrayMax (exons) - 1 ? "|" : "");
00528 }
00529 return string (buffer);
00530 }
00531
00532
00533
00534 static int isValidForInsertSizeFilter (SuperInter *currSuperInter)
00535 {
00536 int i;
00537 Inter *currInter;
00538
00539 for (i = 0; i < arrayMax (currSuperInter->inters); i++) {
00540 currInter = arru (currSuperInter->inters,i,Inter*);
00541 if (currInter->pairType == GFR_PAIR_TYPE_EXONIC_EXONIC) {
00542 return 1;
00543 }
00544 }
00545 return 0;
00546 }
00547
00548
00549
00550 int main (int argc, char *argv[])
00551 {
00552 MrfEntry *currMrfEntry;
00553 MrfRead *currMrfRead1,*currMrfRead2;
00554 MrfBlock *currMrfBlock1,*currMrfBlock2;
00555 Stringa buffer;
00556 Array intervals1;
00557 Array intervals2;
00558 Interval *transcript1,*transcript2;
00559 Array inters;
00560 Inter *currInter,*nextInter;
00561 Array intras;
00562 Intra *currIntra,*nextIntra;
00563 int i,j;
00564 int exon1,exon2,intron1,intron2,junction1,junction2;
00565 SuperInter *currSuperInter;
00566 Array superInters;
00567 SuperIntra *currSuperIntra,*superIntra1,*superIntra2;
00568 Array superIntras;
00569 Array coordinatesTanscript;
00570 Array interCoordinatesAB,interCoordinatesBA;
00571 Array intraOffsets;
00572 Array interOffsetsAB,interOffsetsBA;
00573 double pvalueAB,pvalueBA;
00574 double meanInterAB,meanInterBA;
00575 FILE *fp;
00576 int mrfLines;
00577 char *exonCoordinates1,*exonCoordinates2;
00578 int minNumberOfPairedEndReads;
00579
00580 if (argc != 3) {
00581 usage ("%s <prefix> <minNumberOfPairedEndReads>",argv[0]);
00582 }
00583 minNumberOfPairedEndReads = atoi (argv[2]);
00584 srand (time (0));
00585 buffer = stringCreate (100);
00586 stringPrintf (buffer,"%s/%s", TRANSCRIPT_COMPOSITE_MODEL_DIR, TRANSCRIPT_COMPOSITE_MODEL_FILENAME);
00587 intervalFind_addIntervalsToSearchSpace (string (buffer),0);
00588 inters = arrayCreate (1000000,Inter);
00589 intras = arrayCreate (1000000,Intra);
00590 mrfLines = 0;
00591 mrf_init ("-");
00592 while (currMrfEntry = mrf_nextEntry ()) {
00593 mrfLines++;
00594 currMrfRead1 = &currMrfEntry->read1;
00595 currMrfRead2 = &currMrfEntry->read2;
00596 for (i = 0; i < arrayMax (currMrfRead1->blocks); i++) {
00597 currMrfBlock1 = arrp (currMrfRead1->blocks,i,MrfBlock);
00598 for (j = 0; j < arrayMax (currMrfRead2->blocks); j++) {
00599 currMrfBlock2 = arrp (currMrfRead2->blocks,j,MrfBlock);
00600 intervals1 = arrayCopy (intervalFind_getOverlappingIntervals (currMrfBlock1->targetName,currMrfBlock1->targetStart,currMrfBlock1->targetEnd));
00601 intervals2 = arrayCopy (intervalFind_getOverlappingIntervals (currMrfBlock2->targetName,currMrfBlock2->targetStart,currMrfBlock2->targetEnd));
00602 if (arrayMax (intervals1) == 1 && arrayMax (intervals2) == 1) {
00603 transcript1 = arru (intervals1,0,Interval*);
00604 transcript2 = arru (intervals2,0,Interval*);
00605 exon1 = getExonNumber (transcript1,currMrfBlock1->targetStart,currMrfBlock1->targetEnd);
00606 exon2 = getExonNumber (transcript2,currMrfBlock2->targetStart,currMrfBlock2->targetEnd);
00607 intron1 = getIntronNumber (transcript1,currMrfBlock1->targetStart,currMrfBlock1->targetEnd);
00608 intron2 = getIntronNumber (transcript2,currMrfBlock2->targetStart,currMrfBlock2->targetEnd);
00609 junction1 = getJunctionNumber (transcript1,currMrfBlock1->targetStart,currMrfBlock1->targetEnd,exon1,intron1);
00610 junction2 = getJunctionNumber (transcript2,currMrfBlock2->targetStart,currMrfBlock2->targetEnd,exon2,intron2);
00611 if (transcript1 != transcript2) {
00612 currInter = arrayp (inters,arrayMax (inters),Inter);
00613 currInter->transcript1 = transcript1;
00614 currInter->transcript2 = transcript2;
00615 currInter->readStart1 = currMrfBlock1->targetStart;
00616 currInter->readStart2 = currMrfBlock2->targetStart;
00617 currInter->readEnd1 = currMrfBlock1->targetEnd;
00618 currInter->readEnd2 = currMrfBlock2->targetEnd;
00619 currInter->read1 = hlr_strdup (currMrfRead1->sequence);
00620 currInter->read2 = hlr_strdup (currMrfRead2->sequence);
00621 assignPairType (currInter,exon1,intron1,junction1,exon2,intron2,junction2);
00622 }
00623 if (transcript1 == transcript2) {
00624 if (exon1 > 0 && exon2 > 0) {
00625 currIntra = arrayp (intras,arrayMax (intras),Intra);
00626 currIntra->transcript = transcript1;
00627 currIntra->readStart1 = currMrfBlock1->targetStart;
00628 currIntra->readStart2 = currMrfBlock2->targetStart;
00629 currIntra->readEnd1 = currMrfBlock1->targetEnd;
00630 currIntra->readEnd2 = currMrfBlock2->targetEnd;
00631 currIntra->read1 = hlr_strdup (currMrfRead1->sequence);
00632 currIntra->read2 = hlr_strdup (currMrfRead2->sequence);
00633 }
00634 }
00635 }
00636 arrayDestroy (intervals1);
00637 arrayDestroy (intervals2);
00638 }
00639 }
00640 }
00641 mrf_deInit ();
00642 arraySort (inters,(ARRAYORDERF)sortIntersByTranscript);
00643 arraySort (intras,(ARRAYORDERF)sortIntrasByTranscript);
00644
00645 intraOffsets = arrayCreate (1000000,int);
00646 superIntras = arrayCreate (100000,SuperIntra);
00647 i = 0;
00648 while (i < arrayMax (intras)) {
00649 currIntra = arrp (intras,i,Intra);
00650 currSuperIntra = arrayp (superIntras,arrayMax (superIntras),SuperIntra);
00651 currSuperIntra->transcript = currIntra->transcript;
00652 currSuperIntra->intras = arrayCreate (100,Intra*);
00653 array (currSuperIntra->intras,arrayMax (currSuperIntra->intras),Intra*) = currIntra;
00654 j = i + 1;
00655 while (j < arrayMax (intras)) {
00656 nextIntra = arrp (intras,j,Intra);
00657 if (currIntra->transcript == nextIntra->transcript) {
00658 array (currSuperIntra->intras,arrayMax (currSuperIntra->intras),Intra*) = nextIntra;
00659 }
00660 else {
00661 break;
00662 }
00663 j++;
00664 }
00665 i = j;
00666 coordinatesTanscript = convertIntraCoordinates (currSuperIntra->transcript);
00667 calculateIntraOffsets (coordinatesTanscript,currSuperIntra,intraOffsets);
00668 destroyCoordinates (coordinatesTanscript);
00669 }
00670 arraySort (superIntras,(ARRAYORDERF)sortSuperIntras);
00671
00672 superInters = arrayCreate (100000,SuperInter);
00673 i = 0;
00674 while (i < arrayMax (inters)) {
00675 currInter = arrp (inters,i,Inter);
00676 currSuperInter = arrayp (superInters,arrayMax (superInters),SuperInter);
00677 currSuperInter->transcript1 = currInter->transcript1;
00678 currSuperInter->transcript2 = currInter->transcript2;
00679 currSuperInter->inters = arrayCreate (100,Inter*);
00680 array (currSuperInter->inters,arrayMax (currSuperInter->inters),Inter*) = currInter;
00681 j = i + 1;
00682 while (j < arrayMax (inters)) {
00683 nextInter = arrp (inters,j,Inter);
00684 if (currInter->transcript1 == nextInter->transcript1 &&
00685 currInter->transcript2 == nextInter->transcript2) {
00686 array (currSuperInter->inters,arrayMax (currSuperInter->inters),Inter*) = nextInter;
00687 }
00688 else {
00689 break;
00690 }
00691 j++;
00692 }
00693 i = j;
00694 }
00695 arraySort (superInters,(ARRAYORDERF)sortSuperInters);
00696
00697 warn ("%s_numMrfLines: %d",argv[0],mrfLines);
00698 warn ("%s_numIntra: %d",argv[0],arrayMax (intras));
00699 warn ("%s_numInter: %d",argv[0],arrayMax (inters));
00700 warn ("%s_numSuperIntra: %d",argv[0],arrayMax (superIntras));
00701 warn ("%s_numSuperInter: %d",argv[0],arrayMax (superInters));
00702 printf ("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
00703 GFR_COLUMN_NAME_NUM_INTER,
00704 GFR_COLUMN_NAME_INTER_MEAN_AB,
00705 GFR_COLUMN_NAME_INTER_MEAN_BA,
00706 GFR_COLUMN_NAME_PVALUE_AB,
00707 GFR_COLUMN_NAME_PVALUE_BA,
00708 GFR_COLUMN_NAME_NUM_INTRA1,
00709 GFR_COLUMN_NAME_NUM_INTRA2,
00710 GFR_COLUMN_NAME_FUSION_TYPE,
00711 GFR_COLUMN_NAME_NAME_TRANSCRIPT1,
00712 GFR_COLUMN_NAME_NUM_EXONS_TRANSCRIPT1,
00713 GFR_COLUMN_NAME_EXON_COORDINATES_TRANSCRIPT1,
00714 GFR_COLUMN_NAME_CHROMOSOME_TRANSCRIPT1,
00715 GFR_COLUMN_NAME_STRAND_TRANSCRIPT1,
00716 GFR_COLUMN_NAME_START_TRANSCRIPT1,
00717 GFR_COLUMN_NAME_END_TRANSCRIPT1,
00718 GFR_COLUMN_NAME_NAME_TRANSCRIPT2,
00719 GFR_COLUMN_NAME_NUM_EXONS_TRANSCRIPT2,
00720 GFR_COLUMN_NAME_EXON_COORDINATES_TRANSCRIPT2,
00721 GFR_COLUMN_NAME_CHROMOSOME_TRANSCRIPT2,
00722 GFR_COLUMN_NAME_STRAND_TRANSCRIPT2,
00723 GFR_COLUMN_NAME_START_TRANSCRIPT2,
00724 GFR_COLUMN_NAME_END_TRANSCRIPT2,
00725 GFR_COLUMN_NAME_INTER_READS,
00726 GFR_COLUMN_NAME_ID,
00727 GFR_COLUMN_NAME_READS_TRANSCRIPT1,
00728 GFR_COLUMN_NAME_READS_TRANSCRIPT2);
00729
00730 interOffsetsAB = arrayCreate (1000,int);
00731 interOffsetsBA = arrayCreate (1000,int);
00732 i = 0;
00733 while (i < arrayMax (superInters)) {
00734 currSuperInter = arrp (superInters,i,SuperInter);
00735 if ( getNumberOfInters (currSuperInter) < minNumberOfPairedEndReads) {
00736 break;
00737 }
00738 if (isValidForInsertSizeFilter (currSuperInter)) {
00739 interCoordinatesAB = convertInterCoordinates (currSuperInter,1);
00740 arrayClear (interOffsetsAB);
00741 calculateInterOffsets (interCoordinatesAB,currSuperInter,interOffsetsAB,1);
00742 pvalueAB = compareDistributions (intraOffsets,interOffsetsAB);
00743 meanInterAB = calculateMedian (interOffsetsAB);
00744 destroyCoordinates (interCoordinatesAB);
00745 interCoordinatesBA = convertInterCoordinates (currSuperInter,0);
00746 arrayClear (interOffsetsBA);
00747 calculateInterOffsets (interCoordinatesBA,currSuperInter,interOffsetsBA,0);
00748 pvalueBA = compareDistributions (intraOffsets,interOffsetsBA);
00749 meanInterBA = calculateMedian (interOffsetsBA);
00750 destroyCoordinates (interCoordinatesBA);
00751 }
00752 else {
00753 meanInterAB = -1;
00754 meanInterBA = -1;
00755 pvalueAB = -1;
00756 pvalueBA = -1;
00757 }
00758 superIntra1 = getSuperIntra (superIntras,currSuperInter->transcript1);
00759 superIntra2 = getSuperIntra (superIntras,currSuperInter->transcript2);
00760 exonCoordinates1 = hlr_strdup (exonCoordinates2string (currSuperInter->transcript1->subIntervals));
00761 exonCoordinates2 = hlr_strdup (exonCoordinates2string (currSuperInter->transcript2->subIntervals));
00762 printf ("%d\t%.2f\t%.2f\t%.5f\t%.5f\t%d\t%d\t%s\t",
00763 getNumberOfInters ( currSuperInter ),
00764 meanInterAB,meanInterBA,pvalueAB,pvalueBA,
00765 superIntra1 ? getNumberOfIntras(superIntra1) : 0,
00766 superIntra2 ? getNumberOfIntras(superIntra2) : 0,
00767 strEqual (currSuperInter->transcript1->chromosome,currSuperInter->transcript2->chromosome) ? "cis" : "trans");
00768 printf ("%s\t%d\t%s\t%s\t%c\t%d\t%d\t",
00769 currSuperInter->transcript1->name,arrayMax (currSuperInter->transcript1->subIntervals),exonCoordinates1,currSuperInter->transcript1->chromosome,currSuperInter->transcript1->strand,currSuperInter->transcript1->start,currSuperInter->transcript1->end);
00770 printf ("%s\t%d\t%s\t%s\t%c\t%d\t%d\t",
00771 currSuperInter->transcript2->name,arrayMax (currSuperInter->transcript2->subIntervals),exonCoordinates2,currSuperInter->transcript2->chromosome,currSuperInter->transcript2->strand,currSuperInter->transcript2->start,currSuperInter->transcript2->end);
00772 for (j = 0; j < arrayMax (currSuperInter->inters); j++) {
00773 currInter = arru (currSuperInter->inters,j,Inter*);
00774 printf ("%d,%d,%d,%d,%d,%d,%d%s",
00775 currInter->pairType,currInter->number1,currInter->number2,
00776 currInter->readStart1,currInter->readEnd1,
00777 currInter->readStart2,currInter->readEnd2,
00778 j < arrayMax (currSuperInter->inters) - 1 ? "|" : "\t");
00779 }
00780 printf ("%s_%05d\t",argv[1],i + 1);
00781 for (j = 0; j < arrayMax (currSuperInter->inters); j++) {
00782 currInter = arru (currSuperInter->inters,j,Inter*);
00783 printf ("%s%s",currInter->read1,j < arrayMax (currSuperInter->inters) - 1 ? "|" : "\t");
00784 }
00785 for (j = 0; j < arrayMax (currSuperInter->inters); j++) {
00786 currInter = arru (currSuperInter->inters,j,Inter*);
00787 printf ("%s%s",currInter->read2,j < arrayMax (currSuperInter->inters) - 1 ? "|" : "\n");
00788 }
00789 hlr_free (exonCoordinates1);
00790 hlr_free (exonCoordinates2);
00791 i++;
00792 }
00793 warn ("%s_numGfrEntries: %d",argv[0],i);
00794 stringPrintf (buffer,"%s.intraOffsets",argv[1]);
00795 fp = fopen (string (buffer),"w");
00796 for (i = 0; i < arrayMax (intraOffsets); i++) {
00797 fprintf (fp,"%d\n",arru (intraOffsets,i,int));
00798 }
00799 fclose (fp);
00800 stringDestroy (buffer);
00801 return 0;
00802 }
00803