00001 #include "log.h"
00002 #include "format.h"
00003 #include "bowtieParser.h"
00004 #include "bp.h"
00005 #include "common.h"
00006 #include "geneFusionsConfig.h"
00007 #include "fasta.h"
00008 #include <unistd.h>
00009
00010
00011 typedef struct {
00012 char* chromosome1;
00013 int start1;
00014 int end1;
00015 char* chromosome2;
00016 int start2;
00017 int end2;
00018 } BreakPointJunction;
00019
00020 static char* getBreakPointSequence (char *tileCoordinate1, char *tileCoordinate2)
00021 {
00022 Stringa buffer;
00023 Stringa targetsFile;
00024 FILE *fp;
00025 Array targetSeqs;
00026 int i;
00027 Seq *currSeq;
00028 static Stringa sequence = NULL;
00029
00030 buffer = stringCreate (100);
00031 targetsFile = stringCreate (100);
00032 stringPrintf (targetsFile,"targets_%d.txt",getpid ());
00033 if (!(fp = fopen (string (targetsFile),"w")) ){
00034 die ("Unable to open target file: %s",string (targetsFile));
00035 }
00036 fprintf (fp,"%s\n%s",tileCoordinate1,tileCoordinate2);
00037 fclose (fp);
00038 stringPrintf (buffer,"%s %s/%s stdout -noMask -seqList=%s",
00039 BLAT_TWO_BIT_TO_FA,BLAT_DATA_DIR,BLAT_TWO_BIT_DATA_FILENAME,string (targetsFile));
00040 fasta_initFromPipe (string (buffer));
00041 targetSeqs = fasta_readAllSequences (0);
00042 fasta_deInit ();
00043 if (arrayMax (targetSeqs) != 2) {
00044 die ("Expected only two target sequences");
00045 }
00046 stringCreateClear (sequence,100);
00047 for (i = 0; i < arrayMax (targetSeqs); i++) {
00048 currSeq = arrp (targetSeqs,i,Seq);
00049 stringAppendf (sequence,"%s",currSeq->sequence);
00050 hlr_free (currSeq->name);
00051 hlr_free (currSeq->sequence);
00052 }
00053 arrayDestroy (targetSeqs);
00054 stringPrintf (buffer,"rm -rf %s",string (targetsFile));
00055 hlr_system (string (buffer),0);
00056 stringDestroy (targetsFile);
00057 stringDestroy (buffer);
00058 return string (sequence);
00059 }
00060
00061 static void getCoordinates( char* str, BreakPointJunction* bpJunction)
00062 {
00063 Texta txt = textFieldtokP(str, ":|-");
00064 bpJunction->chromosome1 = hlr_strdup( textItem( txt, 0 ));
00065 bpJunction->start1 = atoi(textItem( txt, 1 ) );
00066 bpJunction->end1 = atoi( textItem( txt, 2 ) );
00067 bpJunction->chromosome2 = hlr_strdup( textItem( txt, 3 ) );
00068 bpJunction->start2 = atoi(textItem( txt, 4 ) );
00069 bpJunction->end2 = atoi( textItem( txt, 5 ) );
00070 textDestroy( txt );
00071 }
00072 static int sortHits( BowtieEntry *a, BowtieEntry *b)
00073 {
00074 BreakPointJunction* BPJa, *BPJb;
00075 AllocVar( BPJa );
00076 AllocVar( BPJb );
00077 getCoordinates( a->chromosome, BPJa);
00078 getCoordinates( b->chromosome, BPJb);
00079 int diff = ( (BPJa->start1 - BPJb->start1) + (BPJa->start2 - BPJb->start2) );
00080 hlr_free( BPJa->chromosome1 );
00081 hlr_free( BPJa->chromosome2 );
00082 hlr_free( BPJb->chromosome1 );
00083 hlr_free( BPJb->chromosome2 );
00084 freeMem( BPJa );
00085 freeMem( BPJb );
00086 return diff;
00087 }
00088
00089 static int sortBreakPointJunctions( BreakPointJunction *a, BreakPointJunction *b)
00090 {
00091 int diff = ( (a->start1 - b->start1) + (a->start2 - b->start2) );
00092 return diff;
00093 }
00094
00095 int main (int argc, char *argv[])
00096 {
00097 BowtieQuery *currBQ;
00098 BowtieEntry *currBE;
00099 Array bowtieQueries;
00100 Array breakPointJunctions;
00101 BreakPointJunction *currBPJ;
00102 int i,j;
00103 bowtieParser_initFromFile ("-");
00104 bowtieQueries = bowtieParser_getAllQueries ();
00105 bowtieParser_deInit ();
00106 breakPointJunctions = arrayCreate (10000,BreakPointJunction);
00107 for (i = 0; i < arrayMax (bowtieQueries); i++) {
00108 currBQ = arrp (bowtieQueries,i,BowtieQuery);
00109 currBPJ = arrayp (breakPointJunctions,arrayMax(breakPointJunctions), BreakPointJunction);
00110 if( arrayMax( currBQ->entries ) == 1 ) {
00111
00112 getCoordinates( currBQ->sequenceName, currBPJ);
00113 } else {
00114 arraySort( currBQ->entries, (ARRAYORDERF) sortHits);
00115 for( j=0; j<arrayMax( currBQ->entries ); j++ ) {
00116 currBE = arrp (currBQ->entries,j,BowtieEntry);
00117 BreakPointJunction *bpJ;
00118 if( j==0 )
00119 getCoordinates( currBE->chromosome, currBPJ);
00120 else {
00121 AllocVar( bpJ );
00122 getCoordinates( currBE->chromosome, bpJ);
00123 currBPJ->end2 = bpJ->end2;
00124 hlr_free(bpJ->chromosome1);
00125 hlr_free(bpJ->chromosome2);
00126 freeMem( bpJ);
00127 }
00128 }
00129 }
00130 }
00131 arraySort( breakPointJunctions, (ARRAYORDERF) sortBreakPointJunctions);
00132 arrayUniq( breakPointJunctions, NULL, (ARRAYORDERF) sortBreakPointJunctions );
00133
00134 for( i=0; i<arrayMax( breakPointJunctions); i++ ) {
00135 currBPJ = arrp (breakPointJunctions, i, BreakPointJunction);
00136
00137 Stringa tileCoordinate1 = stringCreate( 100 );
00138 stringPrintf( tileCoordinate1, "%s:%d-%d", currBPJ->chromosome1, currBPJ->start1, currBPJ->end1 );
00139 Stringa tileCoordinate2 = stringCreate( 100 );
00140 stringPrintf( tileCoordinate2, "%s:%d-%d", currBPJ->chromosome2, currBPJ->start2, currBPJ->end2 );
00141 char* breakPointSequence = getBreakPointSequence ( string(tileCoordinate1), string(tileCoordinate2) );
00142
00143 printf( ">%s:%d-%d|%s:%d-%d\n%s\n", currBPJ->chromosome1, currBPJ->start1, currBPJ->end1, currBPJ->chromosome2, currBPJ->start2, currBPJ->end2,
00144 breakPointSequence);
00145 stringDestroy( tileCoordinate1 );
00146 stringDestroy( tileCoordinate2 );
00147 }
00148 warn("bpClustering: Number of breakpoint junctions:\t%d",--i);
00149 return 0;
00150 }