00001 #include "log.h"
00002 #include "format.h"
00003 #include "linestream.h"
00004 #include "mrf.h"
00005 #include "exportPEParser.h"
00006 #include <stdio.h>
00007
00013 static char convertStrand (char strand)
00014 {
00015 if (strand == 'F') {
00016 return '+';
00017 }
00018 if (strand != 'R') {
00019 die ("Unexpected strand: %c",strand);
00020 }
00021 return '-';
00022 }
00023
00024
00025 int main (int argc, char *argv[])
00026 {
00027 int readLength;
00028 int chrEnd1, startEnd1;
00029 int chrEnd2, startEnd2;
00030 char *pos;
00031 Stringa id;
00032 FILE *fp;
00033 Stringa buffer, export1, export2;
00034 int countExport = 0;
00035 int countMRF = 0;
00036 int junction1, junction2;
00037
00038 if (argc != 4) {
00039 usage ("%s <prefix> <exportFile1> <exportFile2>",argv[0]);
00040 }
00041 id = stringCreate (100);
00042 buffer = stringCreate (100);
00043 stringPrintf (buffer,"%s_allReads.txt",argv[1]);
00044 fp = fopen (string (buffer),"w");
00045 if (fp == NULL) {
00046 die ("Unable to open file: %s",string (buffer));
00047 }
00048 printf ("%s\t%s\t%s\n",MRF_COLUMN_NAME_BLOCKS,MRF_COLUMN_NAME_SEQUENCE,MRF_COLUMN_NAME_QUALITY_SCORES);
00049
00050 export1 = stringCreate( 20 );
00051 export2 = stringCreate( 20 );
00052 if( strEndsWith( argv[2], ".gz" ) ) {
00053 stringPrintf( export1, "zcat %s", argv[2] );
00054 stringPrintf( export2, "zcat %s", argv[3] );
00055 exportPEParser_initFromPipe( string( export1 ), string( export2 ) );
00056 } else {
00057 exportPEParser_initFromFile( argv[2], argv[3] );
00058 }
00059
00060 ExportPE* currExport;
00061 while ( currExport = exportPEParser_nextEntry() ) {
00062 countExport++;
00063 junction1=0;
00064 junction2=0;
00065 singleEnd* currEnd1 = currExport->end1;
00066 singleEnd* currEnd2 = currExport->end2;
00067 if(currEnd1->filter != 'Y' || currEnd2->filter != 'Y' ) {
00068 continue;
00069 }
00070
00071 if (strStartsWith (currEnd1->chromosome,"QC") || strStartsWith (currEnd2->chromosome,"QC") ) {
00072 continue;
00073 }
00074
00075 fprintf (fp,"%s\n",currEnd1->sequence);
00076 fprintf (fp,"%s\n",currEnd2->sequence);
00077
00078
00079 if (strEqual (currEnd1->chromosome,"NM") || strEqual (currEnd2->chromosome,"NM" ) ) {
00080 continue;
00081 }
00082
00083 if (strchr (currEnd1->chromosome,'M') || strchr (currEnd2->chromosome,'M')) {
00084 continue;
00085 }
00086 if( strStartsWith( currEnd1->chromosome, "ribosomal") || strStartsWith( currEnd2->chromosome, "ribosomal") ) {
00087 continue;
00088 }
00089 if( strchr( currEnd1->chromosome, ':' ) || strchr( currEnd2->chromosome, ':' ) ) {
00090 continue;
00091 }
00092 static Stringa end1 = NULL;
00093 static Stringa end2 = NULL;
00094 stringCreateClear( end1, 30);
00095 stringCreateClear( end2, 30);
00096
00097
00098 readLength = strlen( currEnd1->sequence );
00099 if( strStartsWith( currEnd1->chromosome, "chr") ) {
00100 pos = strchr ( currEnd1->chromosome,'.');
00101 *pos = '\0';
00102 chrEnd1 = atoi( currEnd1->chromosome+3 );
00103 startEnd1 = currEnd1->position;
00104 if( currEnd1->position<0 )
00105 continue;
00106 stringPrintf( end1, "%s:%c:%d:%d:1:%d", currEnd1->chromosome, convertStrand (currEnd1->strand), currEnd1->position, currEnd1->position + readLength - 1, readLength );
00107 } else if( strstr( currEnd1->chromosome, "junction" ) ) {
00108 Texta location = textFieldtokP ( currEnd1->contig, "|" );
00109 int tilestart1 = atoi( textItem( location, 1) );
00110 int tilestart2 = atoi( textItem( location, 2) );
00111 int tileSize = atoi( textItem( location, 3 ) );
00112 if( (tileSize - currEnd1->position + 1 ) >= readLength || ( (tileSize - currEnd1->position) < 0) )
00113 continue;
00114 int targetStart1 = tilestart1 + currEnd1->position;
00115 int targetEnd1 = tilestart1 + tileSize;
00116 int queryStart1 = 1;
00117 int queryEnd1 = (targetEnd1 - targetStart1 + 1);
00118 int basesOnSecondBlock = readLength - (queryEnd1 - queryStart1 + 1);
00119
00120 int targetStart2 = tilestart2 + 1 ;
00121 int targetEnd2 = targetStart2 + basesOnSecondBlock - 1;
00122 int queryStart2 = queryEnd1 + 1;
00123 int queryEnd2 = readLength;
00124 stringPrintf( end1, "%s:%c:%d:%d:%d:%d,%s:%c:%d:%d:%d:%d",
00125 textItem( location, 0),convertStrand (currEnd1->strand),targetStart1,targetEnd1, queryStart1, queryEnd1,
00126 textItem( location, 0),convertStrand (currEnd1->strand),targetStart2,targetEnd2, queryStart2, queryEnd2
00127 );
00128 chrEnd1 = atoi( textItem( location, 0)+3 );
00129 startEnd1 = targetStart1;
00130 textDestroy( location );
00131 junction1 = 1;
00132 } else {
00133 warn( "End 1 not in a proper format:%s", exportPEParser_writeEntry( currEnd1 ) );
00134 continue;
00135 }
00136 readLength = strlen( currEnd2->sequence );
00137 if( strStartsWith( currEnd2->chromosome, "chr") ) {
00138 pos = strchr ( currEnd2->chromosome,'.');
00139 *pos = '\0';
00140 chrEnd2 = atoi( currEnd2->chromosome+3 );
00141 startEnd2 = currEnd2->position;
00142 stringPrintf( end2, "%s:%c:%d:%d:1:%d", currEnd2->chromosome, convertStrand ( currEnd2->strand), currEnd2->position, currEnd2->position + readLength - 1, readLength );
00143 } else if( strstr( currEnd2->chromosome, "junction" ) ) {
00144 Texta location = textFieldtokP ( currEnd2->contig, "|" );
00145
00146 int tilestart1 = atoi( textItem( location, 1) );
00147 int tilestart2 = atoi( textItem( location, 2) );
00148 int tileSize = atoi( textItem( location, 3 ) );
00149 if( (tileSize - currEnd2->position + 1) >= readLength || ( (tileSize - currEnd2->position ) < 0) )
00150 continue;
00151 int targetStart1 = tilestart1 + currEnd2->position;
00152 int targetEnd1 = tilestart1 + tileSize;
00153 int queryStart1 = 1;
00154 int queryEnd1 = (targetEnd1 - targetStart1 + 1);
00155 int basesOnSecondBlock = readLength - (queryEnd1 - queryStart1 + 1);
00156
00157 int targetStart2 = tilestart2 + 1 ;
00158 int targetEnd2 = targetStart2 + basesOnSecondBlock - 1;
00159 int queryStart2 = queryEnd1 + 1;
00160 int queryEnd2 = readLength;
00161 stringPrintf( end2, "%s:%c:%d:%d:%d:%d,%s:%c:%d:%d:%d:%d",
00162 textItem( location, 0),convertStrand (currEnd2->strand),targetStart1, targetEnd1, queryStart1, queryEnd1,
00163 textItem( location, 0),convertStrand (currEnd2->strand),targetStart2, targetEnd2, queryStart2, queryEnd2
00164 );
00165 chrEnd2=atoi(textItem( location, 0)+3 );
00166 startEnd2 = targetStart2;
00167 textDestroy( location );
00168 junction2=1;
00169 } else {
00170 warn( "End 2 not in a proper format:%s", exportPEParser_writeEntry( currEnd2 ) );
00171 continue;
00172 }
00173
00174 if( chrEnd1 < chrEnd2 ) {
00175 printf ("%s|%s\t%s|%s\t%s|%s\n", string(end1), string(end2), currEnd1->sequence, currEnd2->sequence, currEnd1->quality, currEnd2->quality);
00176 } else {
00177 if( chrEnd1 > chrEnd2 ) {
00178 printf ("%s|%s\t%s|%s\t%s|%s\n", string(end2), string(end1), currEnd2->sequence, currEnd1->sequence, currEnd2->quality, currEnd1->quality);
00179 } else {
00180
00181
00182
00183
00184
00185 if ((currEnd1->strand == 'R' && currEnd2->strand == 'F') || (currEnd1->strand == 'F' && currEnd2->strand == 'R') &&
00186 ( junction1 == 0 && junction2 == 0) ) {
00187 if( (startEnd2-startEnd1) > 0 ) {
00188 printf ("%s|%s\t%s|%s\t%s|%s\n", string(end1), string(end2), currEnd1->sequence, currEnd2->sequence, currEnd1->quality, currEnd2->quality);
00189 } else {
00190 printf ("%s|%s\t%s|%s\t%s|%s\n", string(end2), string(end1), currEnd2->sequence, currEnd1->sequence, currEnd2->quality, currEnd1->quality);
00191 }
00192 } else {
00193 if( junction1 == 1 || junction2 == 1 ) {
00194 if( (startEnd2-startEnd1) > 0 ) {
00195 printf ("%s|%s\t%s|%s\t%s|%s\n", string(end1), string(end2), currEnd1->sequence, currEnd2->sequence, currEnd1->quality, currEnd2->quality);
00196 } else {
00197 printf ("%s|%s\t%s|%s\t%s|%s\n", string(end2), string(end1), currEnd2->sequence, currEnd1->sequence, currEnd2->quality, currEnd1->quality);
00198 }
00199 } else {
00200 warn( "Same strand (1):\t%s", exportPEParser_writeEntry ( currEnd1 ) );
00201 warn( "Same strand (2):\t%s", exportPEParser_writeEntry ( currEnd2 ) );
00202 continue;
00203 }
00204 }
00205 }
00206 }
00207 stringDestroy( end1 );
00208 stringDestroy( end2 );
00209 countMRF++;
00210 }
00211 fclose (fp);
00212
00213 stringPrintf (buffer,"%s.meta",argv[1]);
00214 fp = fopen (string (buffer),"w");
00215 if (fp == NULL) {
00216 die ("Unable to open file: %s",string (buffer));
00217 }
00218 fprintf(fp, "Total_number_reads\t%d\nMapped_reads\t%d", countExport,countMRF);
00219 fclose (fp);
00220 stringDestroy (buffer);
00221 stringDestroy (id);
00222 return 0;
00223 }
00224
00225
00226