00001 #include "log.h"
00002 #include "format.h"
00003 #include "linestream.h"
00004 #include "common.h"
00005 #include "bits.h"
00006 #include "mrf.h"
00007
00008
00009
00016 #define INIT_MODE_FROM_FILE 1
00017 #define INIT_MODE_FROM_PIPE 2
00018
00019
00020
00021 static LineStream lsMrf = NULL;
00022 static Bits *presentColumnTypes = NULL;
00023 static Array columnTypes = NULL;
00024 static Texta columnHeaders = NULL;
00025 static Texta comments = NULL;
00026 static char *headerLine = NULL;
00027
00028
00029
00030 static void mrf_addColumnType (char *type)
00031 {
00032 if (strEqual (type,MRF_COLUMN_NAME_BLOCKS)) {
00033 bitSetOne (presentColumnTypes,MRF_COLUMN_TYPE_BLOCKS);
00034 array (columnTypes,arrayMax (columnTypes),int) = MRF_COLUMN_TYPE_BLOCKS;
00035 textAdd (columnHeaders,MRF_COLUMN_NAME_BLOCKS);
00036 }
00037 else if (strEqual (type,MRF_COLUMN_NAME_SEQUENCE)) {
00038 bitSetOne (presentColumnTypes,MRF_COLUMN_TYPE_SEQUENCE);
00039 array (columnTypes,arrayMax (columnTypes),int) = MRF_COLUMN_TYPE_SEQUENCE;
00040 textAdd (columnHeaders,MRF_COLUMN_NAME_SEQUENCE);
00041 }
00042 else if (strEqual (type,MRF_COLUMN_NAME_QUALITY_SCORES)) {
00043 bitSetOne (presentColumnTypes,MRF_COLUMN_TYPE_QUALITY_SCORES);
00044 array (columnTypes,arrayMax (columnTypes),int) = MRF_COLUMN_TYPE_QUALITY_SCORES;
00045 textAdd (columnHeaders,MRF_COLUMN_NAME_QUALITY_SCORES);
00046 }
00047 else if (strEqual (type,MRF_COLUMN_NAME_QUERY_ID)) {
00048 bitSetOne (presentColumnTypes,MRF_COLUMN_TYPE_QUERY_ID);
00049 array (columnTypes,arrayMax (columnTypes),int) = MRF_COLUMN_TYPE_QUERY_ID;
00050 textAdd (columnHeaders,MRF_COLUMN_NAME_QUERY_ID);
00051 }
00052 else {
00053 die ("Unknown presentColumn: %s",type);
00054 }
00055 }
00056
00057
00058
00059 static void mrf_doInit (char *arg, int initMode)
00060 {
00061 Texta tokens;
00062 char *line;
00063 int i;
00064
00065 columnTypes = arrayCreate (20,int);
00066 columnHeaders = textCreate (20);
00067 presentColumnTypes = bitAlloc (100);
00068 comments = textCreate (100);
00069 if (initMode == INIT_MODE_FROM_FILE) {
00070 lsMrf = ls_createFromFile (arg);
00071 }
00072 else if (initMode == INIT_MODE_FROM_PIPE) {
00073 lsMrf = ls_createFromPipe (arg);
00074 }
00075 else {
00076 die ("Unknown init mode");
00077 }
00078 ls_bufferSet (lsMrf,1);
00079 while (line = ls_nextLine (lsMrf)) {
00080 if (line[0] == '#') {
00081 textAdd (comments,line + 1);
00082 }
00083 else {
00084 ls_back (lsMrf,1);
00085 break;
00086 }
00087 }
00088 headerLine = hlr_strdup (ls_nextLine (lsMrf));
00089 tokens = textFieldtokP (headerLine,"\t");
00090 for (i = 0; i < arrayMax (tokens); i++) {
00091 mrf_addColumnType (textItem (tokens,i));
00092 }
00093 }
00094
00095
00096
00101 void mrf_init (char *fileName)
00102 {
00103 mrf_doInit (fileName,INIT_MODE_FROM_FILE);
00104 }
00105
00106
00107
00112 void mrf_initFromPipe (char *cmd)
00113 {
00114 mrf_doInit (cmd,INIT_MODE_FROM_PIPE);
00115 }
00116
00117
00118
00123 void mrf_addNewColumnType (char* columnName)
00124 {
00125 int i;
00126
00127 i = 0;
00128 while (i < arrayMax (columnHeaders)) {
00129 if (strEqual (textItem (columnHeaders,i),columnName)) {
00130 break;
00131 }
00132 i++;
00133 }
00134 if (i == arrayMax (columnHeaders)) {
00135 mrf_addColumnType (columnName);
00136 }
00137 }
00138
00139
00140
00144 void mrf_deInit (void)
00145 {
00146 ls_destroy (lsMrf);
00147 arrayDestroy (columnTypes);
00148 textDestroy (columnHeaders);
00149 bitFree (&presentColumnTypes);
00150 textDestroy (comments);
00151 hlr_free (headerLine);
00152 }
00153
00154
00155
00156 static void mrf_freeReadAttributes (MrfRead *currRead)
00157 {
00158 int i;
00159 MrfBlock *currBlock;
00160
00161 for (i = 0; i < arrayMax (currRead->blocks); i++) {
00162 currBlock = arrp (currRead->blocks,i,MrfBlock);
00163 hlr_free (currBlock->targetName);
00164 }
00165 arrayDestroy (currRead->blocks);
00166 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_SEQUENCE)) {
00167 hlr_free (currRead->sequence);
00168 }
00169 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_QUALITY_SCORES)) {
00170 hlr_free (currRead->qualityScores);
00171 }
00172 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_QUERY_ID)) {
00173 hlr_free (currRead->queryId);
00174 }
00175 }
00176
00177
00178
00179 static void mrf_freeEntry (MrfEntry* currEntry)
00180 {
00181 if (currEntry == NULL) {
00182 return;
00183 }
00184 mrf_freeReadAttributes (&currEntry->read1);
00185 if (currEntry->isPairedEnd == 1) {
00186 mrf_freeReadAttributes (&currEntry->read2);
00187 }
00188 freeMem (currEntry);
00189 }
00190
00191
00192
00193 static void mrf_processBlocks (char *blockString, MrfRead *currRead)
00194 {
00195 Texta blocks;
00196 Texta blockFields;
00197 int i;
00198 MrfBlock *currBlock;
00199
00200 blocks = textFieldtok (blockString,",");
00201 for (i = 0; i < arrayMax (blocks); i++) {
00202 currBlock = arrayp (currRead->blocks,arrayMax (currRead->blocks),MrfBlock);
00203 blockFields = textFieldtok (textItem (blocks,i),":");
00204 currBlock->targetName = hlr_strdup (textItem (blockFields,0));
00205 currBlock->strand = textItem (blockFields,1)[0];
00206 currBlock->targetStart = atoi (textItem (blockFields,2));
00207 currBlock->targetEnd = atoi (textItem (blockFields,3));
00208 currBlock->queryStart = atoi (textItem (blockFields,4));
00209 currBlock->queryEnd = atoi (textItem (blockFields,5));
00210 textDestroy (blockFields);
00211 }
00212 textDestroy (blocks);
00213 }
00214
00215
00216
00217 static MrfEntry* mrf_processNextEntry (int freeMemory)
00218 {
00219 static MrfEntry *currEntry = NULL;
00220 char *line,*token,*pos;
00221 WordIter w;
00222 int index,columnType;
00223
00224 while (line = ls_nextLine (lsMrf)) {
00225 if (line[0] == '\0' || line[0] == '#' || strEqual (line,headerLine)) {
00226 continue;
00227 }
00228 if (freeMemory) {
00229 mrf_freeEntry (currEntry);
00230 }
00231 AllocVar (currEntry);
00232 if (strchr (line,'|')) {
00233 currEntry->isPairedEnd = 1;
00234 }
00235 else {
00236 currEntry->isPairedEnd = 0;
00237 }
00238 index = 0;
00239 w = wordIterCreate (line,"\t",0);
00240 while (token = wordNext (w)) {
00241 columnType = arru (columnTypes,index,int);
00242 if (columnType == MRF_COLUMN_TYPE_BLOCKS) {
00243 if (currEntry->isPairedEnd == 1) {
00244 pos = strchr (token,'|');
00245 *pos = '\0';
00246 currEntry->read1.blocks = arrayCreate (2,MrfBlock);
00247 currEntry->read2.blocks = arrayCreate (2,MrfBlock);
00248 mrf_processBlocks (token,&currEntry->read1);
00249 mrf_processBlocks (pos + 1,&currEntry->read2);
00250
00251 }
00252 else {
00253 currEntry->read1.blocks = arrayCreate (2,MrfBlock);
00254 mrf_processBlocks (token,&currEntry->read1);
00255 }
00256 }
00257 else if (columnType == MRF_COLUMN_TYPE_SEQUENCE) {
00258 if (currEntry->isPairedEnd == 1) {
00259 pos = strchr (token,'|');
00260 *pos = '\0';
00261 currEntry->read1.sequence = hlr_strdup (token);
00262 currEntry->read2.sequence = hlr_strdup (pos + 1);
00263 }
00264 else {
00265 currEntry->read1.sequence = hlr_strdup (token);
00266 }
00267 }
00268 else if (columnType == MRF_COLUMN_TYPE_QUALITY_SCORES) {
00269 if (currEntry->isPairedEnd == 1) {
00270 pos = strchr (token,'|');
00271 *pos = '\0';
00272 currEntry->read1.qualityScores = hlr_strdup (token);
00273 currEntry->read2.qualityScores = hlr_strdup (pos + 1);
00274 }
00275 else {
00276 currEntry->read1.qualityScores = hlr_strdup (token);
00277 }
00278 }
00279 else if (columnType == MRF_COLUMN_TYPE_QUERY_ID) {
00280 if (currEntry->isPairedEnd == 1) {
00281 pos = strchr (token,'|');
00282 *pos = '\0';
00283 currEntry->read1.queryId = hlr_strdup (token);
00284 currEntry->read2.queryId = hlr_strdup (pos + 1);
00285 }
00286 else {
00287 currEntry->read1.queryId = hlr_strdup (token);
00288 }
00289 }
00290 else {
00291 die ("Unknown columnType: %d",columnType);
00292 }
00293 index++;
00294 }
00295 wordIterDestroy (w);
00296 return currEntry;
00297 }
00298 if (freeMemory) {
00299 mrf_freeEntry (currEntry);
00300 }
00301 currEntry = NULL;
00302 return currEntry;
00303 }
00304
00305
00306
00311 MrfEntry* mrf_nextEntry (void)
00312 {
00313 return mrf_processNextEntry (1);
00314 }
00315
00316
00317
00323 Array mrf_parse (void)
00324 {
00325 Array mrfEntries;
00326 MrfEntry *currEntry;
00327
00328 mrfEntries = arrayCreate (100000,MrfEntry);
00329 while (currEntry = mrf_processNextEntry (0)) {
00330 array (mrfEntries,arrayMax (mrfEntries),MrfEntry) = *currEntry;
00331 }
00332 return mrfEntries;
00333 }
00334
00335
00336
00337 static void mrf_addTab (Stringa buffer, int *first)
00338 {
00339 if (*first == 1) {
00340 *first = 0;
00341 return;
00342 }
00343 stringCatChar (buffer,'\t');
00344 }
00345
00349 int getReadLength (MrfRead *currRead)
00350 {
00351 MrfBlock *currBlock;
00352 int i;
00353 int sum;
00354
00355 sum = 0;
00356 for (i = 0; i < arrayMax (currRead->blocks); i++) {
00357 currBlock = arrp (currRead->blocks,i,MrfBlock);
00358 sum += (currBlock->targetEnd - currBlock->targetStart + 1);
00359 }
00360 return sum;
00361 }
00362
00363
00368 char* mrf_writeHeader (void)
00369 {
00370 static Stringa buffer = NULL;
00371 int i;
00372
00373 stringCreateClear (buffer,100);
00374 for (i = 0; i < arrayMax (comments); i++) {
00375 stringAppendf (buffer,"#%s\n",textItem (comments,i));
00376 }
00377 for (i = 0; i < arrayMax (columnHeaders); i++) {
00378 stringAppendf (buffer,"%s%s",textItem (columnHeaders,i),
00379 i < arrayMax (columnHeaders) - 1 ? "\t" : "");
00380 }
00381 return string (buffer);
00382 }
00383
00384
00385
00386 static void mrf_writeBlocks (Stringa buffer, Array blocks)
00387 {
00388 MrfBlock *currBlock;
00389 int i;
00390
00391 for (i = 0; i < arrayMax (blocks); i++) {
00392 currBlock = arrp (blocks,i,MrfBlock);
00393 stringAppendf (buffer,"%s:%c:%d:%d:%d:%d%s",currBlock->targetName,
00394 currBlock->strand,currBlock->targetStart,currBlock->targetEnd,
00395 currBlock->queryStart,currBlock->queryEnd,
00396 i < arrayMax (blocks) - 1 ? "," : "");
00397 }
00398 }
00399
00400
00401
00406 char* mrf_writeEntry (MrfEntry *currEntry)
00407 {
00408 static Stringa buffer = NULL;
00409 int first;
00410 int i;
00411 int columnType;
00412
00413 stringCreateClear (buffer,100);
00414 first = 1;
00415 for (i = 0; i < arrayMax (columnTypes); i++) {
00416 columnType = arru (columnTypes,i,int);
00417 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_BLOCKS) && columnType == MRF_COLUMN_TYPE_BLOCKS) {
00418 mrf_addTab (buffer,&first);
00419 if (currEntry->isPairedEnd == 1) {
00420 mrf_writeBlocks (buffer,currEntry->read1.blocks);
00421 stringCatChar (buffer,'|');
00422 mrf_writeBlocks (buffer,currEntry->read2.blocks);
00423 }
00424 else {
00425 mrf_writeBlocks (buffer,currEntry->read1.blocks);
00426 }
00427 }
00428 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_SEQUENCE) && columnType == MRF_COLUMN_TYPE_SEQUENCE) {
00429 mrf_addTab (buffer,&first);
00430 if (currEntry->isPairedEnd == 1) {
00431 stringAppendf (buffer,"%s|%s",currEntry->read1.sequence,currEntry->read2.sequence);
00432 }
00433 else {
00434 stringAppendf (buffer,"%s",currEntry->read1.sequence);
00435 }
00436 }
00437 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_QUALITY_SCORES) && columnType == MRF_COLUMN_TYPE_QUALITY_SCORES) {
00438 mrf_addTab (buffer,&first);
00439 if (currEntry->isPairedEnd == 1) {
00440 stringAppendf (buffer,"%s|%s",currEntry->read1.qualityScores,currEntry->read2.qualityScores);
00441 }
00442 else {
00443 stringAppendf (buffer,"%s",currEntry->read1.qualityScores);
00444 }
00445 }
00446 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_QUERY_ID) && columnType == MRF_COLUMN_TYPE_QUERY_ID) {
00447 mrf_addTab (buffer,&first);
00448 if (currEntry->isPairedEnd == 1) {
00449 stringAppendf (buffer,"%s|%s",currEntry->read1.queryId,currEntry->read2.queryId);
00450 }
00451 else {
00452 stringAppendf (buffer,"%s",currEntry->read1.queryId);
00453 }
00454 }
00455 }
00456 return string (buffer);
00457 }
00458
00459