00001 #include "log.h" 00002 #include "format.h" 00003 #include "linestream.h" 00004 #include "common.h" 00005 #include "bits.h" 00006 #include "mrf.h" 00007 00008 00009 00016 #define INIT_MODE_FROM_FILE 1 00017 #define INIT_MODE_FROM_PIPE 2 00018 00019 00020 00021 static LineStream lsMrf = NULL; 00022 static Bits *presentColumnTypes = NULL; 00023 static Array columnTypes = NULL; 00024 static Texta columnHeaders = NULL; 00025 static Texta comments = NULL; 00026 static char *headerLine = NULL; 00027 00028 00029 00030 static void mrf_addColumnType (char *type) 00031 { 00032 if (strEqual (type,MRF_COLUMN_NAME_BLOCKS)) { 00033 bitSetOne (presentColumnTypes,MRF_COLUMN_TYPE_BLOCKS); 00034 array (columnTypes,arrayMax (columnTypes),int) = MRF_COLUMN_TYPE_BLOCKS; 00035 textAdd (columnHeaders,MRF_COLUMN_NAME_BLOCKS); 00036 } 00037 else if (strEqual (type,MRF_COLUMN_NAME_SEQUENCE)) { 00038 bitSetOne (presentColumnTypes,MRF_COLUMN_TYPE_SEQUENCE); 00039 array (columnTypes,arrayMax (columnTypes),int) = MRF_COLUMN_TYPE_SEQUENCE; 00040 textAdd (columnHeaders,MRF_COLUMN_NAME_SEQUENCE); 00041 } 00042 else if (strEqual (type,MRF_COLUMN_NAME_QUALITY_SCORES)) { 00043 bitSetOne (presentColumnTypes,MRF_COLUMN_TYPE_QUALITY_SCORES); 00044 array (columnTypes,arrayMax (columnTypes),int) = MRF_COLUMN_TYPE_QUALITY_SCORES; 00045 textAdd (columnHeaders,MRF_COLUMN_NAME_QUALITY_SCORES); 00046 } 00047 else if (strEqual (type,MRF_COLUMN_NAME_QUERY_ID)) { 00048 bitSetOne (presentColumnTypes,MRF_COLUMN_TYPE_QUERY_ID); 00049 array (columnTypes,arrayMax (columnTypes),int) = MRF_COLUMN_TYPE_QUERY_ID; 00050 textAdd (columnHeaders,MRF_COLUMN_NAME_QUERY_ID); 00051 } 00052 else { 00053 die ("Unknown presentColumn: %s",type); 00054 } 00055 } 00056 00057 00058 00059 static void mrf_doInit (char *arg, int initMode) 00060 { 00061 Texta tokens; 00062 char *line; 00063 int i; 00064 00065 columnTypes = arrayCreate (20,int); 00066 columnHeaders = textCreate (20); 00067 presentColumnTypes = bitAlloc (100); 00068 comments = textCreate (100); 00069 if (initMode == INIT_MODE_FROM_FILE) { 00070 lsMrf = ls_createFromFile (arg); 00071 } 00072 else if (initMode == INIT_MODE_FROM_PIPE) { 00073 lsMrf = ls_createFromPipe (arg); 00074 } 00075 else { 00076 die ("Unknown init mode"); 00077 } 00078 ls_bufferSet (lsMrf,1); 00079 while (line = ls_nextLine (lsMrf)) { 00080 if (line[0] == '#') { 00081 textAdd (comments,line + 1); 00082 } 00083 else { 00084 ls_back (lsMrf,1); 00085 break; 00086 } 00087 } 00088 headerLine = hlr_strdup (ls_nextLine (lsMrf)); 00089 tokens = textFieldtokP (headerLine,"\t"); 00090 for (i = 0; i < arrayMax (tokens); i++) { 00091 mrf_addColumnType (textItem (tokens,i)); 00092 } 00093 } 00094 00095 00096 00101 void mrf_init (char *fileName) 00102 { 00103 mrf_doInit (fileName,INIT_MODE_FROM_FILE); 00104 } 00105 00106 00107 00112 void mrf_initFromPipe (char *cmd) 00113 { 00114 mrf_doInit (cmd,INIT_MODE_FROM_PIPE); 00115 } 00116 00117 00118 00123 void mrf_addNewColumnType (char* columnName) 00124 { 00125 int i; 00126 00127 i = 0; 00128 while (i < arrayMax (columnHeaders)) { 00129 if (strEqual (textItem (columnHeaders,i),columnName)) { 00130 break; 00131 } 00132 i++; 00133 } 00134 if (i == arrayMax (columnHeaders)) { 00135 mrf_addColumnType (columnName); 00136 } 00137 } 00138 00139 00140 00144 void mrf_deInit (void) 00145 { 00146 ls_destroy (lsMrf); 00147 arrayDestroy (columnTypes); 00148 textDestroy (columnHeaders); 00149 bitFree (&presentColumnTypes); 00150 textDestroy (comments); 00151 hlr_free (headerLine); 00152 } 00153 00154 00155 00156 static void mrf_freeReadAttributes (MrfRead *currRead) 00157 { 00158 int i; 00159 MrfBlock *currBlock; 00160 00161 for (i = 0; i < arrayMax (currRead->blocks); i++) { 00162 currBlock = arrp (currRead->blocks,i,MrfBlock); 00163 hlr_free (currBlock->targetName); 00164 } 00165 arrayDestroy (currRead->blocks); 00166 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_SEQUENCE)) { 00167 hlr_free (currRead->sequence); 00168 } 00169 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_QUALITY_SCORES)) { 00170 hlr_free (currRead->qualityScores); 00171 } 00172 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_QUERY_ID)) { 00173 hlr_free (currRead->queryId); 00174 } 00175 } 00176 00177 00178 00179 static void mrf_freeEntry (MrfEntry* currEntry) 00180 { 00181 if (currEntry == NULL) { 00182 return; 00183 } 00184 mrf_freeReadAttributes (&currEntry->read1); 00185 if (currEntry->isPairedEnd == 1) { 00186 mrf_freeReadAttributes (&currEntry->read2); 00187 } 00188 freeMem (currEntry); 00189 } 00190 00191 00192 00193 static void mrf_processBlocks (char *blockString, MrfRead *currRead) 00194 { 00195 Texta blocks; 00196 Texta blockFields; 00197 int i; 00198 MrfBlock *currBlock; 00199 00200 blocks = textFieldtok (blockString,","); 00201 for (i = 0; i < arrayMax (blocks); i++) { 00202 currBlock = arrayp (currRead->blocks,arrayMax (currRead->blocks),MrfBlock); 00203 blockFields = textFieldtok (textItem (blocks,i),":"); 00204 currBlock->targetName = hlr_strdup (textItem (blockFields,0)); 00205 currBlock->strand = textItem (blockFields,1)[0]; 00206 currBlock->targetStart = atoi (textItem (blockFields,2)); 00207 currBlock->targetEnd = atoi (textItem (blockFields,3)); 00208 currBlock->queryStart = atoi (textItem (blockFields,4)); 00209 currBlock->queryEnd = atoi (textItem (blockFields,5)); 00210 textDestroy (blockFields); 00211 } 00212 textDestroy (blocks); 00213 } 00214 00215 00216 00217 static MrfEntry* mrf_processNextEntry (int freeMemory) 00218 { 00219 static MrfEntry *currEntry = NULL; 00220 char *line,*token,*pos; 00221 WordIter w; 00222 int index,columnType; 00223 00224 while (line = ls_nextLine (lsMrf)) { 00225 if (line[0] == '\0' || line[0] == '#' || strEqual (line,headerLine)) { 00226 continue; 00227 } 00228 if (freeMemory) { 00229 mrf_freeEntry (currEntry); 00230 } 00231 AllocVar (currEntry); 00232 if (strchr (line,'|')) { 00233 currEntry->isPairedEnd = 1; 00234 } 00235 else { 00236 currEntry->isPairedEnd = 0; 00237 } 00238 index = 0; 00239 w = wordIterCreate (line,"\t",0); 00240 while (token = wordNext (w)) { 00241 columnType = arru (columnTypes,index,int); 00242 if (columnType == MRF_COLUMN_TYPE_BLOCKS) { 00243 if (currEntry->isPairedEnd == 1) { 00244 pos = strchr (token,'|'); 00245 *pos = '\0'; 00246 currEntry->read1.blocks = arrayCreate (2,MrfBlock); 00247 currEntry->read2.blocks = arrayCreate (2,MrfBlock); 00248 mrf_processBlocks (token,&currEntry->read1); 00249 mrf_processBlocks (pos + 1,&currEntry->read2); 00250 00251 } 00252 else { 00253 currEntry->read1.blocks = arrayCreate (2,MrfBlock); 00254 mrf_processBlocks (token,&currEntry->read1); 00255 } 00256 } 00257 else if (columnType == MRF_COLUMN_TYPE_SEQUENCE) { 00258 if (currEntry->isPairedEnd == 1) { 00259 pos = strchr (token,'|'); 00260 *pos = '\0'; 00261 currEntry->read1.sequence = hlr_strdup (token); 00262 currEntry->read2.sequence = hlr_strdup (pos + 1); 00263 } 00264 else { 00265 currEntry->read1.sequence = hlr_strdup (token); 00266 } 00267 } 00268 else if (columnType == MRF_COLUMN_TYPE_QUALITY_SCORES) { 00269 if (currEntry->isPairedEnd == 1) { 00270 pos = strchr (token,'|'); 00271 *pos = '\0'; 00272 currEntry->read1.qualityScores = hlr_strdup (token); 00273 currEntry->read2.qualityScores = hlr_strdup (pos + 1); 00274 } 00275 else { 00276 currEntry->read1.qualityScores = hlr_strdup (token); 00277 } 00278 } 00279 else if (columnType == MRF_COLUMN_TYPE_QUERY_ID) { 00280 if (currEntry->isPairedEnd == 1) { 00281 pos = strchr (token,'|'); 00282 *pos = '\0'; 00283 currEntry->read1.queryId = hlr_strdup (token); 00284 currEntry->read2.queryId = hlr_strdup (pos + 1); 00285 } 00286 else { 00287 currEntry->read1.queryId = hlr_strdup (token); 00288 } 00289 } 00290 else { 00291 die ("Unknown columnType: %d",columnType); 00292 } 00293 index++; 00294 } 00295 wordIterDestroy (w); 00296 return currEntry; 00297 } 00298 if (freeMemory) { 00299 mrf_freeEntry (currEntry); 00300 } 00301 currEntry = NULL; 00302 return currEntry; 00303 } 00304 00305 00306 00311 MrfEntry* mrf_nextEntry (void) 00312 { 00313 return mrf_processNextEntry (1); 00314 } 00315 00316 00317 00323 Array mrf_parse (void) 00324 { 00325 Array mrfEntries; 00326 MrfEntry *currEntry; 00327 00328 mrfEntries = arrayCreate (100000,MrfEntry); 00329 while (currEntry = mrf_processNextEntry (0)) { 00330 array (mrfEntries,arrayMax (mrfEntries),MrfEntry) = *currEntry; 00331 } 00332 return mrfEntries; 00333 } 00334 00335 00336 00337 static void mrf_addTab (Stringa buffer, int *first) 00338 { 00339 if (*first == 1) { 00340 *first = 0; 00341 return; 00342 } 00343 stringCatChar (buffer,'\t'); 00344 } 00345 00349 int getReadLength (MrfRead *currRead) 00350 { 00351 MrfBlock *currBlock; 00352 int i; 00353 int sum; 00354 00355 sum = 0; 00356 for (i = 0; i < arrayMax (currRead->blocks); i++) { 00357 currBlock = arrp (currRead->blocks,i,MrfBlock); 00358 sum += (currBlock->targetEnd - currBlock->targetStart + 1); 00359 } 00360 return sum; 00361 } 00362 00363 00368 char* mrf_writeHeader (void) 00369 { 00370 static Stringa buffer = NULL; 00371 int i; 00372 00373 stringCreateClear (buffer,100); 00374 for (i = 0; i < arrayMax (comments); i++) { 00375 stringAppendf (buffer,"#%s\n",textItem (comments,i)); 00376 } 00377 for (i = 0; i < arrayMax (columnHeaders); i++) { 00378 stringAppendf (buffer,"%s%s",textItem (columnHeaders,i), 00379 i < arrayMax (columnHeaders) - 1 ? "\t" : ""); 00380 } 00381 return string (buffer); 00382 } 00383 00384 00385 00386 static void mrf_writeBlocks (Stringa buffer, Array blocks) 00387 { 00388 MrfBlock *currBlock; 00389 int i; 00390 00391 for (i = 0; i < arrayMax (blocks); i++) { 00392 currBlock = arrp (blocks,i,MrfBlock); 00393 stringAppendf (buffer,"%s:%c:%d:%d:%d:%d%s",currBlock->targetName, 00394 currBlock->strand,currBlock->targetStart,currBlock->targetEnd, 00395 currBlock->queryStart,currBlock->queryEnd, 00396 i < arrayMax (blocks) - 1 ? "," : ""); 00397 } 00398 } 00399 00400 00401 00406 char* mrf_writeEntry (MrfEntry *currEntry) 00407 { 00408 static Stringa buffer = NULL; 00409 int first; 00410 int i; 00411 int columnType; 00412 00413 stringCreateClear (buffer,100); 00414 first = 1; 00415 for (i = 0; i < arrayMax (columnTypes); i++) { 00416 columnType = arru (columnTypes,i,int); 00417 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_BLOCKS) && columnType == MRF_COLUMN_TYPE_BLOCKS) { 00418 mrf_addTab (buffer,&first); 00419 if (currEntry->isPairedEnd == 1) { 00420 mrf_writeBlocks (buffer,currEntry->read1.blocks); 00421 stringCatChar (buffer,'|'); 00422 mrf_writeBlocks (buffer,currEntry->read2.blocks); 00423 } 00424 else { 00425 mrf_writeBlocks (buffer,currEntry->read1.blocks); 00426 } 00427 } 00428 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_SEQUENCE) && columnType == MRF_COLUMN_TYPE_SEQUENCE) { 00429 mrf_addTab (buffer,&first); 00430 if (currEntry->isPairedEnd == 1) { 00431 stringAppendf (buffer,"%s|%s",currEntry->read1.sequence,currEntry->read2.sequence); 00432 } 00433 else { 00434 stringAppendf (buffer,"%s",currEntry->read1.sequence); 00435 } 00436 } 00437 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_QUALITY_SCORES) && columnType == MRF_COLUMN_TYPE_QUALITY_SCORES) { 00438 mrf_addTab (buffer,&first); 00439 if (currEntry->isPairedEnd == 1) { 00440 stringAppendf (buffer,"%s|%s",currEntry->read1.qualityScores,currEntry->read2.qualityScores); 00441 } 00442 else { 00443 stringAppendf (buffer,"%s",currEntry->read1.qualityScores); 00444 } 00445 } 00446 if (bitReadOne (presentColumnTypes,MRF_COLUMN_TYPE_QUERY_ID) && columnType == MRF_COLUMN_TYPE_QUERY_ID) { 00447 mrf_addTab (buffer,&first); 00448 if (currEntry->isPairedEnd == 1) { 00449 stringAppendf (buffer,"%s|%s",currEntry->read1.queryId,currEntry->read2.queryId); 00450 } 00451 else { 00452 stringAppendf (buffer,"%s",currEntry->read1.queryId); 00453 } 00454 } 00455 } 00456 return string (buffer); 00457 } 00458 00459