00001 #include "log.h"
00002 #include "format.h"
00003 #include "linestream.h"
00004 #include "util.h"
00005 #include "bits.h"
00006
00007 int getNucleotideOverlap ( BlatQuery* blQ ) {
00008 int l;
00009 PslEntry* blE=NULL;
00010 int qSize = arrp( blQ->entries, arrayMax(blQ->entries)-1, PslEntry)->qSize;
00011 int maxOverlap=0;
00012
00013
00014
00015 for( l=0; l < arrayMax ( blQ->entries ); l++ ) {
00016 blE = arrp( blQ->entries, l, PslEntry );
00017 if( blE->qSize != qSize ) die("Query size different from PslEntry query size: qSize(%d) - blE->qSize(%d)", qSize, blE->qSize);
00018 if( ((blE->qEnd - blE->qStart)+1) > maxOverlap ) maxOverlap = (blE->qEnd - blE->qStart)+1;
00019
00020
00021
00022
00023
00024
00025
00026
00027 }
00028
00029
00030 return maxOverlap;
00031 }
00032
00033 Array util_readKnownGeneXrefs (char* fileName)
00034 {
00035 WordIter w;
00036 LineStream ls;
00037 char *line,*pos;
00038 Array kgXrefs;
00039 KgXref *currKgXref;
00040
00041 kgXrefs = arrayCreate (50000,KgXref);
00042 ls = ls_createFromFile (fileName);
00043 while (line = ls_nextLine (ls)) {
00044 if (line[0] == '\0') {
00045 continue;
00046 }
00047 currKgXref = arrayp (kgXrefs,arrayMax (kgXrefs),KgXref);
00048 w = wordIterCreate (line,"\t",0);
00049 currKgXref->transcriptName = hlr_strdup (wordNext (w));
00050 wordNext (w);
00051 currKgXref->swissProt = hlr_strdup (wordNext (w));
00052 currKgXref->uniprotId = hlr_strdup (wordNext (w));
00053 currKgXref->geneSymbol = hlr_strdup (wordNext (w));
00054 currKgXref->refseqId = hlr_strdup (wordNext (w));
00055 wordNext (w);
00056 currKgXref->refseqDescription = hlr_strdup (wordNext (w));
00057 wordIterDestroy (w);
00058 if (pos = strchr (currKgXref->uniprotId,'-')) {
00059 *pos = '\0';
00060 }
00061 if (pos = strchr (currKgXref->swissProt,'-')) {
00062 *pos = '\0';
00063 }
00064 }
00065 ls_destroy (ls);
00066 return kgXrefs;
00067 }
00068
00069
00070
00071 Array util_readKnownGeneTreeFams (char* fileName)
00072 {
00073 LineStream ls;
00074 char* line;
00075 char* pos;
00076 Array kgTreeFams;
00077 KgTreeFam *currKgTreeFam;
00078
00079 kgTreeFams = arrayCreate (30000,KgTreeFam);
00080 ls = ls_createFromFile (fileName);
00081 while (line = ls_nextLine (ls)) {
00082 if (line[0] == '\0') {
00083 continue;
00084 }
00085 if (pos = strchr (line,'\t')) {
00086 *pos = '\0';
00087 currKgTreeFam = arrayp (kgTreeFams,arrayMax (kgTreeFams),KgTreeFam);
00088 currKgTreeFam->transcriptName = hlr_strdup (line);
00089 currKgTreeFam->treeFamId = hlr_strdup (pos + 1);
00090 }
00091 }
00092 ls_destroy (ls);
00093 return kgTreeFams;
00094 }
00095
00096
00097
00098 int sortKgXrefsByTranscriptName (KgXref *a, KgXref *b)
00099 {
00100 return strcmp (a->transcriptName,b->transcriptName);
00101 }
00102
00103
00104
00105 static char* convert2string (Texta t)
00106 {
00107 static Stringa buffer = NULL;
00108 int i;
00109
00110 stringCreateClear (buffer,100);
00111 for (i = 0; i < arrayMax (t); i++) {
00112 stringAppendf (buffer,"%s%s",textItem (t,i),i < arrayMax (t) - 1 ? "|" : "");
00113 }
00114 return string (buffer);
00115 }
00116
00117
00118
00119 void transcript2geneSymbolAndGeneDescription (Array kgXrefs, char *transcriptName, char** geneSymbol, char **description)
00120 {
00121 Texta tokens;
00122 int i;
00123 KgXref testKX,*currKX;
00124 int index;
00125 static Texta descriptions = NULL;
00126 static Texta geneSymbols = NULL;
00127
00128 textCreateClear (descriptions,100);
00129 textCreateClear (geneSymbols,100);
00130 tokens = textFieldtokP (transcriptName,"|");
00131 for (i = 0; i < arrayMax (tokens); i++) {
00132 testKX.transcriptName = hlr_strdup (textItem (tokens,i));
00133 if (!arrayFind (kgXrefs,&testKX,&index,(ARRAYORDERF)sortKgXrefsByTranscriptName)) {
00134 die ("Expected to find KgXref: %s",testKX.transcriptName);
00135 }
00136 currKX = arrp (kgXrefs,index,KgXref);
00137 if (currKX->refseqDescription[0] != '\0') {
00138 textAdd (descriptions,currKX->refseqDescription);
00139 }
00140 if (currKX->geneSymbol[0] != '\0') {
00141 textAdd (geneSymbols,currKX->geneSymbol);
00142 }
00143 hlr_free (testKX.transcriptName);
00144 }
00145 textDestroy (tokens);
00146 textUniqKeepOrder (descriptions);
00147 textUniqKeepOrder (geneSymbols);
00148 *geneSymbol = hlr_strdup (convert2string (geneSymbols));
00149 *description = hlr_strdup (convert2string (descriptions));
00150 }
00151