#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #%% Convert wiggle file into bin file; #%% Divide the genome into small bin of 100-bp in size; For each bin, calculate the average signal of a chromatin feature #%% Each bin is assigened an ID like chrI_x, (the bin corresponding to the one from (x-1)*100 to x*100-1 on chrI ###################### rm(list=ls()) chr.len = c(15072471,15279373,13783731,17493835,20919618,17718904,13844) # length of each worm chromosome chr.nam = paste("chr", c("I","II","III","IV","V", "X", "M"), sep="") # name of each worm chromosome chr.len = chr.len[1:6] chr.nam = chr.nam[1:6] wig.Dir = "/home1/yourname/yourwigfiles/" # where you put your wiggle files tar.Dir = "/home1/yourname/yourwigfiles/bin/" # where you want to put the bin files setwd(wig.Dir) ## setup your working directory myinf = dir() tag = substr(myinf, nchar(myinf)-3, nchar(myinf)) myinf = myinf[tag==".wig"] length(myinf) for(k in 1:length(myinf)) { cat("\r", k) setwd(curDir) ## @@ [1] tmpDir = substr(myinf[k], 1, nchar(myinf[k])-4) comm = paste("wc -l ", myinf[k], sep="") totLin = system(comm, intern=T ) totLin = as.numeric(unlist(strsplit(totLin, " "))[1]) conIn = file(myinf[k], "r") comm = paste("mkdir ", tmpDir, sep="") system(comm) comm = paste("grep -n chrom= ", myinf[k], ">./", tmpDir,"/info.txt", sep="" ) system(comm) tmpDir = paste(curDir,tmpDir, sep="") #--- setwd(tmpDir) info = read.table("info.txt", sep="") pos = unlist(strsplit(as.character(info[,1]), ":")) cnum = length(pos)/2 pos = as.numeric(pos[(1:cnum)*2-1]) # start Line Num of each chromosome pos = c(pos, totLin+1) for(i in 1:(length(pos)-1)) { pos[i] = pos[i+1]-pos[i]-1 } chr = as.character(info[,2]) chr = substr(chr,7, nchar(chr)) chr = paste("chr", chr, sep="") #--- curLine = readLines(conIn, 1) # skip the 1st line: track ... for(i in 1:length(chr)) { tmpFile = paste(chr[i], ".wig", sep="") curLine = readLines(conIn, 1) conOut = file(tmpFile, "w") curLine = readLines(conIn, pos[i]) writeLines(curLine, conOut) close(conOut) }#create data for each chromosome close(conIn) #--- tmpFile = substr(myinf[k], 1, nchar(myinf[k])-4) tmpFile = paste(tar.Dir, tmpFile, ".bin", sep="") ##@@ [3] conOut = file(tmpFile, "w") cat("\n") for(i in 1:length(chr.nam)) { cat("\r", i) tmpFile = paste(chr.nam[i], ".wig", sep="") conIn = file(tmpFile, "r") sig = readLines(conIn, -1) close(conIn) #--- sig = matrix(unlist(strsplit(sig, "\t")), ncol=2, byrow=T) tmp1 = c(0,0) tmp2 = c(chr.len[i], 0) sig = rbind(tmp1, sig, tmp2) res = rep(0, ceiling(chr.len[i]/100)) for(j in 1:(nrow(sig)-1)) { mysta = as.numeric(sig[j,1]) myend = as.numeric(sig[j+1,1])-1 if(mysta>chr.len[i]) break if(myend>chr.len[i]) myend = chr.len[i] myval = as.numeric(sig[j,2]) idx1 = floor(mysta/100) + 1 idx2 = floor(myend/100) + 1 if(idx1==idx2) { res[idx1] = (myend-mysta+1)*myval*0.01 + res[idx1] } if(idx2-idx1==1) { res[idx1] = (100*idx1-mysta)*myval*0.01 + res[idx1] res[idx2] = (myend-100*idx1+1)*myval*0.01 + res[idx2] } if(idx2-idx1>1) { res[idx1] = (100*idx1-mysta)*myval*0.01 + res[idx1] res[(idx1+1):(idx2-1)] = myval + res[(idx1+1):(idx2-1)] res[idx2] = (myend-100*(idx2-1)+1)*myval*0.01 + res[idx2] } } ## end of the j res = res[1:(length(res)-1)] # discard the last bin tmp = 1:floor(chr.len[i]/100) tmp = paste(chr.nam[i], tmp, sep="_") curLine = paste(tmp, res, sep="\t") writeLines(curLine, conOut) } # end of chrom i close(conOut) #--- setwd(curDir) comm = paste("rm -r ", tmpDir, sep="") system(comm) } EOF ################################################################### #%% combined the bin files into a single file: predictor matrix for all bins rm(list=ls()) myoutf = "/home1/yourname/yourwigfiles/bin/predictor.mat" ## set the file name here for the predictor matrix tmpf1 = "/home1/yourname/yourwigfiles/bin/tmpf1" ## temporary file 1 tmpf2 = "//home1/yourname/yourwigfiles/bin/tmpf2" ## ## temporary file 1 myDir = "/home1/yourname/yourwigfiles/bin/" ## the directory that contains the bin files setwd(myDir) myinf = dir(myDir) comm = paste("cp ", myinf[1], " ", myoutf, sep="") system(comm) for(k in 2:length(myinf)) { cat("\r", k) comm = paste("cut -f 2 ", myinf[k], ">", tmpf1, sep="") system(comm) comm = paste("paste ", myoutf, " ", tmpf1, " >", tmpf2, sep="") system(comm) comm = paste("rm -f ", myoutf, sep="") system(comm) comm = paste("mv ", tmpf2, " ", myoutf, sep="") system(comm) } comm = paste("rm -f tmpf*") system(comm) # delete temporary files ################################################################### ## Prepare the bin IDs for each gene ## For each gene gene, create 160 bins ## 40 bins are from the TSS to upstream 4kb: Bin 1, 2, ..., 40 ## 40 bins are from the TSS to downstream 4kb: Bin 41, 42, ..., 80 ## 40 bins are from the TTS to upstream 4kb: Bin 81, 82, ..., 120 ## 40 bins are from the TSS to downstream 4kb: Bin 121, 122, ..., 160 ## rm(list=ls()) setwd("/home1/yourname/yourfiles/") myinf1= "/home1/yourname/yourfiles/ws180_transcript.annotation" ## the annotation of all worm genes, can be download from webpage myoutf1 = "/home1/yourname/yourfiles/ws180_transcript_bin_no.txt" ## the file in which you want to save the bin names chr.len = c(15072471,15279373,13783731,17493835,20919618,17718904,13844) chr.nam = c("I","II","III","IV","V", "X", "M") chr.len = chr.len[1:6] chr.nam = chr.nam[1:6] chr.len = floor(chr.len/100)-1 data =read.table(myinf1, sep="\t", header=F, colClasses = c(rep("character", 3), rep("numeric",2), "character")) tmp = sort(data[,2]) # unique name only tmp = rle(tmp) tmp = tmp$values[tmp$lengths==1] data = data[data[,2]%in%tmp, ] comm = paste("rm -f ", myoutf1, sep="") system(comm) for(k in 1:length(chr.nam)) { cat("\r", k) subdata1 = data[data[,1] == chr.nam[k] & data[,6]=="+", ] res1 = matrix(rep(0, nrow(subdata1)*160), ncol=160) mysta = floor(subdata1[,4]/100)+1 myend = floor(subdata1[,5]/100)+1 for(i in 1:40) { res1[, i] = mysta-i+1 res1[,40+i] = mysta+i res1[,80+i] = myend-i+1 res1[,120+i] = myend+i } subdata2 = data[data[,1] == chr.nam[k] & data[,6]=="-", ] res2 = matrix(rep(0, nrow(subdata2)*160), ncol=160) mysta = floor(subdata2[,4]/100)+1 myend = floor(subdata2[,5]/100)+1 for(i in 1:40) { res2[, i] = myend+i-1 res2[,40+i] = myend-i res2[,80+i] = mysta+i-1 res2[,120+i] = mysta-i } res = rbind(res1, res2) minv = apply(res, 1, min) maxv = apply(res, 1, max) tag = minv>=1 & maxv<=chr.len[k] for(i in 1:160) res[,i] = paste("chr", chr.nam[k], "_", as.integer(res[,i]), sep="") tmp = rbind(subdata1, subdata2) res = cbind(tmp[,c(1,2,6)], res) res = res[tag==1, ] write.table(res, myoutf1, sep="\t", row.names=F, col.names=F, quote=F, append=T) } ################################################################### ## prepare the predictor matrix for each of the 160 bins #%% create 160 files, nameds as feature_bin_k.mat, where k=1, 2, ..., 160 #%% In each file, the signals of all chromatin features for all genes in this bin are provided #%% each row is a gene #%% each column is a feature rm(list=ls()) myinf1 = "/home1/yourname/yourfiles/ws180_transcript_bin_no.txt" myinf2 = "/home1/yourname/yourfiles/predictor/predictor.mat" # contains the bin data for all chromatin features, each row is a bin; each column is a feature; the 1st column is the bin IDs myinf3 = "/home1/yourname/yourfiles/predictor.header" # contains the names of all chrmatin features tar.Dir = "/home1/yourname/yourfiles/160bin/" binfo = read.table(myinf1, sep="\t", header=F, row.names=2, colClasses=rep("character", 162)) data = read.table(myinf2, sep="", row.names=1, header=T) conIn = file(myinf3, "r") myname = readLines(conIn, -1) close(conIn) myname = unlist(strsplit(myname, "\t")) for(i in 1:160) { cat("\r", i) bin.list = binfo[, 2+i] subdata = data[row.names(data)%in%bin.list,] res = subdata[bin.list,] tmp = paste(row.names(binfo), binfo[,2], bin.list, sep="$") row.names(res) = tmp myoutf = paste(tar.Dir, "feature_bin_", i,".mat", sep="" ) write.table(res, myoutf, sep="\t", row.names=T, quote=F) } ################################################################### ## the SVM model, draw ROC curve # classify highly and lowly expressed genes #%% need install package "e1071" rm(list=ls()) myinf1 = "/home1/yourname/yourfiles/Input_data_2SVM_bin43.txt" # the input file to SVM, can be download from our webpage data = read.table(myinf1, sep="\t", row.names=1, quote="", header=T) data = data[!is.na(data$yval),] yval = data[, 1] mymed = median(yval) tag = (yval<=mymed) yval[tag==1] = "L" # lowly expressed genes yval[tag==0] = "H" # highly expressed genes data[,1] = yval library(e1071) pos = data[data[,1]=="H", ] neg = data[data[,1]=="L", ] pos.data = pos[sample(1:nrow(pos), 800), ] neg.data = neg[sample(1:nrow(neg), 800), ] se1 = sample(1:800, 400) se2 = sample(1:800, 400) tr = rbind(pos.data[se1, ], neg.data[se2,]) # training data te = rbind(pos.data[-se1, ], neg.data[-se2,]) # testing data mm = svm(tr[,-1], tr[,1], type="C", probability=T, cross=5) pre = predict(mm, te[,-1], probability=T) res = attr(pre, "probabilities") thr = (1:99)*0.01 yy = xx = rep(0, length(thr)) for(i in 1:length(thr)) { aa = sum(res[,1]>=thr[i] & te[,1]=="H") bb = sum(res[,1]=thr[i] & te[,1]=="L") dd = sum(res[,1]0,] data[,1] = log2(data[,1]) library(e1071) se = sample(1:nrow(data), 2400) se1 = se[1:400] se2 = se[401:2400] tr = data[se1,] te = data[se2,] mm = svm(tr[,-1], tr[,1], probability=T, cross=5) pre= predict(mm,te[,-1]) mycorr = round(cor(pre, te[,1]),3) # correlation between predicted values and experimental values plot(te[,1], pre, pch=20, col="blue", xlab="", ylab="", main="", cex=0.9) mylm = lm(pre~te[,1]) abline(mylm, lwd=2) text(min(te[,1]), max(pre), paste("R=", mycorr, sep="")) mycorr ###################################################################