#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%% Convert wiggle file into bin file; 
#%% Divide the genome into small bin of 100-bp in size; For each bin, calculate the average signal of a chromatin feature
#%% Each bin is assigened an ID like chrI_x, (the bin corresponding to the one from (x-1)*100 to x*100-1 on chrI 
######################
rm(list=ls())
chr.len = c(15072471,15279373,13783731,17493835,20919618,17718904,13844)	# length of each worm chromosome
chr.nam = paste("chr", c("I","II","III","IV","V", "X", "M"), sep="")	# name of each worm chromosome
chr.len = chr.len[1:6]
chr.nam = chr.nam[1:6]

wig.Dir = "/home1/yourname/yourwigfiles/"			# where you put your wiggle files 
tar.Dir = "/home1/yourname/yourwigfiles/bin/"		# where you want to put the bin files
setwd(wig.Dir)  ## setup your working directory
myinf = dir()
tag = substr(myinf, nchar(myinf)-3, nchar(myinf))
myinf = myinf[tag==".wig"]
length(myinf)
for(k in 1:length(myinf))
{
  cat("\r", k)
  setwd(curDir)  ## @@ [1]
  tmpDir = substr(myinf[k], 1, nchar(myinf[k])-4)
  comm = paste("wc -l ", myinf[k], sep="")
  totLin = system(comm, intern=T )
  totLin = as.numeric(unlist(strsplit(totLin, " "))[1])
  conIn = file(myinf[k], "r")
  comm = paste("mkdir ", tmpDir, sep="")
  system(comm)
  comm = paste("grep -n chrom= ", myinf[k], ">./", tmpDir,"/info.txt", sep="" )
  system(comm)
  tmpDir = paste(curDir,tmpDir, sep="")
  #---
  setwd(tmpDir)
  info = read.table("info.txt", sep="")
  pos = unlist(strsplit(as.character(info[,1]), ":"))
  cnum = length(pos)/2
  pos = as.numeric(pos[(1:cnum)*2-1])	# start Line Num of each chromosome
  pos = c(pos, totLin+1)
  for(i in 1:(length(pos)-1))
  { pos[i] = pos[i+1]-pos[i]-1 }
  chr = as.character(info[,2])
  chr = substr(chr,7, nchar(chr))
  chr = paste("chr", chr, sep="")
  #---
  curLine = readLines(conIn, 1)  # skip the 1st line: track ...
  for(i in 1:length(chr))
  {
    tmpFile = paste(chr[i], ".wig", sep="")
    curLine = readLines(conIn, 1)
    conOut = file(tmpFile, "w")
    curLine = readLines(conIn, pos[i])
    writeLines(curLine, conOut)
    close(conOut)    	
  }#create data for each chromosome
  close(conIn)
  #---
  tmpFile = substr(myinf[k], 1, nchar(myinf[k])-4)
  tmpFile = paste(tar.Dir, tmpFile, ".bin", sep="") ##@@ [3]
  conOut = file(tmpFile, "w")
  cat("\n")
  for(i in 1:length(chr.nam))
  {
     cat("\r", i)
     tmpFile = paste(chr.nam[i], ".wig", sep="")
     conIn = file(tmpFile, "r")
     sig = readLines(conIn, -1)
     close(conIn)
     #---
     sig = matrix(unlist(strsplit(sig, "\t")), ncol=2, byrow=T)
     tmp1 = c(0,0)
     tmp2 = c(chr.len[i], 0)
     sig = rbind(tmp1, sig, tmp2)
     res = rep(0, ceiling(chr.len[i]/100))
     for(j in 1:(nrow(sig)-1))
     {
       mysta = as.numeric(sig[j,1])
       myend = as.numeric(sig[j+1,1])-1
       if(mysta>chr.len[i]) break
       if(myend>chr.len[i]) 
         myend = chr.len[i]
       myval = as.numeric(sig[j,2])
       idx1 = floor(mysta/100) + 1
       idx2 = floor(myend/100) + 1
       	if(idx1==idx2)
       	{
       	  res[idx1] = (myend-mysta+1)*myval*0.01 + res[idx1]	
       	}
       	if(idx2-idx1==1)
       	{
       	  res[idx1] = (100*idx1-mysta)*myval*0.01 + res[idx1]
       	  res[idx2] = (myend-100*idx1+1)*myval*0.01 + res[idx2]       	  	
       	}
       	if(idx2-idx1>1)
       	{
       	  res[idx1] = (100*idx1-mysta)*myval*0.01 + res[idx1]
       	  res[(idx1+1):(idx2-1)] = myval + res[(idx1+1):(idx2-1)]
       	  res[idx2] = (myend-100*(idx2-1)+1)*myval*0.01 + res[idx2]       	  	
       	}
     } ## end of the j
     res = res[1:(length(res)-1)] # discard the last bin
     tmp = 1:floor(chr.len[i]/100)
     tmp = paste(chr.nam[i], tmp, sep="_")
     curLine = paste(tmp, res, sep="\t")
     writeLines(curLine, conOut)     		 
  } # end of chrom i
  close(conOut)
  #---
  setwd(curDir)
  comm = paste("rm -r ", tmpDir, sep="")
  system(comm)  
}
EOF

###################################################################
#%% combined the bin files into a single file: predictor matrix for all bins

rm(list=ls())
myoutf = "/home1/yourname/yourwigfiles/bin/predictor.mat"	## set the file name here for the predictor matrix
tmpf1 = "/home1/yourname/yourwigfiles/bin/tmpf1"			## temporary file 1
tmpf2 = "//home1/yourname/yourwigfiles/bin/tmpf2"			## ## temporary file 1

myDir = "/home1/yourname/yourwigfiles/bin/"					## the directory that contains the bin files
setwd(myDir)
myinf =  dir(myDir)
comm = paste("cp ", myinf[1], " ", myoutf, sep="")
system(comm)
for(k in 2:length(myinf))
{
   cat("\r", k)
   comm = paste("cut -f 2 ", myinf[k], ">", tmpf1, sep="")
   system(comm)
   comm = paste("paste ", myoutf, " ", tmpf1, " >", tmpf2, sep="")
   system(comm)
   comm = paste("rm -f ", myoutf, sep="")
   system(comm)
   comm = paste("mv ", tmpf2, " ", myoutf, sep="")
   system(comm) 
}
comm = paste("rm -f tmpf*")
system(comm)	# delete temporary files

###################################################################
## Prepare the bin IDs for each gene
## For each gene gene, create 160 bins
## 40 bins are from the TSS to upstream 4kb: Bin 1, 2, ..., 40
## 40 bins are from the TSS to downstream 4kb: Bin 41, 42, ..., 80
## 40 bins are from the TTS to upstream 4kb: Bin 81, 82, ..., 120
## 40 bins are from the TSS to downstream 4kb: Bin 121, 122, ..., 160
##
rm(list=ls())
setwd("/home1/yourname/yourfiles/")
myinf1= "/home1/yourname/yourfiles/ws180_transcript.annotation"		## the annotation of all worm genes, can be download from webpage
myoutf1 = "/home1/yourname/yourfiles/ws180_transcript_bin_no.txt"	## the file in which you want to save the bin names 

chr.len = c(15072471,15279373,13783731,17493835,20919618,17718904,13844)
chr.nam = c("I","II","III","IV","V", "X", "M")
chr.len = chr.len[1:6]
chr.nam = chr.nam[1:6]
chr.len = floor(chr.len/100)-1

data =read.table(myinf1, sep="\t", header=F, colClasses = c(rep("character", 3), rep("numeric",2), "character"))

tmp = sort(data[,2])	# unique name only
tmp = rle(tmp)
tmp = tmp$values[tmp$lengths==1]
data = data[data[,2]%in%tmp, ]

comm = paste("rm -f ", myoutf1, sep="")
system(comm)
for(k in 1:length(chr.nam))
{
  cat("\r", k)
  subdata1 = data[data[,1] == chr.nam[k] & data[,6]=="+", ]
  res1 = matrix(rep(0, nrow(subdata1)*160), ncol=160)
  mysta = floor(subdata1[,4]/100)+1
  myend = floor(subdata1[,5]/100)+1
  for(i in 1:40)
  {
  	res1[, i] = mysta-i+1
  	res1[,40+i] = mysta+i
  	res1[,80+i] = myend-i+1
  	res1[,120+i] = myend+i  	
  }
  subdata2 = data[data[,1] == chr.nam[k] & data[,6]=="-", ]
  res2 = matrix(rep(0, nrow(subdata2)*160), ncol=160)
  mysta = floor(subdata2[,4]/100)+1
  myend = floor(subdata2[,5]/100)+1
  for(i in 1:40)
  {
  	res2[, i] = myend+i-1
  	res2[,40+i] = myend-i
  	res2[,80+i] = mysta+i-1
  	res2[,120+i] = mysta-i
  }
  res = rbind(res1, res2)
  minv = apply(res, 1, min)
  maxv = apply(res, 1, max)
  tag = minv>=1 & maxv<=chr.len[k]
  for(i in 1:160)
    res[,i] = paste("chr", chr.nam[k], "_",  as.integer(res[,i]), sep="")
  tmp = rbind(subdata1, subdata2)
  res = cbind(tmp[,c(1,2,6)], res)
  res = res[tag==1, ]
  write.table(res, myoutf1, sep="\t", row.names=F, col.names=F, quote=F, append=T)	
}

###################################################################
## prepare the predictor matrix for each of the 160 bins
#%% create 160 files, nameds as feature_bin_k.mat, where k=1, 2, ..., 160
#%% In each file, the signals of all chromatin features for all genes in this bin are provided
#%% each row is a gene
#%% each column is a feature

rm(list=ls())
myinf1 = "/home1/yourname/yourfiles/ws180_transcript_bin_no.txt"
myinf2 = "/home1/yourname/yourfiles/predictor/predictor.mat"	# contains the bin data for all chromatin features, each row is a bin; each column is a feature; the 1st column is the bin IDs
myinf3 = "/home1/yourname/yourfiles/predictor.header"			# contains the names of all chrmatin features
tar.Dir = "/home1/yourname/yourfiles/160bin/"

binfo = read.table(myinf1, sep="\t", header=F, row.names=2, colClasses=rep("character", 162))
data = read.table(myinf2, sep="", row.names=1, header=T)
conIn = file(myinf3, "r")
myname = readLines(conIn, -1)
close(conIn)
myname = unlist(strsplit(myname, "\t"))

for(i in 1:160)
{
   cat("\r", i)
   bin.list = binfo[,  2+i]
   subdata = data[row.names(data)%in%bin.list,]
   res = subdata[bin.list,]
   tmp = paste(row.names(binfo), binfo[,2], bin.list, sep="$")
   row.names(res) = tmp
   myoutf = paste(tar.Dir, "feature_bin_", i,".mat", sep="" )
   write.table(res, myoutf, sep="\t", row.names=T, quote=F)  
} 

###################################################################
## the SVM model, draw ROC curve
# classify highly and lowly expressed genes
#%% need install package "e1071"

rm(list=ls())
myinf1 = "/home1/yourname/yourfiles/Input_data_2SVM_bin43.txt"		# the input file to SVM, can be download from our webpage

data = read.table(myinf1, sep="\t", row.names=1, quote="", header=T)
data = data[!is.na(data$yval),]

   yval = data[, 1]
   mymed = median(yval)
   tag = (yval<=mymed)
   yval[tag==1] = "L"		# lowly expressed genes
   yval[tag==0] = "H"		# highly expressed genes
   data[,1] = yval
 
   library(e1071)
   pos = data[data[,1]=="H", ]
   neg = data[data[,1]=="L", ]
   pos.data = pos[sample(1:nrow(pos), 800), ]
   neg.data = neg[sample(1:nrow(neg), 800), ]
   se1 = sample(1:800, 400)
   se2 = sample(1:800, 400)
   tr = rbind(pos.data[se1, ], neg.data[se2,])		# training data
   te = rbind(pos.data[-se1, ], neg.data[-se2,])	# testing data
   mm = svm(tr[,-1], tr[,1], type="C", probability=T, cross=5)
   pre = predict(mm, te[,-1], probability=T)
   res = attr(pre, "probabilities")
   thr = (1:99)*0.01
   yy =  xx =  rep(0, length(thr))
   for(i in 1:length(thr))
   {
    aa = sum(res[,1]>=thr[i] & te[,1]=="H")
    bb = sum(res[,1]<thr[i] & te[,1]=="H")
    cc = sum(res[,1]>=thr[i] & te[,1]=="L")
    dd = sum(res[,1]<thr[i] & te[,1]=="L")
    yy[i] = aa/(aa+bb)
    xx[i] = cc/(cc+dd)
   }
   xx = c(1, xx, 0)
   yy = c(1, yy, 0)
   tmp1 = tmp2 = rep(0,100)
   for(i in 1:100)
   {
      tmp1[i] = xx[i]-xx[i+1]
      tmp2[i] = (yy[i+1]+yy[i])/2	
   }
   myauc = round(sum(tmp1*tmp2),3)   # AUC score
   plot(xx, yy, ylab="Sensitivity", xlab="1-Specificity", col= "red", type="l", lwd=2, xlim=c(0,1), ylim=c(0,1))
   text(0.5, 0.5, paste("AUC=", myauc, sep=""))
   myauc


###################################################################
## the SVR regression model, draw scatter plot
#%% predict expression levels of genes
#%% plot experimental values against predicted values
#%% need to install R package "e1071"

rm(list=ls())
myinf1 = "/home1/yourname/yourfiles/Input_data_2SVM_bin43.txt"		# the input file to SVM, can be download from our webpage

data = read.table(myinf1, sep="\t", row.names=1, quote="", header=T)
data = data[!is.na(data$yval),]
data = data[data[,1]>0,]
data[,1] = log2(data[,1]) 
library(e1071)

se = sample(1:nrow(data), 2400)
se1 = se[1:400]
se2 = se[401:2400]
tr = data[se1,]
te = data[se2,]
mm = svm(tr[,-1], tr[,1], probability=T, cross=5)
pre= predict(mm,te[,-1])
mycorr = round(cor(pre, te[,1]),3)  	# correlation between predicted values and experimental values
plot(te[,1], pre, pch=20, col="blue", xlab="", ylab="", main="", cex=0.9)
mylm = lm(pre~te[,1])
abline(mylm, lwd=2)
text(min(te[,1]), max(pre), paste("R=", mycorr, sep=""))
mycorr
 
###################################################################