# wigfilename: the name of the wiggle file
# annotationfilename: the names of annotation file hg18
# width: the window size upstream
## smooth: Logic value, if smooth=T perform smoothing for the TF binding profile;  

wig2weight<-function(wigfilename, annotationfilename, width=10000, smooth=T, outfilename)
{
	chr.len = c(247249719,242951149,199501827,191273063,180857866,170899992,158821424,
					146274826,140273252,135374737,134452384,132349534,114142980,106368585,
					100338915,88827254,78774742,76117153,63811651,62435964,46944323,49691432,
					154913754,57772954,16571)
	chr.nam = paste("chr", c(1:22, "X", "Y", "M"), sep="")
	chr.len = chr.len[1:(length(chr.len)-2)]
	chr.nam = chr.nam[1:(length(chr.nam)-2)]

	# read in gene info
	mygene = read.table(annotationfilename, sep="\t", header=T, colClasses=c(rep("character", 3), rep("numeric", 5)))
	tag = substr(mygene[,1], 1, 3)
	mygene = mygene[tag=="NM_", 1:5]
	colnames(mygene) = c("name", "chr", "str", "sta", "end")
	tmp = mygene$sta
	tmp1 = tmp-width
	tmp2 = tmp+width
	tag = mygene[,3]=="-"
	tmp[tag==1] = mygene[tag==1,5]
	tmp1[tag==1] = tmp[tag==1]+width 
	tmp2[tag==1] = tmp[tag==1]-width 
	mygene$sta = tmp1
	mygene$end = tmp2
	for(k in 1:length(chr.nam))
	{
	   tag = mygene$chr==chr.nam[k]
	   tmp = mygene$sta[tag==1]
	   tmp[tmp<1] = 1
	   tmp[tmp>chr.len[k]] = chr.len[k]	
	   mygene$sta[tag==1] = tmp	
	   tmp = mygene$end[tag==1]
	   tmp[tmp<1] = 1
	   tmp[tmp>chr.len[k]] = chr.len[k]	
	   mygene$end[tag==1] = tmp	
	}
	
	# read in wiggle file
	conIn = file(wigfilename, "r")
	data = readLines(conIn, -1)
	close(conIn)
	
	###
	pos = grep("fixedStep", data)
	info = data[pos]
	pos = c(pos, length(data)+1)
	info = unlist(strsplit(info, " "))
	cnum = length(info)/4
	mychr = info[(1:cnum)*4-2]
	mysta = info[(1:cnum)*4-1]
	mychr = gsub("chrom=", "", mychr)
	mysta = as.numeric(gsub("start=", "", mysta))
	pos.sta = pos[1:(length(pos)-1)]+1
	pos.end = pos[2:length(pos)]-1
	sig.tk = data.frame(mychr, mysta, pos.sta, pos.end)

	## create weight file
	myw = rep(0, width*2+1)
	for(k in 1:length(chr.nam))
	{
	   	cat("\r", chr.nam[k])
	   	# signal vector
	   	read.cov = rep(0, chr.len[k])
	   	tag = sig.tk[,1]==chr.nam[k]
	   	if(sum(tag)==0) next
	   	mysig = sig.tk[sig.tk[,1]==chr.nam[k], ]
 	  	for(i in 1:nrow(mysig))
		{
			tmp1 = mysig[i,3]   # start line
			tmp2 = mysig[i,4]   # end line
			tmp.sta = mysig[i,2]  # start position in chr
			read.cov[(tmp.sta+1):(tmp.sta+tmp2-tmp1+1)] = as.numeric(data[tmp1:tmp2])
		}
   	# refseq
   		curgene = mygene[mygene[,2]==chr.nam[k],]
   		for(i in 1:nrow(curgene))
   		{
   	  	 	myw = myw+read.cov[curgene[i,4]:curgene[i,5]]   	      	   
   		}
 	}
	myw = myw/nrow(mygene)
	tmp = myw
	if(smooth==T)
	{
		for(i in 1:length(myw))
		{
		    myw[i] = mean(tmp[max(i-250,1):min(i+250, length(myw))])	
		}
	}
	myw = myw/sum(myw)
	write.table(myw, outfilename, sep="\t", row.names=F, col.names=F, quote=F)
}

#####

peak2weight<-function(peakfilename, annotationfilename, width=10000, smooth=T, outfilename)
{
	chr.len = c(247249719,242951149,199501827,191273063,180857866,170899992,158821424,
					146274826,140273252,135374737,134452384,132349534,114142980,106368585,
					100338915,88827254,78774742,76117153,63811651,62435964,46944323,49691432,
					154913754,57772954,16571)
	chr.nam = paste("chr", c(1:22, "X", "Y", "M"), sep="")
	chr.len = chr.len[1:(length(chr.len)-2)]
	chr.nam = chr.nam[1:(length(chr.nam)-2)]

	# read in gene info
	mygene = read.table(annotationfilename, sep="\t", header=T, colClasses=c(rep("character", 3), rep("numeric", 5)))
	tag = substr(mygene[,1], 1, 3)
	mygene = mygene[tag=="NM_", 1:5]
	colnames(mygene) = c("name", "chr", "str", "sta", "end")
	tmp = mygene$sta
	tmp1 = tmp-width
	tmp2 = tmp+width
	tag = mygene[,3]=="-"
	tmp[tag==1] = mygene[tag==1,5]
	tmp1[tag==1] = tmp[tag==1]+width 
	tmp2[tag==1] = tmp[tag==1]-width 
	mygene$sta = tmp1
	mygene$end = tmp2
	for(k in 1:length(chr.nam))
	{
	   tag = mygene$chr==chr.nam[k]
	   tmp = mygene$sta[tag==1]
	   tmp[tmp<1] = 1
	   tmp[tmp>chr.len[k]] = chr.len[k]	
	   mygene$sta[tag==1] = tmp	
	   tmp = mygene$end[tag==1]
	   tmp[tmp<1] = 1
	   tmp[tmp>chr.len[k]] = chr.len[k]	
	   mygene$end[tag==1] = tmp	
	}
	
	# read in peak file
	data = read.table(peakfilename, sep="\t", header=T)
	data$chr = paste("chr", data$chr, sep="")
	
	## create weight file
	myw = rep(0, width*2+1)
	for(k in 1:length(chr.nam))
	{
	   	cat("\r", chr.nam[k])
	   	# signal vector
	   	read.cov = rep(0, chr.len[k])
	   	tag = data$chr==chr.nam[k]
	   	if(sum(tag)==0) next
	   	mysig = data[tag==1,]
 	  	for(i in 1:nrow(mysig))
		{
			tmp1 = as.numeric(mysig$start[i])+1   # start line
			tmp2 = as.numeric(mysig$end[i])   # end line
			read.cov[tmp1:tmp2] = read.cov[tmp1:tmp2] + as.numeric(mysig$height[i])
		}
   	# refseq
   		curgene = mygene[mygene[,2]==chr.nam[k],]
   		for(i in 1:nrow(curgene))
   		{
   	  	 	myw = myw+read.cov[curgene[i,4]:curgene[i,5]]   	      	   
   		}
 	}
	myw = myw/nrow(mygene)
	tmp = myw
	if(smooth==T)
	{
		for(i in 1:length(myw))
		{
		    myw[i] = mean(tmp[max(i-250,1):min(i+250, length(myw))])	
		}
	}
	myw = myw/sum(myw)
	write.table(myw, outfilename, sep="\t", row.names=F, col.names=F, quote=F)
}
#####

calscoreonwig<-function(wigfilename, annotationfilename, weightfilename, outfilename)
{
	chr.len = c(247249719,242951149,199501827,191273063,180857866,170899992,158821424,
					146274826,140273252,135374737,134452384,132349534,114142980,106368585,
					100338915,88827254,78774742,76117153,63811651,62435964,46944323,49691432,
					154913754,57772954,16571)
	chr.nam = paste("chr", c(1:22, "X", "Y", "M"), sep="")
	chr.len = chr.len[1:(length(chr.len)-2)]
	chr.nam = chr.nam[1:(length(chr.nam)-2)]

	# read in weight
	conIn = file(weightfilename, "r")
	myw = readLines(conIn)
	close(conIn)
	myw = as.numeric(myw)
	width = (length(myw)-1)/2
	
	# read in gene info
	mygene = read.table(annotationfilename, sep="\t", header=T, colClasses=c(rep("character", 3), rep("numeric", 5)))
	tag = substr(mygene[,1], 1, 3)
	mygene = mygene[tag=="NM_", 1:5]
	colnames(mygene) = c("name", "chr", "str", "sta", "end")
	
	# read in wiggle file
	conIn = file(wigfilename, "r")
	data = readLines(conIn, -1)
	close(conIn)
	
	###
	pos = grep("fixedStep", data)
	info = data[pos]
	pos = c(pos, length(data)+1)
	info = unlist(strsplit(info, " "))
	cnum = length(info)/4
	mychr = info[(1:cnum)*4-2]
	mysta = info[(1:cnum)*4-1]
	mychr = gsub("chrom=", "", mychr)
	mysta = as.numeric(gsub("start=", "", mysta))
	pos.sta = pos[1:(length(pos)-1)]+1
	pos.end = pos[2:length(pos)]-1
	sig.tk = data.frame(mychr, mysta, pos.sta, pos.end)

	## calculate score
	mysco = rep(0, nrow(mygene))	#TSS
	gname = rep("",nrow(mygene))
	count = 0
	for(k in 1:length(chr.nam))
	{
	   	cat("\r", chr.nam[k])
	   	# signal vector
	   	read.cov = rep(0, chr.len[k])
	   	tag = sig.tk[,1]==chr.nam[k]
	   	if(sum(tag)==0) next
	   	mysig = sig.tk[sig.tk[,1]==chr.nam[k], ]
 	  	for(i in 1:nrow(mysig))
		{
			tmp1 = mysig[i,3]   # start line
			tmp2 = mysig[i,4]   # end line
			tmp.sta = mysig[i,2]  # start position in chr
			read.cov[(tmp.sta+1):(tmp.sta+tmp2-tmp1+1)] = as.numeric(data[tmp1:tmp2])
		}
   		curgene = mygene[mygene[,2]==chr.nam[k],]
	   	for(i in 1:nrow(curgene))
	   	{
	   	   count = count + 1
	   	   if(curgene$str[i]=="+")
	   	   {
	   	   		tmp= read.cov[max(1,curgene$sta[i]-width):min(curgene$sta[i]+width, chr.len[k])]
	   	   		b1 = 1-(curgene$sta[i]-width)
	   	   		b2 = curgene$sta[i]+width-chr.len[k]
	   	   		if(b1>0)
	   	   		{
	   	   		   tmp=c(rep(0, b1), tmp)	
	   	   		}
	   	   		if(b2>0)
	   	   		{
	   	   		   tmp=c(tmp, rep(0, b2))	
	   	   		}
	   	   		mysco[count] = sum(tmp*myw)
	   	   }
	   	   if(curgene$str[i]=="-")
	   	   {
	   	   		tmp= read.cov[min(chr.len[k],curgene$end[i]+width):max(curgene$end[i]-width, 1)]
	   	   		b1 = curgene$end[i]+width-chr.len[k]
	   	   		b2 = 1-(curgene$end[i]-width)
	   	   		if(b1>0)
	   	   		{
	   	   		   tmp=c(rep(0, b1), tmp)	
	   	   		}
	   	   		if(b2>0)
	   	   		{
	   	   		   tmp=c(tmp, rep(0, b2))	
	   	   		}
	   	   		mysco[count] = sum(tmp*myw)
	   	   }
	   	   gname[count] = curgene[i,1]   	      	   
	   	}
	}
	zscore = (mysco-mean(mysco))/sd(mysco)	#Z-score based onTSS
	pvalue = pnorm(-zscore)
    res = cbind(gname, mysco, zscore, pvalue)
	colnames(res) = c("name", "raw.score", "zscore", "p.value")
	res = res[order(res[,1], decreasing=T), ]
	write.table(res, outfilename, sep="\t", row.names=F, quote=F)
}


calscoreonpeak<-function(peakfilename, annotationfilename, weightfilename, outfilename)
{
	chr.len = c(247249719,242951149,199501827,191273063,180857866,170899992,158821424,
					146274826,140273252,135374737,134452384,132349534,114142980,106368585,
					100338915,88827254,78774742,76117153,63811651,62435964,46944323,49691432,
					154913754,57772954,16571)
	chr.nam = paste("chr", c(1:22, "X", "Y", "M"), sep="")
	chr.len = chr.len[1:(length(chr.len)-2)]
	chr.nam = chr.nam[1:(length(chr.nam)-2)]

	# read in weight
	conIn = file(weightfilename, "r")
	myw = readLines(conIn)
	close(conIn)
	myw = as.numeric(myw)
	width = (length(myw)-1)/2
	
	# read in gene info
	mygene = read.table(annotationfilename, sep="\t", header=T, colClasses=c(rep("character", 3), rep("numeric", 5)))
	tag = substr(mygene[,1], 1, 3)
	mygene = mygene[tag=="NM_", 1:5]
	colnames(mygene) = c("name", "chr", "str", "sta", "end")
	
	# read in peak file
	data = read.table(peakfilename, sep="\t", header=T)
	data$chr = paste("chr", data$chr, sep="")
	
	## calculate score
	mysco = rep(0, nrow(mygene))	#TSS
	gname = rep("",nrow(mygene))
	count = 0
	for(k in 1:length(chr.nam))
	{
	   	cat("\r", chr.nam[k])
	   	# signal vector
	   	read.cov = rep(0, chr.len[k])
	   	tag = data$chr==chr.nam[k]
	   	if(sum(tag)==0) next
	   	mysig = data[tag==1,]
 	  	for(i in 1:nrow(mysig))
		{
			tmp1 = as.numeric(mysig$start[i])+1   # start line
			tmp2 = as.numeric(mysig$end[i])   # end line
			read.cov[tmp1:tmp2] = read.cov[tmp1:tmp2] + as.numeric(data$height[i])
		}
   	# refseq
   		curgene = mygene[mygene[,2]==chr.nam[k],]
	   	for(i in 1:nrow(curgene))
	   	{
	   	   count = count + 1
	   	   if(curgene$str[i]=="+")
	   	   {
	   	   		tmp= read.cov[max(1,curgene$sta[i]-width):min(curgene$sta[i]+width, chr.len[k])]
	   	   		b1 = 1-(curgene$sta[i]-width)
	   	   		b2 = curgene$sta[i]+width-chr.len[k]
	   	   		if(b1>0)
	   	   		{
	   	   		   tmp=c(rep(0, b1), tmp)	
	   	   		}
	   	   		if(b2>0)
	   	   		{
	   	   		   tmp=c(tmp, rep(0, b2))	
	   	   		}
	   	   		mysco[count] = sum(tmp*myw)
	   	   }
	   	   if(curgene$str[i]=="-")
	   	   {
	   	   		tmp= read.cov[min(chr.len[k],curgene$end[i]+width):max(curgene$end[i]-width, 1)]
	   	   		b1 = curgene$end[i]+width-chr.len[k]
	   	   		b2 = 1-(curgene$end[i]-width)
	   	   		if(b1>0)
	   	   		{
	   	   		   tmp=c(rep(0, b1), tmp)	
	   	   		}
	   	   		if(b2>0)
	   	   		{
	   	   		   tmp=c(tmp, rep(0, b2))	
	   	   		}
	   	   		mysco[count] = sum(tmp*myw)
	   	   }
	   	   gname[count] = curgene[i,1]   	      	   
	   	}
	}
	zscore = (mysco-mean(mysco))/sd(mysco)	#Z-score based onTSS
	pvalue = pnorm(-zscore)
    res = cbind(gname, mysco, zscore, pvalue)
	colnames(res) = c("name", "raw.score", "zscore", "p.value")
	res = res[order(res[,1], decreasing=T), ]
	write.table(res, outfilename, sep="\t", row.names=F, quote=F)
}