GitHub - CrickWu/code

Branches Tags

Name		Name	Last commit message	Last commit date
Latest commit History 144 Commits
Rscripts		Rscripts
htseq		htseq
motif_gff_hg19		motif_gff_hg19
motif_gff_mm9		motif_gff_mm9
mtl		mtl
README.txt		README.txt
assignEmpP.py		assignEmpP.py
avgBed.py		avgBed.py
bedToGff.py		bedToGff.py
bkgdGff.py		bkgdGff.py
centLuc.py		centLuc.py
checkNan.sh		checkNan.sh
countSeqLength.py		countSeqLength.py
excludeBed.py		excludeBed.py
excludeBedMid.py		excludeBedMid.py
fimoCountToMotifPval.py		fimoCountToMotifPval.py
format.py		format.py
formatAme.py		formatAme.py
formatEncodeBroadPeak.py		formatEncodeBroadPeak.py
formatFasta.py		formatFasta.py
formatHtseq.py		formatHtseq.py
formatHtseqPeakToGene.py		formatHtseqPeakToGene.py
formatLuc.py		formatLuc.py
formatMeme.py		formatMeme.py
formatPoisson.py		formatPoisson.py
getgrangehg19.py		getgrangehg19.py
getgrangemm9.py		getgrangemm9.py
gffToBed.py		gffToBed.py
gtf.py		gtf.py
hg19gff_wg_pp.py		hg19gff_wg_pp.py
hg19gffgname_loci_pp.py		hg19gffgname_loci_pp.py
hgmmConvert.py		hgmmConvert.py
htseqCent.py		htseqCent.py
htseqEmpP.py		htseqEmpP.py
htseqEmpiricalp.py		htseqEmpiricalp.py
htseqGnameLuc.py		htseqGnameLuc.py
htseqPoissoncdf.py		htseqPoissoncdf.py
intersectBedp.py		intersectBedp.py
jaspar.py		jaspar.py
macsToBed.py		macsToBed.py
makeGff.py		makeGff.py
makeGffFromBed.py		makeGffFromBed.py
makeJIMat.py		makeJIMat.py
makeJIMatClean.py		makeJIMatClean.py
makeMotifMat.py		makeMotifMat.py
makeMotifMatClean.py		makeMotifMatClean.py
makealloverlap.sh		makealloverlap.sh
mapGff.py		mapGff.py
mappability.py		mappability.py
median.py		median.py
mm9gffgname_multip.py		mm9gffgname_multip.py
modBedOverlap.py		modBedOverlap.py
modBedVal.py		modBedVal.py
modIntersectBedOverlap.py		modIntersectBedOverlap.py
motif2tfname.py		motif2tfname.py
motifMacsp.py		motifMacsp.py
motifZinbap.py		motifZinbap.py
motifs.py		motifs.py
nm.py		nm.py
nmchr.py		nmchr.py
normZinba.py		normZinba.py
normbed.py		normbed.py
poissonZscore.py		poissonZscore.py
randIv.py		randIv.py
randPWM.py		randPWM.py
runHuman.sh		runHuman.sh
tflist.py		tflist.py

Repository files navigation

## generate upstream 10kb files using twoBitToFa and Bedtools
1. Download hg19.2bit & mm9.2bit from /goldenPath/hg19/bigZips/ & /goldenPath/mm9/bigZips/

2. Download blatSrc.zip from users.soe.ucsc.edu/~kent/src/
	modify .bashrc
		'MACHTYPE=x86_64 
		export MACHTYPE 
		export PATH="$PATH":~/bin/$MACHTYPE'
	mkdir $MACHTYPE in lib	
	make
	
	Note: BLAT pre-compiled binaries for linux are 32-bit, so to compile BLAT on x86_64 Ubuntu, do the following:
		apt-get install build-essential
		remove the -Werror compiler flag that treats warnings as errors
			edit the /inc/common.mk
			HG_WARN_ERR = -DJK_WARN -Wall -Werror
			to
			HG_WARN_ERR = -DJK_WARN -Wall
		mkdir -p ~/bin/x86_64 
		export MACHTYPE=x86_64 (and add ~/bin/$MACHTYPE to $PATH)
		make
    
3. Convert 2bit to fasta file format 
	twoBitToFa mm9.2bit mm9.fa

4. Generate BED file using UCSC Tables to output upstream10kb.bed
	or use BSGenome to write out specific genomic regions in a bed format file

5. Use bedtools to generate upstream10kb.fa
	bedtools getfasta -name -s -fi mm9.fa -bed mm9upstream10kb.bed -fo upstream10kb.fa
	'-s forces strandedness: features on the minus strand are reverse-complemented'

## motif scan
1. Download pwms and TF_Info files by species from the Hughes database: https://cisbp2.ccbr.utoronto.ca/

2. Convert the pwms into uniprobe format matrices
	python motifs.py <path-to-the-motif-files>

3. Convert uniprobe format matrices into MEME-compatible format on the command line (the -bg background file is generated with fasta-get-markov, see step 4)
	uniprobe2meme -bg mm9upstream10kbbgfile ./motif_output/M*.txt > mm9motifs.meme
	note: add following commands in .bashrc
		'export PERL5LIB=/home/xc406/tools/meme/lib/perl:$PERL5LIB'	

4. Use fimo in MEME Suite to search for alignment
	fasta-get-markov < mm9upstream10kb.fa > mm9upstream10kbbgfile
	fimo --text --bgfile mm9upstream10kbbgfile --output-pthresh 1e-3 ./mm9motifs.meme ./mm9upstream10kb.fa > mm9fimoout_date.txt 2> mm9fimoerr_date.txt
	note: the default cutoff pval is 1e-4 without the --output-pthresh option
	***--psp

## formatting fimo outputs to gff files
1. format fimo_output_file into gff (time-consuming) with parallel python
	python mm9gff.py fimo_output_file

2. substitute/add Hugo_gene_names next to NM# (time-consuming) and take out overlaps 
	python overlap.py gff_file
	sort the gff_files
	python overlap_2.py gff_file
	--this is done with a shell script overlap.sh to first check whether the tf file is empty

3. filter by DHS reads (htseq-count_output_files)
	python filter htseq-count_output_file gff_file fimo_stderr_file
	note: unnecessary

3'. alternatively, use macs to perform peak calling with an arbitrary cutoff p val (say 1e-3) on downloaded bam files
	macs14 -t wgEncodeUwDnaseMelC57bl6MAdult8wksAlnRep1.bam -f BAM -g mm -p 1e-3 -n mel_dhs1
		note: this step can sometimes be substituted with the broadpeak/narrowpeak bed files from ENCODE
	bed files will be processed by 
		bedtools intersect -a gff_file -b macs_output_file -wa -wb > intersect_file
	process the intersect files with
		python bed_macs_two.py intersect_file

4. calculate aupr with combine.all.R
	
#####################################
#&# steps to process fimo outputs into gff files
gffgname_loci_pp-->rmvOverlap-->htseqCount-->Ppoisson/Pnb-->AUPR_PpoissonChIP