forked from speciationgenomics/scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvertVCFtoEigenstrat.sh
executable file
·133 lines (108 loc) · 4.64 KB
/
convertVCFtoEigenstrat.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/bin/bash
# Script to convert vcf to eigenstrat format for ADMIXTOOLS
# Written by Joana Meier
# It takes a single argument: the vcf file (can be gzipped) and
# optionally you can specify --renameScaff if you have scaffold names (not chr1, chr2...)
# The script assumes a constant recombination rate of 2 cM/Mb. You may have to change that or change the jackknife block size in ADMIXTOOLS.
# Here, you can change the recombination rate which is currently set to 2 cM/Mb
rec=2
# It requires vcftools and admixtools
# for some clusters, it is needed to load these modules:
# module load gcc/4.8.2 vcftools openblas/0.2.13_seq perl/5.18.4 admixtools
renameScaff="FALSE"
# If help is requested
if [[ $1 == "-h" ]]
then
echo "Please provide the vcf file to parse, and optionally add --renameScaff if you have scaffolds instead of chromosomes"
echo "Usage: convertVCFtoEigenstrat.sh <vcf file> --renameScaff --rec 2 (note, the second argument is optional)"
exit 1
# If the second argument renameScaff is given, set it to True
elif [[ $2 == "--renameScaff" ]]
then
renameScaff="TRUE"
# If no argument is given or the second one is not -removeChr, give information and quit
elif [ $# -ne 1 ]
then
echo "Please provide the vcf file to parse, and optionally add --renameScaff if you have scaffolds instead of chromosomes"
echo "Usage: ./convertVCFtoEigenstrat.sh <vcf file> --renameScaff (note, the second argument is optional)"
exit 1
fi
# Set the first argument to the file name
file=$1
file=${file%.gz}
file=${file%.vcf}
# if the vcf file is gzipped:
if [ -s $file.vcf.gz ]
then
# If renaming of scaffolds is requested, set all chromosome/scaffold names to 1
if [ $renameScaff == "TRUE" ]
then
echo "setting scaffold names to 1 and positions to additive numbers"
zcat $file".vcf.gz" | awk 'BEGIN {OFS = "\t";add=0;lastPos=0;scaff=""}{
if($1!~/^#/){
if($1!=scaff){add=lastPos;scaff=$1}
$1="1"
$2=$2+add
lastPos=$2
}
print $0}' | gzip > $file.renamedScaff.vcf.gz
# Get a .map and .ped file (remove multiallelic SNPs, monomorphic sites and indels)
vcftools --gzvcf $file".renamedScaff.vcf.gz" \
--plink --mac 1.0 --remove-indels --max-alleles 2 --out $file
else
# Get a .map and .ped file (remove multiallelic SNPs, monomorphic sites and indels)
vcftools --gzvcf $file".vcf.gz" \
--plink --mac 1.0 --remove-indels --max-alleles 2 --out $file
fi
# if the file is not gzipped
else
# If renaming of scaffolds is requested, set all chromosome/scaffold names to 1
if [ $renameScaff == "TRUE" ]
then
echo "setting scaffold names to 1 and positions to additive numbers"
awk 'BEGIN {OFS = "\t";add=0;lastPos=0;scaff=""}{
if($1!~/^#/){
if($1!=scaff){add=lastPos;scaff=$1}
$1="1"
$2=$2+add
lastPos=$2
}
print $0}' $file.vcf | gzip > $file.renamedScaff.vcf.gz
# Get a .map and .ped file (remove multiallelic SNPs, monomorphic sites and indels)
vcftools --gzvcf $file".renamedScaff.vcf.gz" \
--plink --mac 1.0 --remove-indels --max-alleles 2 --out $file
else
vcftools --vcf $file".vcf" --plink --mac 1.0 --remove-indels --max-alleles 2 --out $file
fi
fi
# Change the .map file to match the requirements of ADMIXTOOLS by adding fake Morgan positions (assuming a recombination rate of 2 cM/Mbp)
awk -F"\t" -v rec=$rec 'BEGIN{scaff="";add=0}{
split($2,newScaff,":")
if(!match(newScaff[1],scaff)){
scaff=newScaff[1]
add=lastPos
}
pos=add+$4
count+=0.00000001*rec*(pos-lastPos)
print newScaff[1]"\t"$2"\t"count"\t"pos
lastPos=pos
}' ${file}.map | sed 's/^chr//' > better.map
mv better.map ${file}.map
# Change the .ped file to match the ADMIXTOOLS requirements
awk 'BEGIN{ind=1}{printf ind"\t"$2"\t0\t0\t0\t1\t";
for(i=7;i<=NF;++i) printf $i"\t";ind++;printf "\n"}' ${file}.ped > tmp.ped
mv tmp.ped ${file}.ped
# create an inputfile for convertf
echo "genotypename: ${file}.ped" > par.PED.EIGENSTRAT.${file}
echo "snpname: ${file}.map" >> par.PED.EIGENSTRAT.${file}
echo "indivname: ${file}.ped" >> par.PED.EIGENSTRAT.${file}
echo "outputformat: EIGENSTRAT" >> par.PED.EIGENSTRAT.${file}
echo "genotypeoutname: ${file}.eigenstratgeno" >> par.PED.EIGENSTRAT.${file}
echo "snpoutname: ${file}.snp" >> par.PED.EIGENSTRAT.${file}
echo "indivoutname: ${file}.ind" >> par.PED.EIGENSTRAT.${file}
echo "familynames: NO" >> par.PED.EIGENSTRAT.${file}
# Use CONVERTF to parse PED to eigenstrat
convertf -p par.PED.EIGENSTRAT.${file}
# change the snp file for ADMIXTOOLS:
awk 'BEGIN{i=0}{i=i+1; print $1"\t"$2"\t"$3"\t"i"\t"$5"\t"$6}' $file.snp > $file.snp.tmp
mv $file.snp.tmp $file.snp