struct
module Location = struct
  (* Recursive description of where a file comes from.  This module only
     builds the description; nothing here downloads or unpacks anything.
     NOTE(review): the tags are presumably interpreted by a downloader
     defined elsewhere -- confirm their exact semantics there. *)
  type t = [
    | `Url of string
      (* a single address, kept as a plain string *)
    | `Vcf_concat of (string * t) list
      (* list of (string, location) pairs; the string looks like a file name
         at the call sites visible in this file -- TODO confirm *)
    | `Concat of t list
      (* list of locations *)
    | `Gunzip of t
      (* wraps another location *)
    | `Untar of t
      (* wraps another location *)
  ]

  (* [url address] is [`Url address]. *)
  let url address = `Url address

  (* [vcf_concat named_parts] is [`Vcf_concat named_parts]. *)
  let vcf_concat named_parts = `Vcf_concat named_parts

  (* [concat parts] is [`Concat parts]. *)
  let concat parts = `Concat parts

  (* [gunzip inner] is [`Gunzip inner]. *)
  let gunzip inner = `Gunzip inner

  (* [untar inner] is [`Untar inner]. *)
  let untar inner = `Untar inner
end
(* Specification of one reference genome.  Only [name] and [fasta] are
   mandatory; every other field is an [option].  Field order is kept as-is
   because it is observable through structural comparison. *)
type t = {
  name : string;
    (* identifier of the specification *)
  metadata : string option;
    (* optional free-form text *)
  fasta : Location.t;
    (* where the FASTA file comes from *)
  dbsnp : Location.t option;
    (* optional location of a dbSNP file *)
  cosmic : Location.t option;
    (* optional location of a COSMIC file *)
  exome_gtf : Location.t option;
    (* optional location of a GTF file *)
  cdna : Location.t option;
    (* optional location of a cDNA file *)
  major_contigs : string list option;
    (* optional list of strings, presumably contig names -- TODO confirm *)
}
(* [create ?metadata ~fasta ?dbsnp ?cosmic ?exome_gtf ?cdna ?major_contigs name]
   builds a specification record.  Each argument is stored unchanged in the
   field of the same name; an optional argument that is not supplied ends up
   as [None].  [name] is the final positional argument, so the optional
   arguments can be erased at the call site. *)
let create ?metadata ~fasta ?dbsnp ?cosmic ?exome_gtf ?cdna ?major_contigs
    name =
  { name; metadata; fasta; dbsnp; cosmic; exome_gtf; cdna; major_contigs }
(* Ready-made specifications built with [create].  Every value is a plain
   description (names, strings and [Location.t] values); nothing is fetched
   when this module is evaluated. *)
module Default = struct
  (* "1" .. "22" followed by "X", "Y" and "MT". *)
  let major_contigs_b37 =
    let autosomes = List.init 22 (fun index -> sprintf "%d" (index + 1)) in
    autosomes @ [ "X"; "Y"; "MT" ]

  (* "chr1" .. "chr22" followed by "chrX", "chrY" and "chrM". *)
  let major_contigs_hg_family =
    let autosomes = List.init 22 (fun index -> sprintf "chr%d" (index + 1)) in
    autosomes @ [ "chrX"; "chrY"; "chrM" ]

  (* "1" .. "19" followed by "X" and "Y". *)
  let major_contigs_mm10 =
    let autosomes = List.init 19 (fun index -> sprintf "%d" (index + 1)) in
    autosomes @ [ "X"; "Y" ]

  (* Names given to the specifications defined below. *)
  module Name = struct
    let b37 = "b37"
    let b37decoy = "b37decoy"
    let b38 = "b38"
    let hg18 = "hg18"
    let hg19 = "hg19"
    let mm10 = "mm10"
  end

  (* Addresses shared by [b37] and [b37decoy]. *)
  let b37_dbsnp_url =
    "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/b37/dbsnp_138.b37.vcf.gz"

  let b37_cosmic_url =
    "http://www.broadinstitute.org/cancer/cga/sites/default/files/data/tools/mutect/b37_cosmic_v54_120711.vcf"

  let b37_exome_gtf_url =
    "http://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz"

  let b37_cdna_url =
    "http://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.75.cdna.all.fa.gz"

  (* Specification named [Name.b37].  All files except COSMIC are wrapped in
     [Location.gunzip]; the COSMIC address is used as a bare URL. *)
  let b37 =
    let fasta_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/b37/human_g1k_v37.fasta.gz"
    in
    create
      ~metadata:"Provided by the Biokepi library"
      ~fasta:(Location.gunzip (Location.url fasta_url))
      ~dbsnp:(Location.gunzip (Location.url b37_dbsnp_url))
      ~cosmic:(Location.url b37_cosmic_url)
      ~exome_gtf:(Location.gunzip (Location.url b37_exome_gtf_url))
      ~cdna:(Location.gunzip (Location.url b37_cdna_url))
      ~major_contigs:major_contigs_b37
      Name.b37

  (* Specification named [Name.b37decoy]: same auxiliary files and contig
     list as [b37], with a different FASTA address. *)
  let b37decoy =
    let fasta_url =
      "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
    in
    create
      ~metadata:"Provided by the Biokepi library"
      ~fasta:(Location.gunzip (Location.url fasta_url))
      ~dbsnp:(Location.gunzip (Location.url b37_dbsnp_url))
      ~cosmic:(Location.url b37_cosmic_url)
      ~exome_gtf:(Location.gunzip (Location.url b37_exome_gtf_url))
      ~cdna:(Location.gunzip (Location.url b37_cdna_url))
      ~major_contigs:major_contigs_b37
      Name.b37decoy

  (* Specification named [Name.b38].  It reuses [major_contigs_b37] and
     supplies no COSMIC location. *)
  let b38 =
    let fasta_url =
      "ftp://ftp.ensembl.org/pub/release-79/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
    in
    let dbsnp_url =
      "http://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b142_GRCh38/VCF/00-All.vcf.gz"
    in
    let gtf_url =
      "http://ftp.ensembl.org/pub/release-79/gtf/homo_sapiens/Homo_sapiens.GRCh38.79.gtf.gz"
    in
    let cdna_url =
      "http://ftp.ensembl.org/pub/release-79/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz"
    in
    create
      ~metadata:"Provided by the Biokepi library"
      ~fasta:(Location.gunzip (Location.url fasta_url))
      ~dbsnp:(Location.gunzip (Location.url dbsnp_url))
      ~exome_gtf:(Location.gunzip (Location.url gtf_url))
      ~cdna:(Location.gunzip (Location.url cdna_url))
      ~major_contigs:major_contigs_b37
      Name.b38

  (* Specification named [Name.hg18]: FASTA and dbSNP only, with the
     "chr"-prefixed contig list. *)
  let hg18 =
    let fasta_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg18/Homo_sapiens_assembly18.fasta.gz"
    in
    let dbsnp_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg18/dbsnp_138.hg18.vcf.gz"
    in
    create
      ~metadata:"Provided by the Biokepi library"
      ~fasta:(Location.gunzip (Location.url fasta_url))
      ~dbsnp:(Location.gunzip (Location.url dbsnp_url))
      ~major_contigs:major_contigs_hg_family
      Name.hg18

  (* Specification named [Name.hg19]: FASTA and dbSNP only, with the
     "chr"-prefixed contig list. *)
  let hg19 =
    let fasta_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/ucsc.hg19.fasta.gz"
    in
    let dbsnp_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/dbsnp_138.hg19.vcf.gz"
    in
    create
      ~metadata:"Provided by the Biokepi library"
      ~fasta:(Location.gunzip (Location.url fasta_url))
      ~dbsnp:(Location.gunzip (Location.url dbsnp_url))
      ~major_contigs:major_contigs_hg_family
      Name.hg19

  (* Specification named [Name.mm10].  Its dbSNP location is a
     [Location.vcf_concat] of two gunzipped files (SNPs and indels).
     NOTE(review): the FASTA address is under release-84 while the GTF
     address is under release-79 -- confirm this mix is intended. *)
  let mm10 =
    let fasta_url =
      "ftp://ftp.ensembl.org/pub/release-84/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gz"
    in
    let snps_url =
      "ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/mgp.v3.snps.rsIDdbSNPv137.vcf.gz"
    in
    let indels_url =
      "ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/mgp.v3.indels.rsIDdbSNPv137.vcf.gz"
    in
    let gtf_url =
      "ftp://ftp.ensembl.org/pub/release-79/gtf/mus_musculus/Mus_musculus.GRCm38.79.gtf.gz"
    in
    let dbsnp =
      Location.vcf_concat [
        ("db_snps.vcf", Location.gunzip (Location.url snps_url));
        ("db_indels.vcf", Location.gunzip (Location.url indels_url));
      ]
    in
    create
      (* This literal capitalises "Library", unlike the other entries;
         it is a runtime string and is kept verbatim. *)
      ~metadata:"Provided by the Biokepi Library"
      ~fasta:(Location.gunzip (Location.url fasta_url))
      ~dbsnp
      ~exome_gtf:(Location.gunzip (Location.url gtf_url))
      ~major_contigs:major_contigs_mm10
      Name.mm10
end
end