module Common_pipelines
= struct
open Biokepi_run_environment
open Biokepi_bfx_tools
open Common
module Somatic = struct
type from_fastqs =
normal_fastqs:Pipeline.Construct.input_fastq ->
tumor_fastqs:Pipeline.Construct.input_fastq ->
dataset:string -> Pipeline.vcf Pipeline.t list
let crazy_example ~normal_fastqs ~tumor_fastqs ~dataset =
let open Pipeline.Construct in
let normal = input_fastq ~dataset normal_fastqs in
let tumor = input_fastq ~dataset tumor_fastqs in
let bam_pair ?configuration () =
let normal =
bwa ?configuration normal
|> gatk_indel_realigner
|> picard_mark_duplicates
|> gatk_bqsr
in
let tumor =
bwa ?configuration tumor
|> gatk_indel_realigner
|> picard_mark_duplicates
|> gatk_bqsr
in
pair ~normal ~tumor in
let bam_pairs =
let non_default =
{ Bwa.Configuration.Aln.
name = "config42";
gap_open_penalty = 10;
gap_extension_penalty = 7 } in
[
bam_pair ();
bam_pair ~configuration:non_default ();
] in
let vcfs =
List.concat_map bam_pairs ~f:(fun bam_pair ->
[
mutect bam_pair;
somaticsniper bam_pair;
somaticsniper
~configuration:Somaticsniper.Configuration.{
name = "example0001-095";
prior_probability = 0.001;
theta = 0.95;
} bam_pair;
varscan_somatic bam_pair;
strelka ~configuration:Strelka.Configuration.exome_default bam_pair;
])
in
vcfs
let from_fastqs_with_variant_caller
~variant_caller ~normal_fastqs ~tumor_fastqs ~dataset =
let open Pipeline.Construct in
let normal = input_fastq ~dataset normal_fastqs in
let tumor = input_fastq ~dataset tumor_fastqs in
let make_bam data =
data |> bwa_mem |> gatk_indel_realigner |> picard_mark_duplicates |> gatk_bqsr
in
let vc_input =
pair ~normal:(make_bam normal) ~tumor:(make_bam tumor) in
[variant_caller vc_input]
end
end
module Optimization_framework
= struct
module type Trans_base = sig
type 'a from
type 'a term
val fwd : 'a from -> 'a term
val bwd : 'a term -> 'a from
end
module type Transformation = sig
include Trans_base
val fmap : ('a from -> 'b from) -> ('a term -> 'b term)
val fmap2 : ('a from -> 'b from -> 'c from) ->
('a term -> 'b term -> 'c term)
end
module Define_transformation(X : Trans_base) = struct
include X
let fmap f term = fwd (f (bwd term))
let fmap2 f term1 term2 = fwd (f (bwd term1) (bwd term2))
end
module Generic_optimizer
(X: Transformation)
(Input: Semantics.Bioinformatics_base with type 'a repr = 'a X.from)
: Semantics.Bioinformatics_base
with type 'a repr = 'a X.term
and type 'a observation = 'a Input.observation
= struct
module Tools = Biokepi_bfx_tools
open X
type 'a repr = 'a term
type 'a observation = 'a Input.observation
let lambda f = fwd (Input.lambda (fun x -> bwd (f (fwd x))))
let apply e1 e2 = fmap2 Input.apply e1 e2
let observe f =
Input.observe (fun () -> bwd (f ()))
let list l =
fwd (Input.list (List.map bwd l))
let list_map l ~f =
fwd (Input.list_map (bwd l) (bwd f))
let to_unit x = fwd (Input.to_unit (bwd x))
let pair a b = fwd (Input.pair (bwd a) (bwd b))
let pair_first x = fwd (Input.pair_first (bwd x))
let pair_second x = fwd (Input.pair_second (bwd x))
let fastq ~sample_name ?fragment_id ~r1 ?r2 () =
fwd (Input.fastq ~sample_name ?fragment_id ~r1 ?r2 ())
let fastq_gz ~sample_name ?fragment_id ~r1 ?r2 () =
fwd (Input.fastq_gz ~sample_name ?fragment_id ~r1 ?r2 ())
let bam ~path ?sorting ~reference_build () =
fwd (Input.bam ~path ?sorting ~reference_build ())
let bwa_aln ?configuration ~reference_build fq =
fwd (Input.bwa_aln ?configuration ~reference_build (bwd fq))
let bwa_mem ?configuration ~reference_build fq =
fwd (Input.bwa_mem ?configuration ~reference_build (bwd fq))
let star ?configuration ~reference_build fq =
fwd (Input.star ?configuration ~reference_build (bwd fq))
let hisat ?configuration ~reference_build fq =
fwd (Input.hisat ?configuration ~reference_build (bwd fq))
let mosaik ~reference_build fq =
fwd (Input.mosaik ~reference_build (bwd fq))
let stringtie ?configuration fq =
fwd (Input.stringtie ?configuration (bwd fq))
let gatk_indel_realigner ?configuration bam =
fwd (Input.gatk_indel_realigner ?configuration (bwd bam))
let gatk_indel_realigner_joint ?configuration pair =
fwd (Input.gatk_indel_realigner_joint ?configuration (bwd pair))
let picard_mark_duplicates ?configuration bam =
fwd (Input.picard_mark_duplicates ?configuration (bwd bam))
let gatk_bqsr ?configuration bam =
fwd (Input.gatk_bqsr ?configuration (bwd bam))
let seq2hla fq =
fwd (Input.seq2hla (bwd fq))
let optitype how fq =
fwd (Input.optitype how (bwd fq))
let gatk_haplotype_caller bam =
fwd (Input.gatk_haplotype_caller (bwd bam))
let gunzip gz =
fwd (Input.gunzip (bwd gz))
let gunzip_concat gzl =
fwd (Input.gunzip_concat (bwd gzl))
let concat l =
fwd (Input.concat (bwd l))
let merge_bams bl =
fwd (Input.merge_bams (bwd bl))
let bam_to_fastq ~sample_name ?fragment_id se_or_pe bam =
fwd (Input.bam_to_fastq ~sample_name ?fragment_id se_or_pe (bwd bam))
let mutect ?configuration ~normal ~tumor () =
fwd (Input.mutect ?configuration ~normal:(bwd normal) ~tumor:(bwd tumor) ())
let mutect2 ?configuration ~normal ~tumor () =
fwd (Input.mutect2 ?configuration ~normal:(bwd normal) ~tumor:(bwd tumor) ())
let somaticsniper ?configuration ~normal ~tumor () =
fwd (Input.somaticsniper ?configuration ~normal:(bwd normal) ~tumor:(bwd tumor) ())
let strelka ?configuration ~normal ~tumor () =
fwd (Input.strelka ?configuration ~normal:(bwd normal) ~tumor:(bwd tumor) ())
let varscan_somatic ?adjust_mapq ~normal ~tumor () =
fwd (Input.varscan_somatic
?adjust_mapq ~normal:(bwd normal) ~tumor:(bwd tumor) ())
let muse ?configuration ~normal ~tumor () =
fwd (Input.muse ?configuration ~normal:(bwd normal) ~tumor:(bwd tumor) ())
let virmid ?configuration ~normal ~tumor () =
fwd (Input.virmid ?configuration ~normal:(bwd normal) ~tumor:(bwd tumor) ())
end
end
module Pipeline
= struct
open Biokepi_run_environment
open Biokepi_bfx_tools
open Common
module File = struct
type t = KEDSL.file_workflow
end
type json = Yojson.Basic.json
type fastq_gz = Fastq_gz
type fastq = Fastq
type fastq_sample = Fastq_sample
type bam = KEDSL.bam_file KEDSL.workflow_node
type bam_pair = Bam_pair
type vcf = Vcf
type gtf = Gtf
type seq2hla_hla_types = Seq2HLA_hla_types
type optitype_hla_types = Optitype_hla_types
type somatic = { normal : bam; tumor : bam }
type germline = bam
type fastq_sample_info = {
sample_name: string;
fragment_id: string;
}
module Variant_caller = struct
type _ input =
| Somatic: somatic -> somatic input
| Germline: germline -> germline input
type 'a t = {
name: string;
configuration_json: json;
configuration_name: string;
make_target:
run_with: Machine.t ->
input: 'a input ->
result_prefix: string ->
?more_edges: KEDSL.workflow_edge list ->
unit ->
KEDSL.file_workflow
}
end
type metadata_spec = [
| `Add_tags of string list
| `Add_tags_rec of string list
]
type _ t =
| Fastq_gz: File.t -> fastq_gz t
| Fastq: File.t -> fastq t
| Bam_sample: string * bam -> bam t
| Bam_to_fastq: fastq_sample_info option * [ `Single | `Paired ] * bam t -> fastq_sample t
| Paired_end_sample: fastq_sample_info * fastq t * fastq t -> fastq_sample t
| Single_end_sample: fastq_sample_info * fastq t -> fastq_sample t
| Gunzip_concat: fastq_gz t list -> fastq t
| Concat_text: fastq t list -> fastq t
| Star: Star.Configuration.Align.t * fastq_sample t -> bam t
| Hisat: Hisat.Configuration.t * fastq_sample t -> bam t
| Stringtie: Stringtie.Configuration.t * bam t -> gtf t
| Bwa: Bwa.Configuration.Aln.t * fastq_sample t -> bam t
| Bwa_mem: Bwa.Configuration.Mem.t * fastq_sample t -> bam t
| Mosaik: fastq_sample t -> bam t
| Gatk_indel_realigner: Gatk.Configuration.indel_realigner * bam t -> bam t
| Picard_mark_duplicates: Picard.Mark_duplicates_settings.t * bam t -> bam t
| Gatk_bqsr: (Gatk.Configuration.bqsr * bam t) -> bam t
| Bam_pair: bam t * bam t -> bam_pair t
| Somatic_variant_caller: somatic Variant_caller.t * bam_pair t -> vcf t
| Germline_variant_caller: germline Variant_caller.t * bam t -> vcf t
| Seq2HLA: fastq_sample t -> seq2hla_hla_types t
| Optitype: ([`DNA | `RNA] * fastq_sample t) -> optitype_hla_types t
| With_metadata: metadata_spec * 'a t -> 'a t
module Construct = struct
type input_fastq = [
| `Paired_end of File.t list * File.t list
| `Single_end of File.t list
]
let input_fastq ~dataset (fastqs: input_fastq) =
let is_fastq_gz p =
Filename.check_suffix p "fastq.gz" || Filename.check_suffix p "fq.gz" in
let is_fastq p =
Filename.check_suffix p "fastq" || Filename.check_suffix p "fq" in
let theyre_all l f = List.for_all l ~f:(fun file -> f file#product#path) in
let bring_to_single_fastq l =
match l with
| [] -> failwithf "Dataset %S seems empty" dataset
| gzs when theyre_all gzs is_fastq_gz ->
Gunzip_concat (List.map gzs (fun f -> Fastq_gz f))
| fqs when theyre_all fqs is_fastq ->
Concat_text (List.map fqs (fun f -> Fastq f))
| not_supported ->
failwithf
"For now, a sample must be a uniform list of fastq.gz/fq.gz or .fq/.fastq files. Dataset %S does not qualify: [%s]
"
dataset
(List.map not_supported ~f:(fun f -> Filename.basename f#product#path)
|> String.concat ~sep:", ")
in
let sample_info = {sample_name = dataset; fragment_id = dataset} in
match fastqs with
| `Paired_end (l1, l2) ->
Paired_end_sample (sample_info, bring_to_single_fastq l1, bring_to_single_fastq l2)
| `Single_end l ->
Single_end_sample (sample_info, bring_to_single_fastq l)
let bam ~dataset bam = Bam_sample (dataset, bam)
let bam_to_fastq ?sample_name how bam = Bam_to_fastq (sample_name, how, bam)
let bwa ?(configuration = Bwa.Configuration.Aln.default) fastq =
Bwa (configuration, fastq)
let bwa_aln = bwa
let bwa_mem ?(configuration = Bwa.Configuration.Mem.default) fastq =
Bwa_mem (configuration, fastq)
let mosaik fastq = Mosaik fastq
let star ?(configuration = Star.Configuration.Align.default) fastq =
Star (configuration, fastq)
let hisat ?(configuration = Hisat.Configuration.default_v1) fastq =
Hisat (configuration, fastq)
let stringtie ?(configuration = Stringtie.Configuration.default) bam =
Stringtie (configuration, bam)
let gatk_indel_realigner
?(configuration=Gatk.Configuration.default_indel_realigner)
bam
= Gatk_indel_realigner (configuration, bam)
let picard_mark_duplicates
?(settings=Picard.Mark_duplicates_settings.default) bam =
Picard_mark_duplicates (settings, bam)
let gatk_bqsr ?(configuration=Gatk.Configuration.default_bqsr) bam = Gatk_bqsr (configuration, bam)
let pair ~normal ~tumor = Bam_pair (normal, tumor)
let germline_variant_caller t input_bam =
Germline_variant_caller (t, input_bam)
let gatk_haplotype_caller input_bam =
let configuration_name = "default" in
let configuration_json =
`Assoc [
"Name", `String configuration_name;
] in
let make_target
~run_with ~input ~result_prefix ?more_edges () =
match input with
| Variant_caller.Germline input_bam ->
Gatk.haplotype_caller ?more_edges ~run_with
~input_bam ~result_prefix `Map_reduce in
germline_variant_caller
{Variant_caller.name = "Gatk-HaplotypeCaller";
configuration_json;
configuration_name;
make_target;}
input_bam
let somatic_variant_caller t bam_pair =
Somatic_variant_caller (t, bam_pair)
let mutect ?(configuration=Mutect.Configuration.default) bam_pair =
let configuration_name = configuration.Mutect.Configuration.name in
let configuration_json = Mutect.Configuration.to_json configuration in
let make_target
~run_with ~input ~result_prefix ?more_edges () =
match input with | Variant_caller.Somatic {normal; tumor} ->
Mutect.run
~configuration
?more_edges
~run_with
~normal ~tumor
~result_prefix `Map_reduce in
somatic_variant_caller
{Variant_caller.name = "Mutect";
configuration_json;
configuration_name;
make_target;}
bam_pair
let mutect2 ?(configuration=Gatk.Configuration.Mutect2.default) bam_pair =
let configuration_name = configuration.Gatk.Configuration.Mutect2.name in
let configuration_json = Gatk.Configuration.Mutect2.to_json configuration in
let make_target
~run_with ~input ~result_prefix ?more_edges () =
match input with
| Variant_caller.Somatic {normal; tumor} ->
Gatk.mutect2
~configuration ?more_edges ~run_with
~input_normal_bam:normal ~input_tumor_bam:tumor
~result_prefix `Map_reduce in
somatic_variant_caller
{Variant_caller.name = "Mutect";
configuration_json;
configuration_name;
make_target;}
bam_pair
let somaticsniper
?(configuration = Somaticsniper.Configuration.default)
bam_pair =
let make_target
~run_with ~input ~result_prefix ?more_edges () =
match input with
| Variant_caller.Somatic {normal; tumor} ->
Somaticsniper.run
~configuration ~run_with ~normal ~tumor ~result_prefix () in
somatic_variant_caller
{Variant_caller.name = "Somaticsniper";
configuration_json = Somaticsniper.Configuration.to_json configuration;
configuration_name = Somaticsniper.Configuration.name configuration;
make_target;}
bam_pair
let varscan_somatic ?adjust_mapq bam_pair =
let configuration_name =
sprintf "amq-%s"
(Option.value_map ~default:"NONE" adjust_mapq ~f:Int.to_string) in
let configuration_json =
`Assoc [
"Name", `String configuration_name;
"Adjust_mapq",
`String (Option.value_map adjust_mapq ~f:Int.to_string ~default:"None");
] in
somatic_variant_caller
{Variant_caller.name = "Varscan-somatic";
configuration_json;
configuration_name;
make_target = begin
fun ~run_with ~input ~result_prefix ?more_edges () ->
match input with | Variant_caller.Somatic {normal; tumor} ->
Varscan.somatic_map_reduce ?adjust_mapq
?more_edges ~run_with ~normal ~tumor ~result_prefix ()
end}
bam_pair
let strelka ~configuration bam_pair =
somatic_variant_caller
{Variant_caller.name = "Strelka";
configuration_json = Strelka.Configuration.to_json configuration;
configuration_name = configuration.Strelka.Configuration.name;
make_target =
fun ~run_with ~input ~result_prefix ?more_edges () ->
match input with | Variant_caller.Somatic {normal; tumor} ->
Strelka.run
?more_edges
~configuration ~normal ~tumor
~run_with ~result_prefix
()
}
bam_pair
let virmid ~configuration bam_pair =
somatic_variant_caller
{Variant_caller.name = "Virmid";
configuration_json = Virmid.Configuration.to_json configuration;
configuration_name = configuration.Virmid.Configuration.name;
make_target =
fun ~run_with ~input ~result_prefix
?more_edges () ->
match input with | Variant_caller.Somatic {normal; tumor} ->
Virmid.run
?more_edges
~configuration ~normal ~tumor
~run_with ~result_prefix
()
}
bam_pair
let muse ~configuration bam_pair =
let make_target
~(run_with: Machine.t) ~input ~result_prefix
?more_edges () =
match input with | Variant_caller.Somatic {normal; tumor} ->
Muse.run ~configuration ?more_edges
~run_with ~normal ~tumor ~result_prefix `Map_reduce in
somatic_variant_caller
{Variant_caller.name = "Muse";
configuration_json = Muse.Configuration.to_json configuration;
configuration_name = configuration.Muse.Configuration.name;
make_target }
bam_pair
let seq2hla fastq_sample = Seq2HLA fastq_sample
let optitype kind fastq_sample = Optitype (kind, fastq_sample)
let add_tags ?(recursively = false) tags pipeline =
With_metadata ((if recursively then `Add_tags_rec tags else `Add_tags tags),
pipeline)
end
let rec to_file_prefix:
type a.
?read:[ `R1 of string | `R2 of string ] ->
a t -> string =
fun ?read w ->
begin match w with
| With_metadata (_, p) -> to_file_prefix ?read p
| Fastq_gz _ -> failwith "TODO"
| Fastq _ -> failwith "TODO"
| Single_end_sample (info, _) -> info.fragment_id
| Gunzip_concat [] -> failwith "TODO"
| Gunzip_concat (_ :: _) ->
begin match read with
| None -> "-cat"
| Some (`R1 s) -> sprintf "%s-R1-cat" s
| Some (`R2 s) -> sprintf "%s-R2-cat" s
end
| Concat_text _ -> failwith "TODO"
| Bam_sample (name, _) -> Filename.basename name
| Bam_to_fastq (_, how, bam) ->
sprintf "%s-b2fq-%s"
(to_file_prefix bam)
(match how with `Paired -> "PE" | `Single -> "SE")
| Paired_end_sample (info, _ , _) -> info.fragment_id
| Bwa (configuration, sample) ->
sprintf "%s-bwa-%s"
(to_file_prefix sample) (Bwa.Configuration.Aln.name configuration)
| Bwa_mem (configuration, sample) ->
sprintf "%s-bwa-mem-%s"
(to_file_prefix sample) (Bwa.Configuration.Mem.name configuration)
| Star (configuration, sample) ->
sprintf "%s-%s-star-aligned"
(to_file_prefix sample)
(Star.Configuration.Align.name configuration)
| Hisat (conf, sample) ->
sprintf "%s-hisat-%s-aligned" (to_file_prefix sample) (conf.Hisat.Configuration.name)
| Stringtie (conf, sample) ->
sprintf "%s-%s-stringtie"
(to_file_prefix sample)
(conf.Stringtie.Configuration.name)
| Mosaik (sample) ->
sprintf "%s-mosaik" (to_file_prefix sample)
| Gatk_indel_realigner ((indel_cfg, target_cfg), bam) ->
let open Gatk.Configuration in
sprintf "%s-indelrealigned-%s-%s"
(to_file_prefix ?read bam)
indel_cfg.Indel_realigner.name
target_cfg.Realigner_target_creator.name
| Gatk_bqsr ((bqsr_cfg, print_reads_cfg), bam) ->
let open Gatk.Configuration in
sprintf "%s-bqsr-%s-%s"
(to_file_prefix ?read bam)
bqsr_cfg.Bqsr.name
print_reads_cfg.Print_reads.name
| Picard_mark_duplicates (_, bam) ->
sprintf "%s-dedup" (to_file_prefix ?read bam)
| Bam_pair (nor, tum) -> to_file_prefix tum
| Somatic_variant_caller (vc, bp) ->
let prev = to_file_prefix bp in
sprintf "%s-%s-%s" prev
vc.Variant_caller.name
vc.Variant_caller.configuration_name
| Germline_variant_caller (vc, bp) ->
let prev = to_file_prefix bp in
sprintf "%s-%s-%s" prev
vc.Variant_caller.name
vc.Variant_caller.configuration_name
| Seq2HLA s ->
sprintf "seq2hla-%s" (to_file_prefix ?read s)
| Optitype (kind, s) ->
sprintf "optitype-%s-%s"
(match kind with `DNA -> "DNA" | `RNA -> "RNA")
(to_file_prefix ?read s)
end
let rec to_json: type a. a t -> json =
fun w ->
let call name (args : json list): json = `List (`String name :: args) in
match w with
| Fastq_gz file -> call "Fastq_gz" [`String file#product#path]
| Fastq file -> call "Fastq" [`String file#product#path]
| Bam_sample (name, file) ->
call "Bam-sample" [`String name; `String file#product#path]
| Bam_to_fastq (name, how, bam) ->
let how_string =
match how with `Paired -> "Paired" | `Single -> "Single" in
call "Bam-to-fastq" [`String how_string; to_json bam]
| Paired_end_sample ({sample_name; fragment_id}, r1, r2) ->
call "Paired-end" [`String sample_name; `String fragment_id;
to_json r1; to_json r2]
| Single_end_sample ({sample_name; fragment_id}, r) ->
call "Single-end" [`String sample_name; `String fragment_id; to_json r]
| Gunzip_concat fastq_gz_list ->
call "Gunzip-concat" (List.map ~f:to_json fastq_gz_list)
| Concat_text fastq_list ->
call "Concat" (List.map ~f:to_json fastq_list)
| Bwa (config, input) ->
call "BWA" [
`Assoc ["configuration", Bwa.Configuration.Aln.to_json config];
to_json input
]
| Bwa_mem (params, input) ->
let input_json = to_json input in
call "BWA-MEM" [
`Assoc ["configuration", Bwa.Configuration.Mem.to_json params];
input_json
]
| Star (conf, input) ->
let input_json = to_json input in
call "STAR" [
`Assoc ["configuration", Star.Configuration.Align.to_json conf];
input_json;
]
| Hisat (conf, input) ->
let input_json = to_json input in
call "HISAT" [
`Assoc ["configuration", Hisat.Configuration.to_json conf];
input_json;
]
| Stringtie (conf, input) ->
let input_json = to_json input in
call "Stringtie" [
`Assoc ["configuration", Stringtie.Configuration.to_json conf];
input_json;
]
| Mosaik (input) ->
let input_json = to_json input in
call "MOSAIK" [input_json]
| Gatk_indel_realigner ((indel_cfg, target_cfg), bam) ->
let open Gatk.Configuration in
let input_json = to_json bam in
let indel_cfg_json = Indel_realigner.to_json indel_cfg in
let target_cfg_json = Realigner_target_creator.to_json target_cfg in
call "Gatk_indel_realigner" [`Assoc [
"Configuration", `Assoc [
"IndelRealigner Configuration", indel_cfg_json;
"RealignerTargetCreator Configuration", target_cfg_json;
];
"Input", input_json;
]]
| Gatk_bqsr ((bqsr_cfg, print_reads_cfg), bam) ->
let open Gatk.Configuration in
let input_json = to_json bam in
call "Gatk_bqsr" [`Assoc [
"Configuration", `Assoc [
"Bqsr", Bqsr.to_json bqsr_cfg;
"Print_reads", Print_reads.to_json print_reads_cfg;
];
"Input", input_json;
]]
| Germline_variant_caller (gvc, bam) ->
call gvc.Variant_caller.name [`Assoc [
"Configuration", gvc.Variant_caller.configuration_json;
"Input", to_json bam;
]]
| Picard_mark_duplicates (settings, bam) ->
call "Picard_mark_duplicates" [`Assoc ["input", to_json bam]]
| Bam_pair (normal, tumor) ->
call "Bam-pair" [`Assoc ["normal", to_json normal; "tumor", to_json tumor]]
| Somatic_variant_caller (svc, bam_pair) ->
call svc.Variant_caller.name [`Assoc [
"Configuration", svc.Variant_caller.configuration_json;
"Input", to_json bam_pair;
]]
| Seq2HLA input ->
call "Seq2HLA" [`Assoc [
"Input", to_json input
]]
| Optitype (kind, input) ->
call "Optitype" [`Assoc [
"Input", to_json input;
"Kind", `String (match kind with `DNA -> "DNA" | `RNA -> "RNA")
]]
| With_metadata (_, p) -> to_json p
module Compiler = struct
type 'a pipeline = 'a t
type workflow_option_failure_mode = [ `Silent | `Fail_if_not_happening ]
type workflow_option = [
| `Multi_sample_indel_realignment of workflow_option_failure_mode
| `Parallel_alignment_over_fastq_fragments of
[ `Bwa_mem | `Bwa | `Mosaik | `Star | `Hisat ] list
* workflow_option_failure_mode
| `Map_reduce of [ `Gatk_indel_realigner ]
]
type t = {
reference_build: Reference_genome.name;
work_dir: string;
machine : Machine.t;
options: workflow_option list;
wrap_bam_node:
bam pipeline ->
KEDSL.bam_file KEDSL.workflow_node ->
KEDSL.bam_file KEDSL.workflow_node;
wrap_vcf_node:
vcf pipeline ->
KEDSL.single_file KEDSL.workflow_node ->
KEDSL.single_file KEDSL.workflow_node;
wrap_gtf_node:
gtf pipeline ->
KEDSL.single_file KEDSL.workflow_node ->
KEDSL.single_file KEDSL.workflow_node;
}
let create
?(wrap_bam_node = fun _ x -> x)
?(wrap_vcf_node = fun _ x -> x)
?(wrap_gtf_node = fun _ x -> x)
?(options=[])
~reference_build ~work_dir ~machine () =
{reference_build; work_dir; machine; options;
wrap_bam_node; wrap_vcf_node; wrap_gtf_node}
let has_option {options; _} f =
List.exists options ~f
let apply_with_metadata ~metadata_spec wf =
begin match metadata_spec with
| `Add_tags tgs -> KEDSL.add_tags ~recursive:false wf tgs
| `Add_tags_rec tgs -> KEDSL.add_tags ~recursive:true wf tgs
end;
wf
let rec fastq_step ~read ~compiler (pipeline: fastq pipeline) =
let {work_dir; machine ; _ } = compiler in
match pipeline with
| Fastq f -> f
| Concat_text (l: fastq pipeline list) ->
failwith "Compilation of Biokepi.Pipeline.Concat_text: not implemented"
| Gunzip_concat (l: fastq_gz pipeline list) ->
let fastqs =
let rec f =
function
| Fastq_gz t -> t
| With_metadata (metadata_spec, p) ->
apply_with_metadata ~metadata_spec (f p)
in
List.map l ~f in
let result_path = work_dir // to_file_prefix ~read pipeline ^ ".fastq" in
dbg "Result_Path: %S" result_path;
Workflow_utilities.Gunzip.concat ~run_with:machine fastqs ~result_path
| With_metadata (metadata_spec, pipeline) ->
fastq_step ~read ~compiler pipeline |> apply_with_metadata ~metadata_spec
let rec fastq_sample_step ~compiler (fs : fastq_sample pipeline) =
let {work_dir; machine ; _ } = compiler in
match fs with
| Bam_to_fastq (info_opt, how, what) ->
let bam = compile_aligner_step ~compiler what in
let sample_type =
match how with `Single -> `Single_end | `Paired -> `Paired_end in
let fastq_pair =
let output_prefix = work_dir // to_file_prefix ?read:None what in
let sample_name = Option.map info_opt (fun x -> x.sample_name) in
Picard.bam_to_fastq ~run_with:machine ~sample_type
?sample_name
~output_prefix bam
in
fastq_pair
| Paired_end_sample (info, l1, l2) ->
let r1 = fastq_step ~compiler ~read:(`R1 info.fragment_id) l1 in
let r2 = fastq_step ~compiler ~read:(`R2 info.fragment_id) l2 in
let open KEDSL in
workflow_node (fastq_reads ~host:Machine.(as_host machine)
~name:info.sample_name
r1#product#path (Some r2#product#path))
~name:(sprintf "Pairing sample %s (%s and %s)"
info.sample_name
(Filename.basename r1#product#path)
(Filename.basename r2#product#path))
~edges:[ depends_on r1; depends_on r2 ]
| Single_end_sample (info, single) ->
let r1 = fastq_step ~compiler ~read:(`R1 info.fragment_id) single in
let open KEDSL in
workflow_node (fastq_reads ~host:Machine.(as_host machine)
~name:info.sample_name
r1#product#path None)
~equivalence:`None
~edges:[ depends_on r1 ]
~name:(sprintf "single-end %s (%s)"
info.sample_name
(Filename.basename r1#product#path))
| With_metadata (metadata_spec, p) ->
fastq_sample_step ~compiler p |> apply_with_metadata ~metadata_spec
and compile_aligner_step ~compiler (pipeline : bam pipeline) =
let {reference_build; work_dir; machine; _} = compiler in
let result_prefix = work_dir // to_file_prefix pipeline in
dbg "Result_Prefix: %S" result_prefix;
let rec parallelize_alignment ~make_aligner sample =
match sample with
| Paired_end_sample (info,
Gunzip_concat r1_list,
Gunzip_concat r2_list) ->
let exploded =
let count = ref 0 in
List.map2 r1_list r2_list
~f:(fun r1 r2 ->
match r1, r2 with
| (Fastq_gz wf1, Fastq_gz wf2) ->
let new_info =
incr count;
{info with
fragment_id =
sprintf "%s-fragment-%d" info.fragment_id !count} in
make_aligner (
Paired_end_sample (new_info,
Gunzip_concat [r1],
Gunzip_concat [r2]))
| other -> failwith "compile_aligner_step: not implemented"
) in
let bams = List.map exploded ~f:(compile_aligner_step ~compiler) in
Samtools.merge_bams ~run_with:machine bams
(result_prefix ^ "-merged.bam")
| With_metadata (metadata_spec, p) ->
parallelize_alignment ~make_aligner p
|> apply_with_metadata ~metadata_spec
| other ->
failwith "parallelize_alignment: Not fully implemented"
in
let catch_parallelize_aligner aligner sample =
let option =
List.find_map compiler.options
(function
| `Parallel_alignment_over_fastq_fragments (al, todo)
when List.mem ~set:al aligner -> Some todo
| _ -> None)
in
match sample with
| Paired_end_sample (name,
Gunzip_concat r1_list,
Gunzip_concat r2_list)
when List.length r1_list > 1 ->
begin match option with
| Some _ -> `Caught
| None -> `Not_caught
end
| Paired_end_sample (name,
Gunzip_concat r1_list,
Gunzip_concat r2_list)
when List.length r1_list = 1 -> `Not_caught
| other ->
begin match option with
| Some `Silent
| None -> `Not_caught
| Some `Fail_if_not_happening ->
failwithf "Option Parallel_alignment_over_fastq_fragments is set to Fail_if_not_happening and it didn't happen:\n%s"
(to_json other |> Yojson.Basic.pretty_to_string)
end
in
let perform_aligner_parallelization aligner_tag ~make_aligner
~make_workflow sample =
begin match catch_parallelize_aligner aligner_tag sample with
| `Caught -> parallelize_alignment ~make_aligner sample
| `Not_caught ->
let fastq = fastq_sample_step ~compiler sample in
make_workflow fastq
end
in
let bam_node =
match pipeline with
| Bam_sample (name, bam_target) -> bam_target
| Gatk_indel_realigner (configuration, bam)
when has_option compiler ((=) (`Map_reduce `Gatk_indel_realigner)) ->
let input_bam = compile_aligner_step ~compiler bam in
Gatk.indel_realigner_map_reduce ~run_with:machine ~compress:false
~configuration (KEDSL.Single_bam input_bam)
| Gatk_indel_realigner (configuration, bam) ->
let input_bam = compile_aligner_step ~compiler bam in
Gatk.indel_realigner ~run_with:machine ~compress:false
~configuration (KEDSL.Single_bam input_bam)
| Gatk_bqsr (configuration, bam) ->
let input_bam = compile_aligner_step ~compiler bam in
let output_bam = result_prefix ^ ".bam" in
Gatk.base_quality_score_recalibrator
~configuration ~run_with:machine ~input_bam ~output_bam
| Picard_mark_duplicates (settings, bam) ->
let input_bam = compile_aligner_step ~compiler bam in
let output_bam = result_prefix ^ ".bam" in
Picard.mark_duplicates ~settings
~run_with:machine ~input_bam output_bam
| Bwa_mem (bwa_mem_config, fq_sample) ->
perform_aligner_parallelization
`Bwa_mem ~make_aligner:(fun pe -> Bwa_mem (bwa_mem_config, pe))
fq_sample
~make_workflow:(fun fastq ->
Bwa.mem_align_to_sam ~reference_build
~configuration:bwa_mem_config
~fastq ~result_prefix ~run_with:machine ()
|> Samtools.sam_to_bam ~reference_build ~run_with:machine)
| Bwa (bwa_config, what) ->
perform_aligner_parallelization
`Bwa ~make_aligner:(fun pe -> Bwa (bwa_config, pe)) what
~make_workflow:(fun fastq ->
Bwa.align_to_sam ~reference_build
~configuration:bwa_config
~fastq ~result_prefix ~run_with:machine ()
|> Samtools.sam_to_bam ~reference_build ~run_with:machine)
| Mosaik (what) ->
perform_aligner_parallelization
`Mosaik ~make_aligner:(fun pe -> Mosaik (pe)) what
~make_workflow:(fun fastq -> Mosaik.align ~reference_build
~fastq ~result_prefix ~run_with:machine ())
| Star (configuration, what) ->
perform_aligner_parallelization
`Star ~make_aligner:(fun pe -> Star (configuration, pe)) what
~make_workflow:(fun fastq ->
Star.align ~configuration ~reference_build
~fastq ~result_prefix ~run_with:machine ())
| Hisat (configuration, what) ->
perform_aligner_parallelization
`Hisat ~make_aligner:(fun pe -> Hisat (configuration, pe)) what
~make_workflow:(fun fastq ->
Hisat.align ~configuration ~reference_build
~fastq ~result_prefix ~run_with:machine ()
|> Samtools.sam_to_bam ~reference_build ~run_with:machine
)
| With_metadata (metadata_spec, p) ->
compile_aligner_step ~compiler p
|> apply_with_metadata ~metadata_spec
in
compiler.wrap_bam_node pipeline bam_node
let rec compile_bam_pair ~compiler (pipeline : bam_pair pipeline) :
[ `Normal of KEDSL.bam_file KEDSL.workflow_node ] *
[ `Tumor of KEDSL.bam_file KEDSL.workflow_node ] *
[ `Pipeline of bam_pair pipeline ]
=
let {reference_build; work_dir; machine; _} = compiler in
begin match pipeline with
| Bam_pair (
(Gatk_bqsr (n_bqsr_config, Gatk_indel_realigner (n_gir_conf, n_bam)))
,
(Gatk_bqsr (t_bqsr_config, Gatk_indel_realigner (t_gir_conf, t_bam)))
)
when
has_option compiler
(function `Multi_sample_indel_realignment _ -> true | _ -> false)
&& n_gir_conf = t_gir_conf ->
let normal = compile_aligner_step ~compiler n_bam in
let tumor = compile_aligner_step ~compiler t_bam in
let bam_list_node =
if has_option compiler ((=) (`Map_reduce `Gatk_indel_realigner))
then (
Gatk.indel_realigner_map_reduce ~run_with:machine ~compress:false
~configuration:n_gir_conf (KEDSL.Bam_workflow_list [normal; tumor])
) else
Gatk.indel_realigner ~run_with:machine ~compress:false
~configuration:n_gir_conf (KEDSL.Bam_workflow_list [normal; tumor])
in
begin match KEDSL.explode_bam_list_node bam_list_node with
| [realigned_normal; realigned_tumor] ->
let new_pipeline =
Bam_pair (
Gatk_bqsr (n_bqsr_config,
Bam_sample (Filename.chop_extension realigned_normal#product#path,
realigned_normal)),
Gatk_bqsr (t_bqsr_config,
Bam_sample (Filename.chop_extension realigned_tumor#product#path,
realigned_tumor)))
in
compile_bam_pair ~compiler new_pipeline
| other ->
failwithf "Gatk.indel_realigner did not return the correct list of length 2 (tumor, normal): it gave %d bams"
(List.length other)
end
| Bam_pair ( Gatk_bqsr (_, Gatk_indel_realigner (_, _)),
Gatk_bqsr (_, Gatk_indel_realigner (_, _))) as bam_pair
when
has_option compiler
((=) (`Multi_sample_indel_realignment `Fail_if_not_happening)) ->
failwithf "Option (`Multi_sample_indel_realignment `Fail_if_not_happening) is set and this pipeline does not qualify:\n%s"
(to_json bam_pair |> Yojson.Basic.pretty_to_string)
| Bam_pair (normal_t, tumor_t) as final_pipeline ->
let normal = compile_aligner_step ~compiler normal_t in
let tumor = compile_aligner_step ~compiler tumor_t in
(`Normal normal, `Tumor tumor, `Pipeline final_pipeline)
| With_metadata (metadata_spec, p) ->
let `Normal normal, `Tumor tumor, `Pipeline p =
compile_bam_pair ~compiler p in
(`Normal (apply_with_metadata ~metadata_spec normal),
`Tumor (apply_with_metadata ~metadata_spec tumor),
`Pipeline p)
end
let rec compile_variant_caller_step ~compiler (pipeline: vcf pipeline) =
let {reference_build; work_dir; machine; _} = compiler in
let vcf_node =
match pipeline with
| Somatic_variant_caller (som_vc, bam_pair) ->
let (`Normal normal, `Tumor tumor, `Pipeline new_bam_pair) =
compile_bam_pair ~compiler bam_pair in
let result_prefix =
work_dir
// to_file_prefix (Somatic_variant_caller (som_vc, new_bam_pair)) in
dbg "Result_Prefix: %S" result_prefix;
som_vc.Variant_caller.make_target
~run_with:machine ~input:(Variant_caller.Somatic {normal; tumor})
~result_prefix ()
| Germline_variant_caller (gvc, bam) ->
let result_prefix = work_dir // to_file_prefix pipeline in
dbg "Result_Prefix: %S" result_prefix;
let input_bam = compile_aligner_step ~compiler bam in
gvc.Variant_caller.make_target
~run_with:machine ~input:(Variant_caller.Germline input_bam)
~result_prefix ()
| With_metadata (metadata_spec, p) ->
compile_variant_caller_step ~compiler p
|> apply_with_metadata ~metadata_spec
in
compiler.wrap_vcf_node pipeline vcf_node
let rec compile_gtf_step ~compiler (pipeline: gtf pipeline) =
let {reference_build; work_dir; machine; _} = compiler in
let result_prefix = work_dir // to_file_prefix pipeline in
dbg "Result_Prefix: %S" result_prefix;
let gtf_node =
match pipeline with
| Stringtie (configuration, bam) ->
let bam = compile_aligner_step ~compiler bam in
Stringtie.run ~configuration
~bam ~result_prefix ~run_with:machine ()
| With_metadata (metadata_spec, p) ->
compile_gtf_step ~compiler p
|> apply_with_metadata ~metadata_spec
in
compiler.wrap_gtf_node pipeline gtf_node
let rec seq2hla_hla_types_step ~compiler (pipeline : seq2hla_hla_types pipeline) =
let { machine ; work_dir; _ } = compiler in
match pipeline with
| Seq2HLA sample ->
let work_dir = work_dir // (to_file_prefix pipeline) ^ "_work_dir" in
let fastq_pair = fastq_sample_step ~compiler sample in
let sample_name = fastq_pair#product#sample_name in
let r1 = fastq_pair#product#r1 in
let r2 = match fastq_pair#product#r2 with
| Some r2 -> r2
| _ -> failwithf "Seq2HLA doesn't support Single_end_sample(s)."
in
Seq2HLA.hla_type
~work_dir ~run_with:machine ~run_name:sample_name ~r1 ~r2
| With_metadata (metadata_spec, p) ->
seq2hla_hla_types_step ~compiler p
|> apply_with_metadata ~metadata_spec
let rec optitype_hla_types_step ~compiler (pipeline : optitype_hla_types pipeline) =
let { machine ; work_dir; _ } = compiler in
match pipeline with
| Optitype (kind, sample) ->
let work_dir = work_dir // (to_file_prefix pipeline) ^ "_work_dir" in
let fastq = fastq_sample_step ~compiler sample in
let sample_name = fastq#product#sample_name in
Optitype.hla_type
~work_dir ~run_with:machine ~run_name:sample_name ~fastq kind
| With_metadata (metadata_spec, p) ->
optitype_hla_types_step ~compiler p
|> apply_with_metadata ~metadata_spec
end
end
module Pipeline_library
= struct
open Biokepi_run_environment.Common
module Input = struct
type t =
| Fastq of fastq
and fastq = {
sample_name : string;
files : (string option * fastq_data) list;
}
and fastq_data =
| PE of string * string
| SE of string
| Of_bam of [ `PE | `SE ] * [ `Coordinate | `Read_name ] option * string * string
let pe ?fragment_id a b = (fragment_id, PE (a, b))
let se ?fragment_id a = (fragment_id, SE a)
let of_bam ?fragment_id ?sorted ~reference_build how s =
(fragment_id, Of_bam (how, sorted, reference_build, s))
let fastq_sample ~sample_name files = Fastq {sample_name; files}
end
module Make (Bfx : Semantics.Bioinformatics_base) = struct
let fastq_of_files ~sample_name ?fragment_id ~r1 ?r2 () =
let is_gz r =
Filename.check_suffix r1 ".gz" || Filename.check_suffix r1 ".fqz"
in
match is_gz r1, is_gz r2 with
| true, true ->
Bfx.(fastq_gz ~sample_name ?fragment_id ~r1 ?r2 () |> gunzip)
| false, false ->
Bfx.(fastq ~sample_name ?fragment_id ~r1 ?r2 ())
| _ ->
failwithf "fastq_of_files: cannot handle mixed gzipped and non-gzipped fastq pairs (for a given same fragment)"
let fastq_of_input u =
let open Input in
match u with
| Fastq {sample_name; files} ->
List.map files ~f:(fun (fragment_id, source) ->
match source with
| PE (r1, r2) ->
fastq_of_files ~sample_name ?fragment_id ~r1 ~r2 ()
| SE r ->
fastq_of_files ~sample_name ?fragment_id ~r1:r ()
| Of_bam (how, sorting, reference_build, path) ->
let bam = Bfx.bam ~path ?sorting ~reference_build () in
Bfx.bam_to_fastq ~sample_name ?fragment_id how bam
)
|> Bfx.list
end
end
module Semantics
= struct
module type Lambda_calculus = sig
type 'a repr
val lambda : ('a repr -> 'b repr) -> ('a -> 'b) repr
val apply : ('a -> 'b) repr -> 'a repr -> 'b repr
type 'a observation
val observe : (unit -> 'a repr) -> 'a observation
end
module type Lambda_with_list_operations = sig
include Lambda_calculus
val list: ('a repr) list -> 'a list repr
val list_map: ('a list repr) -> f:('a -> 'b) repr -> ('b list repr)
end
module type Bioinformatics_base = sig
include Lambda_with_list_operations
open Biokepi_bfx_tools
val pair: 'a repr -> 'b repr -> ('a * 'b) repr
val pair_first: ('a * 'b) repr -> 'a repr
val pair_second: ('a * 'b) repr -> 'b repr
val to_unit: 'a repr -> unit repr
val fastq :
sample_name : string ->
?fragment_id : string ->
r1: string ->
?r2: string ->
unit -> [ `Fastq ] repr
val fastq_gz:
sample_name : string ->
?fragment_id : string ->
r1: string -> ?r2: string ->
unit -> [ `Gz of [ `Fastq ] ] repr
val bam :
path : string ->
?sorting: [ `Coordinate | `Read_name ] ->
reference_build: string ->
unit -> [ `Bam ] repr
val gunzip: [ `Gz of 'a] repr -> 'a repr
val gunzip_concat: ([ `Gz of 'a] list) repr -> 'a repr
val concat: ('a list) repr -> 'a repr
val merge_bams: ([ `Bam ] list) repr -> [ `Bam ] repr
val bam_to_fastq:
sample_name : string ->
?fragment_id : string ->
[ `SE | `PE ] ->
[ `Bam ] repr ->
[ `Fastq ] repr
val bwa_aln:
?configuration: Biokepi_bfx_tools.Bwa.Configuration.Aln.t ->
reference_build: Biokepi_run_environment.Reference_genome.name ->
[ `Fastq ] repr ->
[ `Bam ] repr
val bwa_mem:
?configuration: Biokepi_bfx_tools.Bwa.Configuration.Mem.t ->
reference_build: Biokepi_run_environment.Reference_genome.name ->
[ `Fastq ] repr ->
[ `Bam ] repr
val star:
?configuration: Star.Configuration.Align.t ->
reference_build: Biokepi_run_environment.Reference_genome.name ->
[ `Fastq ] repr ->
[ `Bam ] repr
val hisat:
?configuration: Hisat.Configuration.t ->
reference_build: Biokepi_run_environment.Reference_genome.name ->
[ `Fastq ] repr ->
[ `Bam ] repr
val mosaik:
reference_build: Biokepi_run_environment.Reference_genome.name ->
[ `Fastq ] repr ->
[ `Bam ] repr
val stringtie:
?configuration: Stringtie.Configuration.t ->
[ `Bam ] repr ->
[ `Gtf ] repr
val gatk_indel_realigner:
?configuration : Gatk.Configuration.indel_realigner ->
[ `Bam ] repr ->
[ `Bam ] repr
val gatk_indel_realigner_joint:
?configuration : Gatk.Configuration.indel_realigner ->
([ `Bam ] * [ `Bam ]) repr ->
([ `Bam ] * [ `Bam ]) repr
val picard_mark_duplicates:
?configuration : Picard.Mark_duplicates_settings.t ->
[ `Bam ] repr ->
[ `Bam ] repr
val gatk_bqsr:
?configuration : Gatk.Configuration.bqsr ->
[ `Bam ] repr ->
[ `Bam ] repr
val seq2hla:
[ `Fastq ] repr ->
[ `Seq2hla_result ] repr
val optitype:
[`DNA | `RNA] ->
[ `Fastq ] repr ->
[ `Optitype_result ] repr
val gatk_haplotype_caller:
[ `Bam ] repr ->
[ `Vcf ] repr
val mutect:
?configuration: Biokepi_bfx_tools.Mutect.Configuration.t ->
normal: [ `Bam ] repr ->
tumor: [ `Bam ] repr ->
unit ->
[ `Vcf ] repr
val mutect2:
?configuration: Gatk.Configuration.Mutect2.t ->
normal: [ `Bam ] repr ->
tumor: [ `Bam ] repr ->
unit ->
[ `Vcf ] repr
val somaticsniper:
?configuration: Somaticsniper.Configuration.t ->
normal: [ `Bam ] repr ->
tumor: [ `Bam ] repr ->
unit ->
[ `Vcf ] repr
val varscan_somatic:
?adjust_mapq : int ->
normal: [ `Bam ] repr ->
tumor: [ `Bam ] repr ->
unit ->
[ `Vcf ] repr
val strelka:
?configuration: Strelka.Configuration.t ->
normal: [ `Bam ] repr ->
tumor: [ `Bam ] repr ->
unit ->
[ `Vcf ] repr
val virmid:
?configuration: Virmid.Configuration.t ->
normal: [ `Bam ] repr ->
tumor: [ `Bam ] repr ->
unit ->
[ `Vcf ] repr
val muse:
?configuration: Muse.Configuration.t ->
normal: [ `Bam ] repr ->
tumor: [ `Bam ] repr ->
unit ->
[ `Vcf ] repr
end
end
module To_display
: sig
include
Semantics.Bioinformatics_base
with type 'a repr = var_count: int -> SmartPrint.t
and
type 'a observation = SmartPrint.t
end
= struct
open Nonstd
module Tools = Biokepi_bfx_tools
module SP = SmartPrint
module OCaml = SmartPrint.OCaml
let entity sp = SP.(nest (parens (sp)))
type 'a repr = var_count: int -> SP.t
type 'a observation = SP.t
let lambda f =
fun ~var_count ->
let var_name = sprintf "var%d" var_count in
let var_repr = fun ~var_count -> SP.string var_name in
let applied = f var_repr ~var_count:(var_count + 1) in
entity SP.(string "λ" ^^ string var_name ^^ string "→" ^^ applied)
let apply f v =
fun ~var_count ->
entity (SP.separate SP.space [f ~var_count; v ~var_count])
let observe f = f () ~var_count:0
let to_unit x = fun ~var_count -> entity SP.(x ~var_count ^^ string ":> unit")
let list l =
fun ~var_count ->
SP.nest (OCaml.list (fun a -> a ~var_count) l)
let list_map l ~f =
let open SP in
fun ~var_count ->
entity (
string "List.map"
^^ nest (string "~f:" ^^ f ~var_count)
^^ l ~var_count
)
include To_json.Make_serializer (struct
type t = SP.t
let input_value name kv =
let open SP in
fun ~var_count ->
entity (
ksprintf string "input-%s" name
^^ (OCaml.list
(fun (k, v) -> parens (string k ^-^ string ":" ^^ string v))
kv)
)
let function_call name params =
let open SP in
entity (
ksprintf string "%s" name
^^ (OCaml.list
(fun (k, v) -> parens (string k ^-^ string ":" ^^ v))
params)
)
let string = SP.string
end)
end
module To_dot
= struct
open Nonstd
let failwithf fmt = ksprintf failwith fmt
module Tree = struct
type box = { id: string; name : string; attributes: (string * string) list}
type arrow = {
label: string;
points_to: t
} and t = [
| `Variable of string * string
| `Lambda of string * string * t
| `Apply of string * t * t
| `String of string
| `Input_value of box
| `Node of box * arrow list
]
let node_count = ref 0
let id_style = `Hash
let make_id a =
match a, id_style with
| _, `Unique
| `Unique, _ ->
incr node_count; sprintf "node%03d" !node_count
| `Of v, `Hash ->
Hashtbl.hash v |> sprintf "id%d"
let arrow label points_to = {label; points_to}
let variable id name = `Variable (make_id (`Of id), name)
let lambda varname expr = `Lambda (make_id (`Of (varname, expr)), varname, expr)
let apply f v = `Apply (make_id (`Of (f,v)), f, v)
let string s = `String s
let node ?id ?(a = []) name l : t =
let id =
match id with
| Some i -> i
| None -> make_id (`Of (name, a, l))
in
`Node ({id; name; attributes = a}, l)
let input_value ?(a = []) name : t =
let id = make_id (`Of (name, a)) in
`Input_value {id; name; attributes = a}
let to_dot t =
let open SmartPrint in
let semicolon = string ";" in
let sentence sp = sp ^-^ semicolon ^-^ newline in
let dot_attributes l =
brakets (
List.map l ~f:(fun (k, v) ->
string k ^^ string "=" ^^ v) |> separate (semicolon)
) in
let in_quotes s = ksprintf string "\"%s\"" s in
let label_attribute lab = ("label", in_quotes lab) in
let font_name `Mono =
("fontname", in_quotes "monospace") in
let font_size =
function
| `Small -> ("fontsize", in_quotes "12")
| `Default -> ("fontsize", in_quotes "16")
in
let dot_arrow src dest = string src ^^ string "->" ^^ string dest in
let id_of =
function
| `Lambda (id, _, _) -> id
| `Apply (id, _, _) -> id
| `Variable (id, _) -> id
| `String s -> assert false
| `Input_value {id; _} -> id
| `Node ({id; _}, _) -> id in
let label name attributes =
match attributes with
| [] -> name
| _ ->
sprintf "{<f0>%s |<f1> %s\\l }" name
(List.map attributes ~f:(fun (k,v) -> sprintf "%s: %s" k v)
|> String.concat "\\l")
in
let one o = [o] in
let rec go =
function
| `Variable (_, s) as v ->
let id = id_of v in
sentence (
string id ^^ dot_attributes [
label_attribute (label s []);
font_name `Mono;
"shape", in_quotes "hexagon";
]
)
|> one
| `Lambda (id, v, expr) ->
[
string "subgraph" ^^ ksprintf string "cluster_%s" id ^^ braces (
(
sentence (string "color=grey")
:: sentence (string "style=rounded")
:: sentence (string "penwidth=4")
:: go (node ~id (sprintf "Lambda %s" v) [arrow "Expr" expr])
)
|> List.dedup |> separate empty
) ^-^ newline
]
| `Apply (id, f, v) ->
go (node ~id "Apply F(X)" [arrow "F" f; arrow "X" v])
| `String s ->
failwithf "`String %S -> should have been eliminated" s
| `Input_value {id; name; attributes} ->
sentence (
string id ^^ dot_attributes [
label_attribute (label name attributes);
font_name `Mono;
"shape", in_quotes "Mrecord";
]
)
|> one
| `Node ({id; name; attributes}, trees) ->
sentence (
string id ^^ dot_attributes [
label_attribute (label name attributes);
font_name `Mono;
"shape", in_quotes "Mrecord";
]
)
:: List.concat_map trees ~f:(fun {label; points_to} ->
sentence (
dot_arrow (id_of points_to) id ^^ dot_attributes [
label_attribute label;
font_size `Small;
]
)
:: go points_to
)
in
let dot =
string "digraph target_graph" ^^ braces (nest (
sentence (
string "graph" ^^ dot_attributes [
"rankdir", in_quotes "LR";
font_size `Default;
]
)
^-^ (go t |> List.dedup |> separate empty)
))
in
dot
end
type 'a repr = var_count: int -> Tree.t
type 'a observation = SmartPrint.t
let lambda f =
fun ~var_count ->
let var_name = sprintf "Var_%d" var_count in
let var_repr_fake = fun ~var_count -> Tree.string var_name in
let applied_once = f var_repr_fake ~var_count:(var_count + 1) in
let var_repr = fun ~var_count -> Tree.variable applied_once var_name in
let applied = f var_repr ~var_count:(var_count + 1) in
Tree.lambda var_name applied
let apply f v =
fun ~var_count ->
let func = f ~var_count in
let arg = v ~var_count in
Tree.apply func arg
let observe f = f () ~var_count:0 |> Tree.to_dot
let to_unit x = x
let list l =
fun ~var_count ->
Tree.node "List.make"
(List.mapi ~f:(fun i a -> Tree.arrow (sprintf "L%d" i) (a ~var_count)) l)
let list_map l ~f =
fun ~var_count ->
Tree.node "List.map" [
Tree.arrow "list" (l ~var_count);
Tree.arrow "function" (f ~var_count);
]
include To_json.Make_serializer (struct
type t = Tree.t
let input_value name a ~var_count =
Tree.input_value name ~a
let function_call name params =
let a, arrows =
List.partition_map params ~f:(fun (k, v) ->
match v with
| `String s -> `Fst (k, s)
| _ -> `Snd (k, v)
) in
Tree.node ~a name (List.map ~f:(fun (k,v) -> Tree.arrow k v) arrows)
let string s = Tree.string s
end)
end
module To_json
= struct
open Nonstd
module Tools = Biokepi_bfx_tools
type json = Yojson.Basic.json
type 'a repr = var_count : int -> json
type 'a observation = json
let lambda f =
fun ~var_count ->
let var_name = sprintf "var%d" var_count in
let var_repr = fun ~var_count -> `String var_name in
let applied = f var_repr ~var_count:(var_count + 1) in
`Assoc [
"lambda", `String var_name;
"body", applied;
]
let apply f v =
fun ~var_count ->
let func = f ~var_count in
let arg = v ~var_count in
`Assoc [
"function", func;
"argument", arg;
]
let observe f = f () ~var_count:0
let to_unit j = j
let list l =
fun ~var_count ->
`List (List.map ~f:(fun a -> a ~var_count) l)
let list_map l ~f =
fun ~var_count ->
`Assoc [
"list-map", f ~var_count;
"argument", l ~var_count;
]
module Make_serializer (How : sig
type t
val input_value :
string -> (string * string) list -> var_count:int -> t
val function_call :
string -> (string * t) list -> t
val string : string -> t
end) = struct
open How
let fastq
~sample_name ?fragment_id ~r1 ?r2 () =
input_value "fastq" [
"sample_name", sample_name;
"fragment_id", Option.value ~default:"NONE" fragment_id;
"R1", r1;
"R2", Option.value ~default:"NONE" r2;
]
let fastq_gz
~sample_name ?fragment_id ~r1 ?r2 () =
input_value "fastq.gz" [
"sample_name", sample_name;
"fragment_id", Option.value ~default:"NONE" fragment_id;
"R1", r1;
"R2", Option.value ~default:"NONE" r2;
]
let bam ~path ?sorting ~reference_build () =
input_value "bam" [
"path", path;
"sorting",
Option.value_map ~default:"NONE" sorting
~f:(function `Coordinate -> "Coordinate" | `Read_name -> "Read-name");
"reference_build", reference_build;
]
let pair a b ~(var_count : int) =
function_call "make-pair" [
"first", a ~var_count;
"second", b ~var_count;
]
let pair_first p ~(var_count : int) =
function_call "pair-first" [
"pair", p ~var_count;
]
let pair_second p ~(var_count : int) =
function_call "pair-second" [
"pair", p ~var_count;
]
let aligner name conf_name ~reference_build fq : var_count: int -> How.t =
fun ~var_count ->
let fq_compiled = fq ~var_count in
function_call name [
"configuration", string conf_name;
"reference_build", string reference_build;
"input", fq_compiled;
]
let one_to_one name conf_name bam =
fun ~(var_count : int) ->
let bamc = bam ~var_count in
function_call name [
"configuration", string conf_name;
"input", bamc;
]
let bwa_aln
?(configuration = Tools.Bwa.Configuration.Aln.default) =
aligner "bwa-aln" (Tools.Bwa.Configuration.Aln.name configuration)
let bwa_mem
?(configuration = Tools.Bwa.Configuration.Mem.default) =
aligner "bwa-mem" (Tools.Bwa.Configuration.Mem.name configuration)
let gunzip gz ~(var_count : int) =
function_call "gunzip" ["input", gz ~var_count]
let gunzip_concat gzl ~(var_count : int) =
function_call "gunzip-concat" ["input-list", gzl ~var_count]
let concat l ~(var_count : int) =
function_call "concat" ["input-list", l ~var_count]
let merge_bams bl ~(var_count : int) =
function_call "merge-bams" ["input-list", bl ~var_count]
let star ?(configuration = Tools.Star.Configuration.Align.default) =
aligner "star" (Tools.Star.Configuration.Align.name configuration)
let hisat ?(configuration = Tools.Hisat.Configuration.default_v1) =
aligner "hisat" (configuration.Tools.Hisat.Configuration.name)
let mosaik =
aligner "mosaik" "default"
let stringtie ?(configuration = Tools.Stringtie.Configuration.default) =
one_to_one "stringtie" configuration.Tools.Stringtie.Configuration.name
let indel_real_config (indel, target) =
(sprintf "I%s-TC%s"
indel.Tools.Gatk.Configuration.Indel_realigner.name
target.Tools.Gatk.Configuration.Realigner_target_creator.name)
let gatk_indel_realigner
?(configuration = Tools.Gatk.Configuration.default_indel_realigner) =
one_to_one "gatk_indel_realigner" (indel_real_config configuration)
let gatk_indel_realigner_joint
?(configuration = Tools.Gatk.Configuration.default_indel_realigner) =
one_to_one "gatk_indel_realigner_joint" (indel_real_config configuration)
let picard_mark_duplicates
?(configuration = Tools.Picard.Mark_duplicates_settings.default) =
one_to_one
"picard_mark_duplicates"
configuration.Tools.Picard.Mark_duplicates_settings.name
let gatk_bqsr ?(configuration = Tools.Gatk.Configuration.default_bqsr) =
let (bqsr, preads) = configuration in
one_to_one "gatk_bqsr"
(sprintf "B%s-PR%s"
bqsr.Tools.Gatk.Configuration.Bqsr.name
preads.Tools.Gatk.Configuration.Print_reads.name)
let seq2hla =
one_to_one "seq2hla" "default"
let optitype how =
one_to_one "optitype" (match how with `DNA -> "DNA" | `RNA -> "RNA")
let gatk_haplotype_caller =
one_to_one "gatk_haplotype_caller" "default"
let bam_to_fastq ~sample_name ?fragment_id se_or_pe bam =
fun ~(var_count : int) ->
let bamc = bam ~var_count in
function_call "bam_to_fastq" [
"sample_name", string sample_name;
"fragment_id",
Option.value_map ~f:string ~default:(string "N/A") fragment_id;
"endness", string (
match se_or_pe with
| `SE -> "SE"
| `PE -> "PE"
);
"input", bamc;
]
let variant_caller name conf_name ~normal ~tumor () ~(var_count : int) =
function_call name [
"configuration", string conf_name;
"normal", normal ~var_count;
"tumor", tumor ~var_count;
]
let mutect ?(configuration = Tools.Mutect.Configuration.default) =
variant_caller "mutect"
configuration.Tools.Mutect.Configuration.name
let mutect2 ?(configuration = Tools.Gatk.Configuration.Mutect2.default) =
variant_caller "mutect2"
configuration.Tools.Gatk.Configuration.Mutect2.name
let somaticsniper ?(configuration = Tools.Somaticsniper.Configuration.default) =
variant_caller "somaticsniper"
configuration.Tools.Somaticsniper.Configuration.name
let strelka ?(configuration = Tools.Strelka.Configuration.default) =
variant_caller "strelka"
configuration.Tools.Strelka.Configuration.name
let varscan_somatic ?adjust_mapq =
variant_caller "varscan_somatic"
(Option.value_map ~f:(sprintf "amap%d") ~default:"default" adjust_mapq)
let muse ?(configuration = Tools.Muse.Configuration.default `WGS) =
variant_caller "muse"
configuration.Tools.Muse.Configuration.name
let virmid ?(configuration = Tools.Virmid.Configuration.default) =
variant_caller "virmid"
configuration.Tools.Virmid.Configuration.name
end
include Make_serializer (struct
type t = json
let input_value name kv =
fun ~var_count ->
`Assoc (
("input-value", `String name)
:: List.map kv ~f:(fun (k, v) -> k, `String v)
)
let function_call name params =
`Assoc ( ("function-call", `String name) :: params)
let string s = `String s
end)
end
module To_workflow
= struct
open Nonstd
let (//) = Filename.concat
module File_type_specification = struct
open Biokepi_run_environment.Common.KEDSL
type 'a t = ..
type 'a t +=
| To_unit: 'a t -> unit t
| Fastq: fastq_reads workflow_node -> [ `Fastq ] t
| Bam: bam_file workflow_node -> [ `Bam ] t
| Vcf: single_file workflow_node -> [ `Vcf ] t
| Gtf: single_file workflow_node -> [ `Gtf ] t
| Seq2hla_result: list_of_files workflow_node -> [ `Seq2hla_result ] t
| Optitype_result: unknown_product workflow_node -> [ `Optitype_result ] t
| Gz: 'a t -> [ `Gz of 'a ] t
| List: 'a t list -> 'a list t
| Pair: 'a t * 'b t -> ('a * 'b) t
| Lambda: ('a t -> 'b t) -> ('a -> 'b) t
let rec to_string : type a. a t -> string =
function
| To_unit a -> sprintf "(to_unit %s)" (to_string a)
| Fastq _ -> "Fastq"
| Bam _ -> "Bam"
| Vcf _ -> "Vcf"
| Gtf _ -> "Gtf"
| Seq2hla_result _ -> "Seq2hla_result"
| Optitype_result _ -> "Optitype_result"
| Gz a -> sprintf "(gz %s)" (to_string a)
| List l -> sprintf "[%s]" (List.map l ~f:to_string |> String.concat "; ")
| Pair (a, b) -> sprintf "(%s, %s)" (to_string a) (to_string b)
| Lambda _ -> "--LAMBDA--"
| _ -> "##UNKNOWN##"
let fail_get other name =
ksprintf failwith "Error while extracting File_type_specification.t (%s case, in %s), this usually means that the type has been wrongly extended, and provides more than one [ `%s ] t case" name (to_string other) name
let get_fastq : [ `Fastq ] t -> fastq_reads workflow_node = function
| Fastq b -> b
| o -> fail_get o "Fastq"
let get_bam : [ `Bam ] t -> bam_file workflow_node = function
| Bam b -> b
| o -> fail_get o "Bam"
let get_vcf : [ `Vcf ] t -> single_file workflow_node = function
| Vcf v -> v
| o -> fail_get o "Vcf"
let get_gtf : [ `Gtf ] t -> single_file workflow_node = function
| Gtf v -> v
| o -> fail_get o "Gtf"
let get_seq2hla_result : [ `Seq2hla_result ] t -> list_of_files workflow_node =
function
| Seq2hla_result v -> v
| o -> fail_get o "Seq2hla_result"
let get_optitype_result : [ `Optitype_result ] t -> unknown_product workflow_node =
function
| Optitype_result v -> v
| o -> fail_get o "Optitype_result"
let get_gz : [ `Gz of 'a ] t -> 'a t = function
| Gz v -> v
| o -> fail_get o "Gz"
let get_list : 'a list t -> 'a t list = function
| List v -> v
| o -> fail_get o "List"
let pair a b = Pair (a, b)
let pair_first =
function
| Pair (a, _) -> a
| other -> fail_get other "Pair"
let pair_second =
function
| Pair (_, b) -> b
| other -> fail_get other "Pair"
let rec as_dependency_edges : type a. a t -> workflow_edge list =
let one_depends_on wf = [depends_on wf] in
function
| To_unit v -> as_dependency_edges v
| Fastq wf -> one_depends_on wf
| Bam wf -> one_depends_on wf
| Vcf wf -> one_depends_on wf
| Gtf wf -> one_depends_on wf
| Seq2hla_result wf -> one_depends_on wf
| Optitype_result wf -> one_depends_on wf
| List l -> List.concat_map l ~f:as_dependency_edges
| Pair (a, b) -> as_dependency_edges a @ as_dependency_edges b
| other -> fail_get other "as_dependency_edges"
let get_unit_workflow :
name: string ->
unit t ->
unknown_product workflow_node =
fun ~name f ->
match f with
| To_unit v ->
workflow_node without_product
~name ~edges:(as_dependency_edges v)
| other -> fail_get other "get_unit_workflow"
end
open Biokepi_run_environment
module type Compiler_configuration = sig
val work_dir: string
val machine : Machine.t
val map_reduce_gatk_indel_realigner : bool
end
module Defaults = struct
let map_reduce_gatk_indel_realigner = true
end
module Make (Config : Compiler_configuration)
: Semantics.Bioinformatics_base
with type 'a repr = 'a File_type_specification.t and
type 'a observation = 'a File_type_specification.t
= struct
include File_type_specification
module Tools = Biokepi_bfx_tools
module KEDSL = Common.KEDSL
let failf fmt =
ksprintf failwith fmt
type 'a repr = 'a t
type 'a observation = 'a repr
let observe : (unit -> 'a repr) -> 'a observation = fun f -> f ()
let lambda : ('a repr -> 'b repr) -> ('a -> 'b) repr = fun f ->
Lambda f
let apply : ('a -> 'b) repr -> 'a repr -> 'b repr = fun f_repr x ->
match f_repr with
| Lambda f -> f x
| _ -> assert false
let list : 'a repr list -> 'a list repr = fun l -> List l
let list_map : 'a list repr -> f:('a -> 'b) repr -> 'b list repr = fun l ~f ->
match l with
| List l ->
List (List.map ~f:(fun v -> apply f v) l)
| _ -> assert false
let to_unit x = To_unit x
let host = Machine.as_host Config.machine
let run_with = Config.machine
let fastq
~sample_name ?fragment_id ~r1 ?r2 () =
Fastq (
KEDSL.workflow_node (KEDSL.fastq_reads ~host ~name:sample_name r1 r2)
~name:(sprintf "Input-fastq: %s (%s)" sample_name
(Option.value fragment_id ~default:(Filename.basename r1)))
)
let fastq_gz
~sample_name ?fragment_id ~r1 ?r2 () =
Gz (
Fastq (
KEDSL.workflow_node (KEDSL.fastq_reads ~host ~name:sample_name r1 r2)
~name:(sprintf "Input-fastq-gz: %s (%s)" sample_name
(Option.value fragment_id ~default:(Filename.basename r1)))
)
)
let bam ~path ?sorting ~reference_build () =
Bam (
KEDSL.workflow_node
(KEDSL.bam_file ~host ?sorting ~reference_build path)
~name:(sprintf "Input-bam: %s" (Filename.basename path))
)
let make_aligner
name ~make_workflow ~config_name
~configuration ~reference_build fastq =
let freads = get_fastq fastq in
let result_prefix =
Config.work_dir //
sprintf "%s-%s_%s-%s"
freads#product#escaped_sample_name
(Option.value freads#product#fragment_id ~default:"")
name
(config_name configuration)
in
Bam (
make_workflow
~reference_build ~configuration ~result_prefix ~run_with freads
)
let bwa_aln
?(configuration = Tools.Bwa.Configuration.Aln.default) =
make_aligner "bwaaln" ~configuration
~config_name:Tools.Bwa.Configuration.Aln.name
~make_workflow:(
fun
~reference_build
~configuration ~result_prefix ~run_with freads ->
Tools.Bwa.align_to_sam
~reference_build ~configuration ~fastq:freads
~result_prefix ~run_with ()
|> Tools.Samtools.sam_to_bam ~reference_build ~run_with
)
let bwa_mem
?(configuration = Tools.Bwa.Configuration.Mem.default) =
make_aligner "bwamem" ~configuration
~config_name:Tools.Bwa.Configuration.Mem.name
~make_workflow:(
fun
~reference_build
~configuration ~result_prefix ~run_with freads ->
Tools.Bwa.mem_align_to_sam
~reference_build ~configuration ~fastq:freads
~result_prefix ~run_with ()
|> Tools.Samtools.sam_to_bam ~reference_build ~run_with
)
let star
?(configuration = Tools.Star.Configuration.Align.default) =
make_aligner "star"
~configuration
~config_name:Tools.Star.Configuration.Align.name
~make_workflow:(
fun ~reference_build ~configuration ~result_prefix ~run_with fastq ->
Tools.Star.align ~configuration ~reference_build
~fastq ~result_prefix ~run_with ()
)
let hisat
?(configuration = Tools.Hisat.Configuration.default_v1) =
make_aligner "hisat"
~configuration
~config_name:Tools.Hisat.Configuration.name
~make_workflow:(
fun ~reference_build ~configuration ~result_prefix ~run_with fastq ->
Tools.Hisat.align ~configuration ~reference_build
~fastq ~result_prefix ~run_with ()
|> Tools.Samtools.sam_to_bam ~reference_build ~run_with
)
let mosaik =
make_aligner "mosaik"
~configuration:()
~config_name:(fun _ -> "default")
~make_workflow:(
fun ~reference_build ~configuration ~result_prefix ~run_with fastq ->
Tools.Mosaik.align ~reference_build
~fastq ~result_prefix ~run_with ())
let gunzip: type a. [ `Gz of a ] t -> a t = fun gz ->
let inside = get_gz gz in
begin match inside with
| Fastq f ->
let make_result_path read =
let base = Filename.basename read in
Config.work_dir //
(match base with
| fastqgz when Filename.check_suffix base ".fastq.gz" ->
Filename.chop_suffix base ".gz"
| fqz when Filename.check_suffix base ".fqz" ->
Filename.chop_suffix base ".fqz" ^ ".fastq"
| other ->
ksprintf failwith "To_workflow.gunzip: cannot recognize Gz-Fastq extension: %S" other)
in
let gunzip read =
let result_path = make_result_path read#product#path in
Workflow_utilities.Gunzip.concat
~run_with [read] ~result_path in
let fastq_r1 = gunzip f#product#r1 in
let fastq_r2 = Option.map f#product#r2 ~f:gunzip in
Fastq (
KEDSL.fastq_node_of_single_file_nodes ~host
~name:f#product#sample_name
?fragment_id:f#product#fragment_id
fastq_r1 fastq_r2
)
| other ->
ksprintf failwith "To_workflow.gunzip: non-FASTQ input not implemented"
end
let gunzip_concat gzl =
ksprintf failwith "To_workflow.gunzip_concat: not implemented"
let concat : type a. a list t -> a t =
fun l ->
let l = get_list l in
begin match l with
| Fastq first_fastq :: _ as lfq ->
let fqs = List.map lfq ~f:get_fastq in
let r1s = List.map fqs ~f:(fun f -> f#product#r1) in
let r2s = List.filter_map fqs ~f:(fun f -> f#product#r2) in
let concat_files ~read l =
let result_path =
Config.work_dir //
sprintf "%s-Read%d-Concat.fastq"
first_fastq#product#escaped_sample_name read in
Workflow_utilities.Cat.concat ~run_with l ~result_path in
let read_1 = concat_files r1s ~read:1 in
let read_2 =
match r2s with [] -> None | more -> Some (concat_files more ~read:2)
in
Fastq (
KEDSL.fastq_node_of_single_file_nodes ~host
~name:first_fastq#product#sample_name
~fragment_id:"edsl-concat"
read_1 read_2
)
| other ->
ksprintf failwith "To_workflow.concat: not implemented"
end
let merge_bams: [ `Bam ] list t -> [ `Bam ] t =
function
| List [ one_bam ] -> one_bam
| List bam_files ->
let bams = List.map bam_files ~f:get_bam in
let output_path =
let one = List.hd_exn bams in
Filename.chop_extension one#product#path
^ sprintf "-merged-%s.bam"
(List.map bams ~f:(fun bam -> bam#product#path)
|> String.concat ""
|> Digest.string |> Digest.to_hex)
in
Bam (Tools.Samtools.merge_bams ~run_with bams output_path)
| other ->
fail_get other "To_workflow.merge_bams: not a list of bams?"
let stringtie ?(configuration = Tools.Stringtie.Configuration.default) bamt =
let bam = get_bam bamt in
let result_prefix =
Filename.chop_extension bam#product#path
^ sprintf "_stringtie-%s"
(configuration.Tools.Stringtie.Configuration.name)
in
Gtf (Tools.Stringtie.run ~configuration ~bam ~result_prefix ~run_with ())
let indel_realigner_function:
type a. ?configuration: _ -> a KEDSL.bam_or_bams -> a =
fun
?(configuration = Tools.Gatk.Configuration.default_indel_realigner)
on_what ->
match Config.map_reduce_gatk_indel_realigner with
| true ->
Tools.Gatk.indel_realigner_map_reduce ~run_with ~compress:false
~configuration on_what
| false ->
Tools.Gatk.indel_realigner ~run_with ~compress:false
~configuration on_what
let gatk_indel_realigner ?configuration bam =
let input_bam = get_bam bam in
Bam (indel_realigner_function ?configuration (KEDSL.Single_bam input_bam))
let gatk_indel_realigner_joint ?configuration bam_pair =
let bam1 = bam_pair |> pair_first |> get_bam in
let bam2 = bam_pair |> pair_second |> get_bam in
let bam_list_node =
indel_realigner_function (KEDSL.Bam_workflow_list [bam1; bam2])
?configuration
in
begin match KEDSL.explode_bam_list_node bam_list_node with
| [realigned_normal; realigned_tumor] ->
pair (Bam realigned_normal) (Bam realigned_tumor)
| other ->
failf "Gatk.indel_realigner did not return the correct list of length 2 (tumor, normal): it gave %d bams"
(List.length other)
end
let picard_mark_duplicates
?(configuration = Tools.Picard.Mark_duplicates_settings.default) bam =
let input_bam = get_bam bam in
let output_bam =
Filename.chop_extension input_bam#product#path ^ "_markdup.bam" in
Bam (
Tools.Picard.mark_duplicates ~settings:configuration
~run_with ~input_bam output_bam
)
let gatk_bqsr ?(configuration = Tools.Gatk.Configuration.default_bqsr) bam =
let input_bam = get_bam bam in
let output_bam =
let (bqsr, preads) = configuration in
Filename.chop_extension input_bam#product#path
^ sprintf "_bqsr-B%sP%s.bam"
bqsr.Tools.Gatk.Configuration.Bqsr.name
preads.Tools.Gatk.Configuration.Print_reads.name
in
Bam (
Tools.Gatk.base_quality_score_recalibrator ~configuration
~run_with ~input_bam ~output_bam
)
let seq2hla fq =
let fastq = get_fastq fq in
let r1 = fastq#product#r1 in
let r2 =
match fastq#product#r2 with
| Some r -> r
| None ->
failf "Seq2HLA doesn't support Single_end_sample(s)."
in
let work_dir =
Config.work_dir //
sprintf "%s-%s_seq2hla-workdir"
fastq#product#escaped_sample_name
fastq#product#fragment_id_forced
in
Seq2hla_result (
Tools.Seq2HLA.hla_type
~work_dir ~run_with ~run_name:fastq#product#escaped_sample_name ~r1 ~r2
)
let optitype how fq =
let fastq = get_fastq fq in
let work_dir =
Config.work_dir //
sprintf "%s-%s_optitype-%s-workdir"
fastq#product#escaped_sample_name
(match how with `RNA -> "RNA" | `DNA -> "DNA")
fastq#product#fragment_id_forced
in
Optitype_result (
Tools.Optitype.hla_type
~work_dir ~run_with ~run_name:fastq#product#escaped_sample_name ~fastq
how
:> KEDSL.unknown_product KEDSL.workflow_node
)
let gatk_haplotype_caller bam =
let input_bam = get_bam bam in
let result_prefix =
Filename.chop_extension input_bam#product#path ^ sprintf "_gatkhaplo" in
Vcf (
Tools.Gatk.haplotype_caller ~run_with
~input_bam ~result_prefix `Map_reduce
)
let bam_to_fastq ~sample_name ?fragment_id how bam =
let input_bam = get_bam bam in
let sample_type = match how with `SE -> `Single_end | `PE -> `Paired_end in
let output_prefix =
Config.work_dir
// sprintf "%s-b2fq-%s"
Filename.(chop_extension (basename input_bam#product#path))
(match how with `PE -> "PE" | `SE -> "SE")
in
Fastq (
Tools.Picard.bam_to_fastq ~run_with ~sample_type
~sample_name ~output_prefix input_bam
)
let somatic_vc
name confname default_conf runfun
?configuration ~normal ~tumor () =
let normal_bam = get_bam normal in
let tumor_bam = get_bam tumor in
let configuration = Option.value configuration ~default:default_conf in
let result_prefix =
Filename.chop_extension tumor_bam#product#path
^ sprintf "_%s-%s" name (confname configuration)
in
Vcf (
runfun
~configuration ~run_with
~normal:normal_bam ~tumor:tumor_bam ~result_prefix
)
let mutect =
somatic_vc "mutect"
Tools.Mutect.Configuration.name
Tools.Mutect.Configuration.default
(fun
~configuration ~run_with
~normal ~tumor ~result_prefix ->
Tools.Mutect.run ~configuration ~run_with
~normal ~tumor ~result_prefix `Map_reduce)
let mutect2 =
somatic_vc "mutect2"
Tools.Gatk.Configuration.Mutect2.name
Tools.Gatk.Configuration.Mutect2.default
(fun
~configuration ~run_with
~normal ~tumor ~result_prefix ->
Tools.Gatk.mutect2 ~configuration ~run_with
~input_normal_bam:normal
~input_tumor_bam:tumor
~result_prefix `Map_reduce)
let somaticsniper =
somatic_vc "somaticsniper"
Tools.Somaticsniper.Configuration.name
Tools.Somaticsniper.Configuration.default
(fun
~configuration ~run_with
~normal ~tumor ~result_prefix ->
Tools.Somaticsniper.run
~configuration ~run_with ~normal ~tumor ~result_prefix ())
let strelka =
somatic_vc "strelka"
Tools.Strelka.Configuration.name
Tools.Strelka.Configuration.default
(fun
~configuration ~run_with
~normal ~tumor ~result_prefix ->
Tools.Strelka.run
~configuration ~normal ~tumor ~run_with ~result_prefix ())
let varscan_somatic ?adjust_mapq =
somatic_vc "varscan_somatic" (fun () ->
sprintf "Amq%s"
(Option.value_map adjust_mapq ~default:"N" ~f:Int.to_string))
()
(fun
~configuration ~run_with
~normal ~tumor ~result_prefix ->
Tools.Varscan.somatic_map_reduce ?adjust_mapq
~run_with ~normal ~tumor ~result_prefix ())
~configuration:()
let muse =
somatic_vc "muse"
Tools.Muse.Configuration.name
(Tools.Muse.Configuration.default `WGS)
(fun
~configuration ~run_with
~normal ~tumor ~result_prefix ->
Tools.Muse.run
~configuration
~run_with ~normal ~tumor ~result_prefix `Map_reduce)
let virmid =
somatic_vc "virmid"
Tools.Virmid.Configuration.name
Tools.Virmid.Configuration.default
(fun
~configuration ~run_with
~normal ~tumor ~result_prefix ->
Tools.Virmid.run
~configuration ~normal ~tumor ~run_with ~result_prefix ())
end
end
module Transform_applications
= struct
open Nonstd
module Apply_optimization_framework
(Input : Semantics.Bioinformatics_base)
= struct
module Transformation_types = struct
type 'a from = 'a Input.repr
type 'a term =
| Unknown : 'a from -> 'a term
| Apply : ('a -> 'b) term * 'a term -> 'b term
| Lambda : ('a term -> 'b term) -> ('a -> 'b) term
| List_make: ('a term) list -> 'a list term
| List_map: ('a list term * ('a -> 'b) term) -> 'b list term
| Pair: 'a term * 'b term -> ('a * 'b) term
| Fst: ('a * 'b) term -> 'a term
| Snd: ('a * 'b) term -> 'b term
let fwd x = Unknown x
let rec bwd : type a. a term -> a from =
function
| Apply (Lambda f, v) -> bwd (f v)
| Apply (other, v) -> Input.apply (bwd other) (bwd v)
| List_map (List_make l, Lambda f) ->
Input.list (List.map ~f:(fun x -> bwd (f x)) l)
| List_map (x, f) ->
Input.list_map ~f:(bwd f) (bwd x)
| Lambda f ->
Input.lambda (fun x -> (bwd (f (fwd x))))
| List_make l -> Input.list (List.map ~f:bwd l)
| Fst (Pair (a, b)) -> bwd a
| Snd (Pair (a, b)) -> bwd b
| Pair (a, b) -> Input.pair (bwd a) (bwd b)
| Fst b -> Input.pair_first (bwd b)
| Snd b -> Input.pair_second (bwd b)
| Unknown x -> x
end
module Transformation =
Optimization_framework.Define_transformation(Transformation_types)
open Transformation
module Language_delta = struct
open Transformation_types
let apply f x = Apply (f, x)
let lambda f = Lambda f
let list l = List_make l
let list_map l ~f = List_map (l, f)
let pair a b = Pair (a, b)
let pair_first p = Fst p
let pair_second p = Snd p
end
end
module Apply (Input : Semantics.Bioinformatics_base) = struct
module The_pass = Apply_optimization_framework(Input)
include Optimization_framework.Generic_optimizer(The_pass.Transformation)(Input)
include The_pass.Language_delta
end
end