diff --git a/README.md b/README.md index 9bce1e2..ca27f67 100755 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ data. It is designed to analyse genetically heterogeneous samples. Its tools are written in different programming languages and provide error correction, haplotype reconstruction and estimation of the frequency of the different genetic variants present in a mixed sample. +VILOCA takes an alignment file as input, and subsequently generates mutation calls and local haplotypes. + The corresponding manuscript can be found here: https://www.biorxiv.org/content/10.1101/2024.06.06.597712v1 @@ -62,6 +64,13 @@ There are several parameters available: `--windowsize`: In case no insert file is provided, the genome is tiled into uniform local regions. `windowsize` determines the length of those local regions. It should be of roughly the length of the reads. This is also the length of the haplotypes that are produced. +### Output +`haplotypes` This directory contains the reconstructed local haplotypes as separate fasta files per local region. + +`coverage.txt` List of each local region with start and end positions, and number of reads considered in the region. + +`cooccurring_mutations.csv` All mutation calls including the information on which haplotype each mutation occurred. + ## Development/CI with Docker The following command in the root directory will let you interact with the project locally through Docker. 
```bash diff --git a/tests/test_long_deletions.py b/tests/test_long_deletions.py index 8dca47d..0eb9119 100644 --- a/tests/test_long_deletions.py +++ b/tests/test_long_deletions.py @@ -16,8 +16,8 @@ def test_long_deletions(): # Input data bamfile = "test_aln.cram" - snvsfile = "snv/SNV.tsv" - outfile = "snv/SNVs_0.010000.tsv" + snvsfile = "work/snv/SNV.tsv" + outfile = "work/snv/SNVs_0.010000.tsv" helper_long_deletions.main(bamfile=bamfile, snvsfile=snvsfile, outfile=outfile) diff --git a/tests/test_pooled_post.py b/tests/test_pooled_post.py index 7baf31c..f5363d7 100644 --- a/tests/test_pooled_post.py +++ b/tests/test_pooled_post.py @@ -24,7 +24,7 @@ def open_side_effect(name): def test__ingest_sampler_results_gamma_theta(): with patch("builtins.open", mock_open(read_data="bla\n#gamma = 0.967662\n\nlolz\n#theta = 0.88\neof")): - out = pooled_post._ingest_sampler_results_gamma_theta(open("debug/dbg.dbg"), "shorah") + out = pooled_post._ingest_sampler_results_gamma_theta(open("work/debug/dbg.dbg"), "shorah") np.testing.assert_almost_equal(out[0][0], -0.032872426234558716) np.testing.assert_almost_equal(out[0][1], -3.431512269664515) diff --git a/tests/test_shotgun_e2e.py b/tests/test_shotgun_e2e.py index 4928d1f..6457f88 100644 --- a/tests/test_shotgun_e2e.py +++ b/tests/test_shotgun_e2e.py @@ -5,7 +5,7 @@ import shutil cwd = "./data_1" -f_path = "raw_reads/w-HXB2-2804-3004.extended-ref.fas" +f_path = "work/raw_reads/w-HXB2-2804-3004.extended-ref.fas" base_files = [] @pytest.fixture(scope="session", autouse=True) @@ -35,7 +35,7 @@ def test_e2e_shorah(): assert filecmp.cmp( "./data_1/test.csv", - "./data_1/snv/SNVs_0.010000_final.csv", + "./data_1/work/snv/SNVs_0.010000_final.csv", shallow=False ) diff --git a/viloca/b2w.py b/viloca/b2w.py index 961a34e..3536710 100644 --- a/viloca/b2w.py +++ b/viloca/b2w.py @@ -555,6 +555,8 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy, reference_filename=reference_filename, threads=max_proc #1 ) + # 
Print this message since pysam prints a confusing error message with the functions above. + print("Index file was created successfully.") #reffile = pysam.FastaFile(reference_filename) --> we need to read it in each child processo #counter = 0 #--> counter is now coputed initially for all windows reference_name = tiling_strategy.get_reference_name() diff --git a/viloca/cli.py b/viloca/cli.py index aee2cd2..024a62a 100644 --- a/viloca/cli.py +++ b/viloca/cli.py @@ -178,7 +178,7 @@ def main(): parser_shotgun.add_argument("--n_max_haplotypes", metavar='INT', type=int, required=False, default=100, dest="n_max_haplotypes", - help="Guess of maximal guess of haplotypes.") + help="Guess of the maximal number of haplotypes. If VILOCA returns the maximal number of haplotypes, then this number was chosen too small and needs to be increased.") parser_shotgun.add_argument("--conv_thres", metavar='FLOAT', type=float, required=False, default=1e-03, dest="conv_thres", diff --git a/viloca/local_haplotype_inference/learn_error_params/run_dpm_mfa.py b/viloca/local_haplotype_inference/learn_error_params/run_dpm_mfa.py index ffdb8a9..eabb610 100644 --- a/viloca/local_haplotype_inference/learn_error_params/run_dpm_mfa.py +++ b/viloca/local_haplotype_inference/learn_error_params/run_dpm_mfa.py @@ -11,7 +11,7 @@ from . import cavi logging.basicConfig( - filename="viloca_inference.log", encoding="utf-8", level=logging.INFO + filename="viloca.log", encoding="utf-8", level=logging.INFO ) diff --git a/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py b/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py index b62a84f..58ff5b4 100644 --- a/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py +++ b/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py @@ -13,7 +13,7 @@ from . 
import cavi logging.basicConfig( - filename="viloca_inference.log", encoding="utf-8", level=logging.INFO + filename="viloca.log", encoding="utf-8", level=logging.INFO ) diff --git a/viloca/shorah_snv.py b/viloca/shorah_snv.py index a0b5472..077b03d 100644 --- a/viloca/shorah_snv.py +++ b/viloca/shorah_snv.py @@ -203,7 +203,7 @@ def parseWindow(line, extended_window_mode, exclude_non_var_pos_threshold, _, chrom, beg, end, _ = line.rstrip().split("\t") file_stem = "w-%s-%s-%s" % (chrom, beg, end) - haplo_filename = os.path.join(working_dir, "support", file_stem + ".reads-support.fas") + haplo_filename = os.path.join(working_dir, "haplotypes", file_stem + ".reads-support.fas") if extended_window_mode: ref_name = "extended-ref" diff --git a/viloca/shotgun.py b/viloca/shotgun.py index 4a40cb0..1495a71 100644 --- a/viloca/shotgun.py +++ b/viloca/shotgun.py @@ -523,7 +523,7 @@ def main(args): logging.debug("[shotgun] All processes completed successfully.") # prepare directories - for sd_name in ['debug', 'sampling', 'freq', 'support', + for sd_name in ['debug', 'haplotypes', 'freq', 'sampling', 'work', 'corrected', 'raw_reads', 'inference']: try: os.mkdir(sd_name) @@ -557,7 +557,7 @@ def main(args): move_files_into_dir("debug", glob.glob("./w*dbg")) move_files_into_dir("sampling", glob.glob("./w*smp")) move_files_into_dir("corrected", glob.glob("./w*reads-cor.fas")) - move_files_into_dir("support", glob.glob("./w*reads-support.fas")) + move_files_into_dir("haplotypes", glob.glob("./w*reads-support.fas")) move_files_into_dir("freq", glob.glob("./w*reads-freq.csv")) move_files_into_dir("sampling", glob.glob("./w*smp")) raw_reads_files = glob.glob('./w*reads.fas') + glob.glob('./w*ref.fas') + glob.glob('./w*qualities.npy') @@ -628,15 +628,15 @@ def main(args): envp_post.post_process_for_envp( open(f"raw_reads/{stem}.envp-full-ref.fas"), open(f"raw_reads/{stem}.envp-ref.fas"), - f"support/{stem}.reads-support.fas", - f"support/{stem}.reads-support.fas" # overwrite + 
f"haplotypes/{stem}.reads-support.fas", + f"haplotypes/{stem}.reads-support.fas" # overwrite ) # Pooled b_list = args.b.copy() if len(b_list) > 1: for idx, i in enumerate(b_list): - Path(f"sample{idx}/support").mkdir(parents=True, exist_ok=True) + Path(f"sample{idx}/haplotypes").mkdir(parents=True, exist_ok=True) Path(f"sample{idx}/corrected").mkdir(parents=True, exist_ok=True) with open("coverage.txt") as cov: for line in cov: @@ -654,7 +654,7 @@ def main(args): f"raw_reads/{stem}.ref.fas", filtered_reads.name, open(f"debug/{stem}.dbg") if inference_type == "shorah" else open(f"inference/{stem}.reads-all_results.pkl", "rb"), - open(f"support/{stem}.reads-support.fas"), + open(f"haplotypes/{stem}.reads-support.fas"), filtered_cor_reads_path, inference_type, None if inference_type != "use_quality_scores" else f"raw_reads/{stem}.qualities.npy" # TODO untested @@ -662,8 +662,8 @@ def main(args): filtered_reads.close() pooled_post.write_support_file_per_sample( - open(f"support/{stem}.reads-support.fas"), - open(f"sample{idx}/support/{stem}.reads-support.fas", "w+"), # TODO + open(f"haplotypes/{stem}.reads-support.fas"), + open(f"sample{idx}/haplotypes/{stem}.reads-support.fas", "w+"), # TODO *posterior_and_avg ) @@ -682,7 +682,23 @@ def main(args): os.rename('snv', 'snv_before_%d' % int(time.time())) os.mkdir('snv') - for snv_file in glob.glob('./raw_snv*') + glob.glob('./SNV*')+ glob.glob('./cooccurring_mutations.csv'): + for snv_file in glob.glob('./SNV*_final*')+ glob.glob('./cooccurring_mutations.csv'): + shutil.copy(snv_file, 'snv/') + for snv_file in glob.glob('./raw_snv*') + glob.glob('./SNV*.tsv'): shutil.move(snv_file, 'snv/') + # now move all files that are not directly results into the work directory + shutil.move("inference", "work") + shutil.move("raw_reads", "work") + shutil.move("sampling", "work") + shutil.move("debug", "work") + shutil.move("freq", "work") + shutil.move("corrected", "work") + shutil.move("reads.fas", "work") + 
shutil.move("proposed.dat", "work") + shutil.move("snv", "work") + shutil.move(glob.glob('*.cor.fas')[0], "work") + logging.info('shotgun run ends') + logging.info('VILOCA terminated') + print("VILOCA terminated successfully.")