From 9db02cf87e0a9d2ed869355208a6f2f4726b803d Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 17 Apr 2024 13:43:28 +0100 Subject: [PATCH 1/2] Filter out empty partitions Closes #120 Closes #45 --- CHANGELOG.md | 1 + bio2zarr/vcf_utils.py | 18 +++++++++++++++++- tests/test_vcf_utils.py | 8 +------- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ba20b30..233a255 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ - Move making ICF field partition directories into per-partition processing. Remove progress on the init mkdirs step. - Turn off progress monitor on dexplode-partition +- Fix empty partition bug # 0.0.4 2024-04-08 diff --git a/bio2zarr/vcf_utils.py b/bio2zarr/vcf_utils.py index d5cc00c..344d615 100644 --- a/bio2zarr/vcf_utils.py +++ b/bio2zarr/vcf_utils.py @@ -435,6 +435,22 @@ def variants(self, region): if var.POS >= start: yield var + def _filter_empty(self, regions): + """ + Return all regions in the specified list that have one or more records. + + Sometimes with Tabix indexes these seem to crop up: + + - https://github.com/sgkit-dev/bio2zarr/issues/45 + - https://github.com/sgkit-dev/bio2zarr/issues/120 + """ + ret = [] + for region in regions: + variants = self.variants(region) + if next(variants, None) is not None: + ret.append(region) + return ret + def partition_into_regions( self, num_parts: Optional[int] = None, @@ -511,4 +527,4 @@ def partition_into_regions( if self.index.record_counts[ri] > 0: regions.append(Region(self.sequence_names[ri])) - return regions + return self._filter_empty(regions) diff --git a/tests/test_vcf_utils.py b/tests/test_vcf_utils.py index 03cac42..898dcd6 100644 --- a/tests/test_vcf_utils.py +++ b/tests/test_vcf_utils.py @@ -9,13 +9,7 @@ def assert_part_counts_non_zero(part_counts, index_file): - # We may have one zero count value at the end in Tabix indexes. - # Should probably try to get rid of it, but probably no harm - # https://github.com/jeromekelleher/bio2zarr/issues/45 - if index_file.endswith(".tbi"): - assert np.all(part_counts[:-1] > 0) - else: - assert np.all(part_counts > 0) + assert np.all(part_counts > 0) class TestIndexedVcf: From d17ee5dbdbc262d56d73ec7f16017871ce80d7be Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 17 Apr 2024 13:45:33 +0100 Subject: [PATCH 2/2] Update CHANGELOG for 0.0.5 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 233a255..2b328c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -# 0.0.5 2024-04-XX +# 0.0.5 2024-04-17 - Fix bug in schema handling (compressor settings ignored) - Move making ICF field partition directories into per-partition processing.