Skip to content

Commit

Permalink
Merge pull request #121 from jeromekelleher/trim-empty-regions
Browse files Browse the repository at this point in the history
Trim empty regions
  • Loading branch information
jeromekelleher authored Apr 17, 2024
2 parents 33cf089 + d17ee5d commit 7c0cd3a
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 9 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# 0.0.5 2024-04-XX
# 0.0.5 2024-04-17

- Fix bug in schema handling (compressor settings ignored)
- Move making ICF field partition directories into per-partition processing.
Remove progress on the init mkdirs step.
- Turn off progress monitor on dexplode-partition
- Fix empty partition bug

# 0.0.4 2024-04-08

Expand Down
18 changes: 17 additions & 1 deletion bio2zarr/vcf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,22 @@ def variants(self, region):
if var.POS >= start:
yield var

def _filter_empty(self, regions):
"""
Return all regions in the specified list that have one or more records.
Sometimes with Tabix indexes these seem to crop up:
- https://github.com/sgkit-dev/bio2zarr/issues/45
- https://github.com/sgkit-dev/bio2zarr/issues/120
"""
ret = []
for region in regions:
variants = self.variants(region)
if next(variants, None) is not None:
ret.append(region)
return ret

def partition_into_regions(
self,
num_parts: Optional[int] = None,
Expand Down Expand Up @@ -511,4 +527,4 @@ def partition_into_regions(
if self.index.record_counts[ri] > 0:
regions.append(Region(self.sequence_names[ri]))

return regions
return self._filter_empty(regions)
8 changes: 1 addition & 7 deletions tests/test_vcf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,7 @@


def assert_part_counts_non_zero(part_counts, index_file):
# We may have one zero count value at the end in Tabix indexes.
# Should probably try to get rid of it, but probably no harm
# https://github.com/jeromekelleher/bio2zarr/issues/45
if index_file.endswith(".tbi"):
assert np.all(part_counts[:-1] > 0)
else:
assert np.all(part_counts > 0)
assert np.all(part_counts > 0)


class TestIndexedVcf:
Expand Down

0 comments on commit 7c0cd3a

Please sign in to comment.