From da4f6d637fb6428ce54350cd4cdeaea2d2aeb24c Mon Sep 17 00:00:00 2001
From: ayushis
Date: Wed, 30 Oct 2024 09:35:17 -0700
Subject: [PATCH] remove EBS from the suggestions

---
 .../models/org/netflix/cassandra.py        | 18 +++----
 tests/netflix/test_cassandra.py            | 49 ++++++++++++++++--
 tests/netflix/test_cassandra_uncertain.py  | 51 +++++++++++++++++++
 3 files changed, 105 insertions(+), 13 deletions(-)

diff --git a/service_capacity_modeling/models/org/netflix/cassandra.py b/service_capacity_modeling/models/org/netflix/cassandra.py
index 4661445..6f1a135 100644
--- a/service_capacity_modeling/models/org/netflix/cassandra.py
+++ b/service_capacity_modeling/models/org/netflix/cassandra.py
@@ -211,12 +211,12 @@ def _estimate_cassandra_cluster_zonal(
     desires: CapacityDesires,
     zones_per_region: int = 3,
     copies_per_region: int = 3,
-    require_local_disks: bool = False,
+    require_local_disks: bool = True,
     require_attached_disks: bool = False,
     required_cluster_size: Optional[int] = None,
     max_rps_to_disk: int = 500,
-    max_local_disk_gib: int = 2048,
-    max_regional_size: int = 96,
+    max_local_disk_gib: int = 5120,
+    max_regional_size: int = 192,
     max_write_buffer_percent: float = 0.25,
     max_table_buffer_percent: float = 0.11,
 ) -> Optional[CapacityPlan]:
@@ -462,7 +462,7 @@ class NflxCassandraArguments(BaseModel):
         " this will be deduced from durability and consistency desires",
     )
     require_local_disks: bool = Field(
-        default=False,
+        default=True,
         description="If local (ephemeral) drives are required",
     )
     require_attached_disks: bool = Field(
@@ -478,11 +478,11 @@ class NflxCassandraArguments(BaseModel):
         description="How many disk IOs should be allowed to hit disk per instance",
     )
     max_regional_size: int = Field(
-        default=96,
+        default=192,
         description="What is the maximum size of a cluster in this region",
     )
     max_local_disk_gib: int = Field(
-        default=2048,
+        default=5120,
         description="The maximum amount of data we store per machine",
     )
     max_write_buffer_percent: float = Field(
@@ -513,7 +513,7 @@ def capacity_plan(
         desires, extra_model_arguments.get("copies_per_region", None)
     )
     require_local_disks: bool = extra_model_arguments.get(
-        "require_local_disks", False
+        "require_local_disks", True
     )
     require_attached_disks: bool = extra_model_arguments.get(
         "require_attached_disks", False
     )
     required_cluster_size: Optional[int] = extra_model_arguments.get(
         "required_cluster_size", None
     )
     max_rps_to_disk: int = extra_model_arguments.get("max_rps_to_disk", 500)
-    max_regional_size: int = extra_model_arguments.get("max_regional_size", 96)
-    max_local_disk_gib: int = extra_model_arguments.get("max_local_disk_gib", 2048)
+    max_regional_size: int = extra_model_arguments.get("max_regional_size", 192)
+    max_local_disk_gib: int = extra_model_arguments.get("max_local_disk_gib", 5120)
     max_write_buffer_percent: float = min(
         0.5, extra_model_arguments.get("max_write_buffer_percent", 0.25)
     )
diff --git a/tests/netflix/test_cassandra.py b/tests/netflix/test_cassandra.py
index ba20089..50c12ab 100644
--- a/tests/netflix/test_cassandra.py
+++ b/tests/netflix/test_cassandra.py
@@ -92,7 +92,10 @@ def test_ebs_high_reads():
                 estimated_state_size_gib=certain_int(1_000),
             ),
         ),
-        extra_model_arguments={"require_attached_disks": True},
+        extra_model_arguments={
+            "require_attached_disks": True,
+            "require_local_disks": False,
+        },
     )[0]
 
     result = cap_plan.candidate_clusters.zonal[0]
@@ -103,8 +106,8 @@ def test_ebs_high_reads():
     # 1TiB / ~32 nodes
     assert result.attached_drives[0].read_io_per_s is not None
     ios = result.attached_drives[0].read_io_per_s * result.count
-    # Each zone is handling ~33k reads per second, so total disk ios should be < 3x that
-    # 3 from each level
+    # Each zone is handling ~33k reads per second, so total disk ios should be
+    # < 3x that (3 from each level)
     assert 100_000 < ios < 400_000
 
 
@@ -123,7 +126,10 @@ def test_ebs_high_writes():
                 estimated_state_size_gib=certain_int(10_000),
             ),
         ),
-        extra_model_arguments={"require_attached_disks": True},
+        extra_model_arguments={
+            "require_attached_disks": True,
+            "require_local_disks": False,
+        },
     )[0]
 
     result = cap_plan.candidate_clusters.zonal[0]
@@ -192,6 +198,41 @@ def test_high_write_throughput():
         extra_model_arguments={"max_regional_size": 96 * 2},
     )[0]
     high_writes_result = cap_plan.candidate_clusters.zonal[0]
+    assert high_writes_result.instance.family not in ("m5", "r5")
+    assert high_writes_result.count > 16
+
+    cluster_cost = cap_plan.candidate_clusters.annual_costs["cassandra.zonal-clusters"]
+    assert 125_000 < cluster_cost < 900_000
+
+    # We should need a compaction min_threshold above 4 to meet this requirement
+    assert high_writes_result.cluster_params["cassandra.compaction.min_threshold"] > 4
+
+
+def test_high_write_throughput_ebs():
+    desires = CapacityDesires(
+        service_tier=1,
+        query_pattern=QueryPattern(
+            estimated_read_per_second=certain_int(1000),
+            estimated_write_per_second=certain_int(1_000_000),
+            # Really large writes
+            estimated_mean_write_size_bytes=certain_int(4096),
+        ),
+        data_shape=DataShape(
+            estimated_state_size_gib=certain_int(100_000),
+        ),
+    )
+
+    cap_plan = planner.plan_certain(
+        model_name="org.netflix.cassandra",
+        region="us-east-1",
+        desires=desires,
+        extra_model_arguments={
+            "max_regional_size": 96 * 2,
+            "require_local_disks": False,
+            "require_attached_disks": True,
+        },
+    )[0]
+    high_writes_result = cap_plan.candidate_clusters.zonal[0]
     assert high_writes_result.instance.family in ("m5", "r5")
     assert high_writes_result.count > 16
 
diff --git a/tests/netflix/test_cassandra_uncertain.py b/tests/netflix/test_cassandra_uncertain.py
index d7c4e1b..6313f85 100644
--- a/tests/netflix/test_cassandra_uncertain.py
+++ b/tests/netflix/test_cassandra_uncertain.py
@@ -150,6 +150,57 @@ def test_worn_dataset():
         },
     )
 
+    lr = cap_plan.least_regret[0]
+    lr_cluster = lr.candidate_clusters.zonal[0]
+    assert 128 <= lr_cluster.count * lr_cluster.instance.cpu <= 512
+    assert (
+        250_000
+        <= lr.candidate_clusters.annual_costs["cassandra.zonal-clusters"]
+        < 1_000_000
+    )
+    assert not lr_cluster.instance.name.startswith(
+        ("m5.", "r5.")
+    )
+    assert len(lr_cluster.attached_drives) == 0
+    assert lr.candidate_clusters.services[0].annual_cost > 5_000
+
+
+def test_worn_dataset_ebs():
+    """Assert that a write once read never (aka tracing) dataset uses
+    CPU and GP2 cloud drives to max ability. Paying for fast ephemeral storage
+    is silly when we're never reading from it.
+    """
+    worn_desire = CapacityDesires(
+        service_tier=1,
+        query_pattern=QueryPattern(
+            # Very, very few reads.
+            estimated_read_per_second=Interval(
+                low=1, mid=10, high=100, confidence=0.98
+            ),
+            # We think we're going to have around 1 million writes per second
+            estimated_write_per_second=Interval(
+                low=100_000, mid=1_000_000, high=2_000_000, confidence=0.98
+            ),
+        ),
+        # We think we're going to have around 200 TiB of data
+        data_shape=DataShape(
+            estimated_state_size_gib=Interval(
+                low=104800, mid=204800, high=404800, confidence=0.98
+            ),
+        ),
+    )
+    cap_plan = planner.plan(
+        model_name="org.netflix.cassandra",
+        region="us-east-1",
+        desires=worn_desire,
+        extra_model_arguments={
+            "max_regional_size": 200,
+            "copies_per_region": 2,
+            "require_local_disks": False,
+            "require_attached_disks": True,
+        },
+    )
+
     lr = cap_plan.least_regret[0]
     lr_cluster = lr.candidate_clusters.zonal[0]
     assert 128 <= lr_cluster.count * lr_cluster.instance.cpu <= 512