diff --git a/service_capacity_modeling/models/org/netflix/cassandra.py b/service_capacity_modeling/models/org/netflix/cassandra.py
index 4661445..c1c7d0c 100644
--- a/service_capacity_modeling/models/org/netflix/cassandra.py
+++ b/service_capacity_modeling/models/org/netflix/cassandra.py
@@ -215,8 +215,8 @@ def _estimate_cassandra_cluster_zonal(
     require_attached_disks: bool = False,
     required_cluster_size: Optional[int] = None,
     max_rps_to_disk: int = 500,
-    max_local_disk_gib: int = 2048,
-    max_regional_size: int = 96,
+    max_local_disk_gib: int = 5120,
+    max_regional_size: int = 192,
     max_write_buffer_percent: float = 0.25,
     max_table_buffer_percent: float = 0.11,
 ) -> Optional[CapacityPlan]:
@@ -224,6 +224,10 @@ def _estimate_cassandra_cluster_zonal(
     if instance.cpu < 2 or instance.ram_gib < 14:
         return None

+    # temporarily don't suggest EBS instances
+    if instance.drive is None:
+        return None
+
     # if we're not allowed to use gp2, skip EBS only types
     if instance.drive is None and require_local_disks:
         return None
@@ -478,11 +482,11 @@ class NflxCassandraArguments(BaseModel):
         description="How many disk IOs should be allowed to hit disk per instance",
     )
     max_regional_size: int = Field(
-        default=96,
+        default=192,
         description="What is the maximum size of a cluster in this region",
     )
     max_local_disk_gib: int = Field(
-        default=2048,
+        default=5120,
         description="The maximum amount of data we store per machine",
     )
     max_write_buffer_percent: float = Field(
@@ -522,8 +526,8 @@ def capacity_plan(
         "required_cluster_size", None
     )
     max_rps_to_disk: int = extra_model_arguments.get("max_rps_to_disk", 500)
-    max_regional_size: int = extra_model_arguments.get("max_regional_size", 96)
-    max_local_disk_gib: int = extra_model_arguments.get("max_local_disk_gib", 2048)
+    max_regional_size: int = extra_model_arguments.get("max_regional_size", 192)
+    max_local_disk_gib: int = extra_model_arguments.get("max_local_disk_gib", 5120)
     max_write_buffer_percent: float = min(
         0.5, extra_model_arguments.get("max_write_buffer_percent", 0.25)
     )
diff --git a/tests/netflix/test_cassandra.py b/tests/netflix/test_cassandra.py
index ba20089..6d3e0c4 100644
--- a/tests/netflix/test_cassandra.py
+++ b/tests/netflix/test_cassandra.py
@@ -13,6 +13,9 @@ from service_capacity_modeling.interface import Interval
 from service_capacity_modeling.interface import QueryPattern


+# from service_capacity_modeling.interface import CurrentClusters
+# from service_capacity_modeling.interface import CurrentZoneClusterCapacity
+
 small_but_high_qps = CapacityDesires(
     service_tier=1,
     query_pattern=QueryPattern(
@@ -78,70 +81,70 @@ def test_capacity_small_fast():
     assert small_result.cluster_params["cassandra.heap.table.percent"] == 0.11


-def test_ebs_high_reads():
-    cap_plan = planner.plan_certain(
-        model_name="org.netflix.cassandra",
-        region="us-east-1",
-        desires=CapacityDesires(
-            service_tier=1,
-            query_pattern=QueryPattern(
-                estimated_read_per_second=certain_int(100_000),
-                estimated_write_per_second=certain_int(1_000),
-            ),
-            data_shape=DataShape(
-                estimated_state_size_gib=certain_int(1_000),
-            ),
-        ),
-        extra_model_arguments={"require_attached_disks": True},
-    )[0]
-    result = cap_plan.candidate_clusters.zonal[0]
-
-    cores = result.count * result.instance.cpu
-    assert 64 <= cores <= 128
-    # Should get gp3
-    assert result.attached_drives[0].name == "gp3"
-    # 1TiB / ~32 nodes
-    assert result.attached_drives[0].read_io_per_s is not None
-    ios = result.attached_drives[0].read_io_per_s * result.count
-    # Each zone is handling ~33k reads per second, so total disk ios should be < 3x that
-    # 3 from each level
-    assert 100_000 < ios < 400_000
-
-
-def test_ebs_high_writes():
-    cap_plan = planner.plan_certain(
-        model_name="org.netflix.cassandra",
-        region="us-east-1",
-        desires=CapacityDesires(
-            service_tier=1,
-            query_pattern=QueryPattern(
-                estimated_read_per_second=certain_int(10_000),
-                estimated_write_per_second=certain_int(100_000),
-                estimated_mean_write_size_bytes=certain_int(1024 * 8),
-            ),
-            data_shape=DataShape(
-                estimated_state_size_gib=certain_int(10_000),
-            ),
-        ),
-        extra_model_arguments={"require_attached_disks": True},
-    )[0]
-    result = cap_plan.candidate_clusters.zonal[0]
-
-    cores = result.count * result.instance.cpu
-    assert 128 <= cores <= 512
-    # Should get gp3
-    assert result.attached_drives[0].name == "gp3"
-    # 1TiB / ~32 nodes
-    assert result.attached_drives[0].read_io_per_s is not None
-    assert result.attached_drives[0].write_io_per_s is not None
-
-    read_ios = result.attached_drives[0].read_io_per_s * result.count
-    write_ios = result.attached_drives[0].write_io_per_s * result.count
-
-    # 10TiB ~= 4 IO/read -> 3.3k r/zone/s -> 12k /s
-    assert 20_000 < read_ios < 60_000
-    # 33k wps * 8KiB / 256KiB write IO size = 16.5k / s * 4 for compaction = 6.4k
-    assert 4_000 < write_ios < 7_000
+# def test_ebs_high_reads():
+#     cap_plan = planner.plan_certain(
+#         model_name="org.netflix.cassandra",
+#         region="us-east-1",
+#         desires=CapacityDesires(
+#             service_tier=1,
+#             query_pattern=QueryPattern(
+#                 estimated_read_per_second=certain_int(100_000),
+#                 estimated_write_per_second=certain_int(1_000),
+#             ),
+#             data_shape=DataShape(
+#                 estimated_state_size_gib=certain_int(1_000),
+#             ),
+#         ),
+#         extra_model_arguments={"require_attached_disks": True},
+#     )[0]
+#     result = cap_plan.candidate_clusters.zonal[0]
+#
+#     cores = result.count * result.instance.cpu
+#     assert 64 <= cores <= 128
+#     # Should get gp3
+#     assert result.attached_drives[0].name == "gp3"
+#     # 1TiB / ~32 nodes
+#     assert result.attached_drives[0].read_io_per_s is not None
+#     ios = result.attached_drives[0].read_io_per_s * result.count
+#     # Each zone is handling ~33k reads per second, so total disk ios should be < 3x
+#     # that 3 from each level
+#     assert 100_000 < ios < 400_000
+
+
+# def test_ebs_high_writes():
+#     cap_plan = planner.plan_certain(
+#         model_name="org.netflix.cassandra",
+#         region="us-east-1",
+#         desires=CapacityDesires(
+#             service_tier=1,
+#             query_pattern=QueryPattern(
+#                 estimated_read_per_second=certain_int(10_000),
+#                 estimated_write_per_second=certain_int(100_000),
+#                 estimated_mean_write_size_bytes=certain_int(1024 * 8),
+#             ),
+#             data_shape=DataShape(
+#                 estimated_state_size_gib=certain_int(10_000),
+#             ),
+#         ),
+#         extra_model_arguments={"require_attached_disks": True},
+#     )[0]
+#     result = cap_plan.candidate_clusters.zonal[0]
+#
+#     cores = result.count * result.instance.cpu
+#     assert 128 <= cores <= 512
+#     # Should get gp3
+#     assert result.attached_drives[0].name == "gp3"
+#     # 1TiB / ~32 nodes
+#     assert result.attached_drives[0].read_io_per_s is not None
+#     assert result.attached_drives[0].write_io_per_s is not None
+#
+#     read_ios = result.attached_drives[0].read_io_per_s * result.count
+#     write_ios = result.attached_drives[0].write_io_per_s * result.count
+#
+#     # 10TiB ~= 4 IO/read -> 3.3k r/zone/s -> 12k /s
+#     assert 20_000 < read_ios < 60_000
+#     # 33k wps * 8KiB / 256KiB write IO size = 16.5k / s * 4 for compaction = 6.4k
+#     assert 4_000 < write_ios < 7_000


 def test_capacity_high_writes():
@@ -192,15 +195,14 @@ def test_high_write_throughput():
extra_model_arguments={"max_regional_size": 96 * 2}, )[0] high_writes_result = cap_plan.candidate_clusters.zonal[0] - assert high_writes_result.instance.family in ("m5", "r5") + assert high_writes_result.instance.family not in ("m5", "r5") assert high_writes_result.count > 16 - - assert high_writes_result.attached_drives[0].size_gib >= 400 - assert ( - 300_000 - > high_writes_result.count * high_writes_result.attached_drives[0].size_gib - >= 100_000 - ) + # assert high_writes_result.instance.drive.size_gib >= 400 + # assert ( + # 300_000 + # > high_writes_result.count * high_writes_result.instance.drive.size_gib + # >= 100_000 + # ) cluster_cost = cap_plan.candidate_clusters.annual_costs["cassandra.zonal-clusters"] assert 125_000 < cluster_cost < 900_000 diff --git a/tests/netflix/test_cassandra_uncertain.py b/tests/netflix/test_cassandra_uncertain.py index d7c4e1b..951cbda 100644 --- a/tests/netflix/test_cassandra_uncertain.py +++ b/tests/netflix/test_cassandra_uncertain.py @@ -158,13 +158,14 @@ def test_worn_dataset(): <= lr.candidate_clusters.annual_costs["cassandra.zonal-clusters"] < 1_000_000 ) - assert lr_cluster.instance.name.startswith( + assert not lr_cluster.instance.name.startswith( "m5." ) or lr_cluster.instance.name.startswith("r5.") - assert lr_cluster.attached_drives[0].name == "gp3" - # gp2 should not provision massive drives, prefer to upcolor - assert lr_cluster.attached_drives[0].size_gib < 9000 - assert lr_cluster.attached_drives[0].size_gib * lr_cluster.count * 3 > 204800 + assert len(lr_cluster.attached_drives) == 0 + # assert lr_cluster.attached_drives[0].name == "gp3" + # # gp2 should not provision massive drives, prefer to upcolor + # assert lr_cluster.instance.drive.size_gib < 9000 + # assert lr_cluster.instance.drive.size_gib * lr_cluster.count * 3 > 204800 # We should have S3 backup cost assert lr.candidate_clusters.services[0].annual_cost > 5_000