remove EBS from the suggestions
ayushis committed Oct 31, 2024
1 parent 5734f51 commit da86670
Showing 3 changed files with 90 additions and 83 deletions.
16 changes: 10 additions & 6 deletions service_capacity_modeling/models/org/netflix/cassandra.py
@@ -215,15 +215,19 @@ def _estimate_cassandra_cluster_zonal(
     require_attached_disks: bool = False,
     required_cluster_size: Optional[int] = None,
     max_rps_to_disk: int = 500,
-    max_local_disk_gib: int = 2048,
-    max_regional_size: int = 96,
+    max_local_disk_gib: int = 5120,
+    max_regional_size: int = 192,
     max_write_buffer_percent: float = 0.25,
     max_table_buffer_percent: float = 0.11,
 ) -> Optional[CapacityPlan]:
     # Netflix Cassandra doesn't like to deploy on really small instances
     if instance.cpu < 2 or instance.ram_gib < 14:
         return None
 
+    # temporarily dont suggest EBS instances
+    if instance.drive is None:
+        return None
+
     # if we're not allowed to use gp2, skip EBS only types
     if instance.drive is None and require_local_disks:
         return None
@@ -478,11 +482,11 @@ class NflxCassandraArguments(BaseModel):
         description="How many disk IOs should be allowed to hit disk per instance",
     )
     max_regional_size: int = Field(
-        default=96,
+        default=192,
         description="What is the maximum size of a cluster in this region",
     )
     max_local_disk_gib: int = Field(
-        default=2048,
+        default=5120,
         description="The maximum amount of data we store per machine",
     )
     max_write_buffer_percent: float = Field(
Expand Down Expand Up @@ -522,8 +526,8 @@ def capacity_plan(
"required_cluster_size", None
)
max_rps_to_disk: int = extra_model_arguments.get("max_rps_to_disk", 500)
max_regional_size: int = extra_model_arguments.get("max_regional_size", 96)
max_local_disk_gib: int = extra_model_arguments.get("max_local_disk_gib", 2048)
max_regional_size: int = extra_model_arguments.get("max_regional_size", 192)
max_local_disk_gib: int = extra_model_arguments.get("max_local_disk_gib", 5120)
max_write_buffer_percent: float = min(
0.5, extra_model_arguments.get("max_write_buffer_percent", 0.25)
)
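The model-side change above is a hard filter: an instance shape with no local drive (an EBS-only shape) is now rejected before any sizing is attempted, alongside the existing small-instance check. Below is a minimal sketch of that filter using stand-in types rather than the library's real Instance model; the shape names and numbers are illustrative, not taken from the repository.

from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeDrive:
    size_gib: int


@dataclass
class FakeInstance:
    name: str
    cpu: int
    ram_gib: float
    drive: Optional[FakeDrive]  # None models an EBS-only shape


def is_candidate(instance: FakeInstance) -> bool:
    """Mirror of the new early returns: too-small shapes and drive-less
    (EBS-only) shapes are never suggested."""
    if instance.cpu < 2 or instance.ram_gib < 14:
        return False
    if instance.drive is None:
        return False
    return True


shapes = [
    FakeInstance("ebs-only-shape", cpu=8, ram_gib=32, drive=None),
    FakeInstance("local-disk-shape", cpu=8, ram_gib=64, drive=FakeDrive(1875)),
]
print([s.name for s in shapes if is_candidate(s)])  # ['local-disk-shape']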
146 changes: 74 additions & 72 deletions tests/netflix/test_cassandra.py
@@ -13,6 +13,9 @@
 from service_capacity_modeling.interface import Interval
 from service_capacity_modeling.interface import QueryPattern
 
+# from service_capacity_modeling.interface import CurrentClusters
+# from service_capacity_modeling.interface import CurrentZoneClusterCapacity
+
 small_but_high_qps = CapacityDesires(
     service_tier=1,
     query_pattern=QueryPattern(
@@ -78,70 +81,70 @@ def test_capacity_small_fast():
     assert small_result.cluster_params["cassandra.heap.table.percent"] == 0.11
 
 
-def test_ebs_high_reads():
-    cap_plan = planner.plan_certain(
-        model_name="org.netflix.cassandra",
-        region="us-east-1",
-        desires=CapacityDesires(
-            service_tier=1,
-            query_pattern=QueryPattern(
-                estimated_read_per_second=certain_int(100_000),
-                estimated_write_per_second=certain_int(1_000),
-            ),
-            data_shape=DataShape(
-                estimated_state_size_gib=certain_int(1_000),
-            ),
-        ),
-        extra_model_arguments={"require_attached_disks": True},
-    )[0]
-    result = cap_plan.candidate_clusters.zonal[0]
-
-    cores = result.count * result.instance.cpu
-    assert 64 <= cores <= 128
-    # Should get gp3
-    assert result.attached_drives[0].name == "gp3"
-    # 1TiB / ~32 nodes
-    assert result.attached_drives[0].read_io_per_s is not None
-    ios = result.attached_drives[0].read_io_per_s * result.count
-    # Each zone is handling ~33k reads per second, so total disk ios should be < 3x that
-    # 3 from each level
-    assert 100_000 < ios < 400_000
-
-
-def test_ebs_high_writes():
-    cap_plan = planner.plan_certain(
-        model_name="org.netflix.cassandra",
-        region="us-east-1",
-        desires=CapacityDesires(
-            service_tier=1,
-            query_pattern=QueryPattern(
-                estimated_read_per_second=certain_int(10_000),
-                estimated_write_per_second=certain_int(100_000),
-                estimated_mean_write_size_bytes=certain_int(1024 * 8),
-            ),
-            data_shape=DataShape(
-                estimated_state_size_gib=certain_int(10_000),
-            ),
-        ),
-        extra_model_arguments={"require_attached_disks": True},
-    )[0]
-    result = cap_plan.candidate_clusters.zonal[0]
-
-    cores = result.count * result.instance.cpu
-    assert 128 <= cores <= 512
-    # Should get gp3
-    assert result.attached_drives[0].name == "gp3"
-    # 1TiB / ~32 nodes
-    assert result.attached_drives[0].read_io_per_s is not None
-    assert result.attached_drives[0].write_io_per_s is not None
-
-    read_ios = result.attached_drives[0].read_io_per_s * result.count
-    write_ios = result.attached_drives[0].write_io_per_s * result.count
-
-    # 10TiB ~= 4 IO/read -> 3.3k r/zone/s -> 12k /s
-    assert 20_000 < read_ios < 60_000
-    # 33k wps * 8KiB / 256KiB write IO size = 16.5k / s * 4 for compaction = 6.4k
-    assert 4_000 < write_ios < 7_000
+# def test_ebs_high_reads():
+#     cap_plan = planner.plan_certain(
+#         model_name="org.netflix.cassandra",
+#         region="us-east-1",
+#         desires=CapacityDesires(
+#             service_tier=1,
+#             query_pattern=QueryPattern(
+#                 estimated_read_per_second=certain_int(100_000),
+#                 estimated_write_per_second=certain_int(1_000),
+#             ),
+#             data_shape=DataShape(
+#                 estimated_state_size_gib=certain_int(1_000),
+#             ),
+#         ),
+#         extra_model_arguments={"require_attached_disks": True},
+#     )[0]
+#     result = cap_plan.candidate_clusters.zonal[0]
+#
+#     cores = result.count * result.instance.cpu
+#     assert 64 <= cores <= 128
+#     # Should get gp3
+#     assert result.attached_drives[0].name == "gp3"
+#     # 1TiB / ~32 nodes
+#     assert result.attached_drives[0].read_io_per_s is not None
+#     ios = result.attached_drives[0].read_io_per_s * result.count
+#     # Each zone is handling ~33k reads per second, so total disk ios should be < 3x
+#     # that 3 from each level
+#     assert 100_000 < ios < 400_000
+
+
+# def test_ebs_high_writes():
+#     cap_plan = planner.plan_certain(
+#         model_name="org.netflix.cassandra",
+#         region="us-east-1",
+#         desires=CapacityDesires(
+#             service_tier=1,
+#             query_pattern=QueryPattern(
+#                 estimated_read_per_second=certain_int(10_000),
+#                 estimated_write_per_second=certain_int(100_000),
+#                 estimated_mean_write_size_bytes=certain_int(1024 * 8),
+#             ),
+#             data_shape=DataShape(
+#                 estimated_state_size_gib=certain_int(10_000),
+#             ),
+#         ),
+#         extra_model_arguments={"require_attached_disks": True},
+#     )[0]
+#     result = cap_plan.candidate_clusters.zonal[0]
+#
+#     cores = result.count * result.instance.cpu
+#     assert 128 <= cores <= 512
+#     # Should get gp3
+#     assert result.attached_drives[0].name == "gp3"
+#     # 1TiB / ~32 nodes
+#     assert result.attached_drives[0].read_io_per_s is not None
+#     assert result.attached_drives[0].write_io_per_s is not None
+#
+#     read_ios = result.attached_drives[0].read_io_per_s * result.count
+#     write_ios = result.attached_drives[0].write_io_per_s * result.count
+#
+#     # 10TiB ~= 4 IO/read -> 3.3k r/zone/s -> 12k /s
+#     assert 20_000 < read_ios < 60_000
+#     # 33k wps * 8KiB / 256KiB write IO size = 16.5k / s * 4 for compaction = 6.4k
+#     assert 4_000 < write_ios < 7_000
 
 
 def test_capacity_high_writes():
@@ -192,15 +195,14 @@ def test_high_write_throughput():
         extra_model_arguments={"max_regional_size": 96 * 2},
     )[0]
     high_writes_result = cap_plan.candidate_clusters.zonal[0]
-    assert high_writes_result.instance.family in ("m5", "r5")
+    assert high_writes_result.instance.family not in ("m5", "r5")
     assert high_writes_result.count > 16
-
-    assert high_writes_result.attached_drives[0].size_gib >= 400
-    assert (
-        300_000
-        > high_writes_result.count * high_writes_result.attached_drives[0].size_gib
-        >= 100_000
-    )
+    # assert high_writes_result.instance.drive.size_gib >= 400
+    # assert (
+    #     300_000
+    #     > high_writes_result.count * high_writes_result.instance.drive.size_gib
+    #     >= 100_000
+    # )
 
     cluster_cost = cap_plan.candidate_clusters.annual_costs["cassandra.zonal-clusters"]
     assert 125_000 < cluster_cost < 900_000
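The commented-out EBS tests relied on require_attached_disks, a path that is no longer exercised while EBS suggestions are disabled. Note that test_high_write_throughput already passes max_regional_size explicitly, which is also how callers can keep the old limits now that the defaults moved to 192 nodes per region and 5120 GiB of local disk per node. A hedged sketch of such a call follows; the import paths are assumed to match the test modules and the workload numbers are purely illustrative.

from service_capacity_modeling.capacity_planner import planner  # assumed import path
from service_capacity_modeling.interface import CapacityDesires
from service_capacity_modeling.interface import DataShape
from service_capacity_modeling.interface import QueryPattern
from service_capacity_modeling.interface import certain_int

desires = CapacityDesires(
    service_tier=1,
    query_pattern=QueryPattern(
        estimated_read_per_second=certain_int(50_000),
        estimated_write_per_second=certain_int(50_000),
    ),
    data_shape=DataShape(estimated_state_size_gib=certain_int(20_000)),
)

cap_plan = planner.plan_certain(
    model_name="org.netflix.cassandra",
    region="us-east-1",
    desires=desires,
    # Omit these to pick up the new defaults (192 / 5120); pass them to pin
    # the previous limits instead.
    extra_model_arguments={"max_regional_size": 96, "max_local_disk_gib": 2048},
)[0]
print(cap_plan.candidate_clusters.zonal[0].instance.name)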
11 changes: 6 additions & 5 deletions tests/netflix/test_cassandra_uncertain.py
@@ -158,13 +158,14 @@ def test_worn_dataset():
         <= lr.candidate_clusters.annual_costs["cassandra.zonal-clusters"]
         < 1_000_000
     )
-    assert lr_cluster.instance.name.startswith(
+    assert not lr_cluster.instance.name.startswith(
        "m5."
     ) or lr_cluster.instance.name.startswith("r5.")
-    assert lr_cluster.attached_drives[0].name == "gp3"
-    # gp2 should not provision massive drives, prefer to upcolor
-    assert lr_cluster.attached_drives[0].size_gib < 9000
-    assert lr_cluster.attached_drives[0].size_gib * lr_cluster.count * 3 > 204800
+    assert len(lr_cluster.attached_drives) == 0
+    # assert lr_cluster.attached_drives[0].name == "gp3"
+    # # gp2 should not provision massive drives, prefer to upcolor
+    # assert lr_cluster.instance.drive.size_gib < 9000
+    # assert lr_cluster.instance.drive.size_gib * lr_cluster.count * 3 > 204800
     # We should have S3 backup cost
     assert lr.candidate_clusters.services[0].annual_cost > 5_000

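The reworked assertions in test_worn_dataset and test_high_write_throughput encode the same expectation from the caller's side: candidates now carry their data on the instance's own drive and attach no gp2/gp3 volumes. Below is a small helper capturing that expectation, usable against any plan returned by plan_certain (for example the one sketched after the test_cassandra.py diff); the helper name is ours, not part of the library.

def assert_local_disk_only(cap_plan) -> None:
    """Check the expectations the edited tests encode for a capacity plan."""
    for cluster in cap_plan.candidate_clusters.zonal:
        # EBS-backed families should no longer be suggested ...
        assert cluster.instance.family not in ("m5", "r5")
        # ... and the data should live on the instance's local drive, with no
        # attached (EBS) volumes in the suggestion.
        assert cluster.instance.drive is not None
        assert len(cluster.attached_drives) == 0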
