Feature/remove ebs #95

Merged 1 commit on Nov 5, 2024

18 changes: 9 additions & 9 deletions service_capacity_modeling/models/org/netflix/cassandra.py
@@ -211,12 +211,12 @@ def _estimate_cassandra_cluster_zonal(
     desires: CapacityDesires,
     zones_per_region: int = 3,
     copies_per_region: int = 3,
-    require_local_disks: bool = False,
+    require_local_disks: bool = True,
     require_attached_disks: bool = False,
     required_cluster_size: Optional[int] = None,
     max_rps_to_disk: int = 500,
-    max_local_disk_gib: int = 2048,
-    max_regional_size: int = 96,
+    max_local_disk_gib: int = 5120,
+    max_regional_size: int = 192,
     max_write_buffer_percent: float = 0.25,
     max_table_buffer_percent: float = 0.11,
 ) -> Optional[CapacityPlan]:
@@ -462,7 +462,7 @@ class NflxCassandraArguments(BaseModel):
         " this will be deduced from durability and consistency desires",
     )
     require_local_disks: bool = Field(
-        default=False,
+        default=True,
         description="If local (ephemeral) drives are required",
     )
     require_attached_disks: bool = Field(
@@ -478,11 +478,11 @@
         description="How many disk IOs should be allowed to hit disk per instance",
     )
     max_regional_size: int = Field(
-        default=96,
+        default=192,
         description="What is the maximum size of a cluster in this region",
     )
     max_local_disk_gib: int = Field(
-        default=2048,
+        default=5120,
         description="The maximum amount of data we store per machine",
     )
     max_write_buffer_percent: float = Field(
@@ -513,7 +513,7 @@ def capacity_plan(
             desires, extra_model_arguments.get("copies_per_region", None)
         )
         require_local_disks: bool = extra_model_arguments.get(
-            "require_local_disks", False
+            "require_local_disks", True
         )
         require_attached_disks: bool = extra_model_arguments.get(
             "require_attached_disks", False
@@ -522,8 +522,8 @@
             "required_cluster_size", None
         )
         max_rps_to_disk: int = extra_model_arguments.get("max_rps_to_disk", 500)
-        max_regional_size: int = extra_model_arguments.get("max_regional_size", 96)
-        max_local_disk_gib: int = extra_model_arguments.get("max_local_disk_gib", 2048)
+        max_regional_size: int = extra_model_arguments.get("max_regional_size", 192)
+        max_local_disk_gib: int = extra_model_arguments.get("max_local_disk_gib", 5120)
         max_write_buffer_percent: float = min(
             0.5, extra_model_arguments.get("max_write_buffer_percent", 0.25)
         )
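Net effect of the cassandra.py changes above: local (ephemeral) disks become the default requirement, and the per-node disk cap (2048 -> 5120 GiB) and regional cluster cap (96 -> 192) are raised, so EBS-backed plans must now be requested explicitly. Below is a minimal sketch of how a caller opts back into attached (EBS) storage, mirroring the updated tests; the import paths follow the repository's test modules and the desires values are illustrative assumptions, not part of this diff.

    from service_capacity_modeling.capacity_planner import planner
    from service_capacity_modeling.interface import (
        CapacityDesires,
        DataShape,
        QueryPattern,
        certain_int,
    )

    # With require_local_disks now defaulting to True, an EBS-backed plan must
    # opt out of local disks and opt into attached drives explicitly.
    cap_plan = planner.plan_certain(
        model_name="org.netflix.cassandra",
        region="us-east-1",
        desires=CapacityDesires(
            service_tier=1,
            query_pattern=QueryPattern(
                estimated_read_per_second=certain_int(10_000),
                estimated_write_per_second=certain_int(10_000),
            ),
            data_shape=DataShape(estimated_state_size_gib=certain_int(1_000)),
        ),
        extra_model_arguments={
            "require_local_disks": False,
            "require_attached_disks": True,
        },
    )[0]
    print(cap_plan.candidate_clusters.zonal[0].instance.name)
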
49 changes: 45 additions & 4 deletions tests/netflix/test_cassandra.py
@@ -92,7 +92,10 @@ def test_ebs_high_reads():
                 estimated_state_size_gib=certain_int(1_000),
             ),
         ),
-        extra_model_arguments={"require_attached_disks": True},
+        extra_model_arguments={
+            "require_attached_disks": True,
+            "require_local_disks": False,
+        },
     )[0]
     result = cap_plan.candidate_clusters.zonal[0]

@@ -103,8 +106,8 @@
     # 1TiB / ~32 nodes
     assert result.attached_drives[0].read_io_per_s is not None
     ios = result.attached_drives[0].read_io_per_s * result.count
-    # Each zone is handling ~33k reads per second, so total disk ios should be < 3x that
-    # 3 from each level
+    # Each zone is handling ~33k reads per second, so total disk ios should be < 3x
+    # that 3 from each level
     assert 100_000 < ios < 400_000


@@ -123,7 +126,10 @@ def test_ebs_high_writes():
                 estimated_state_size_gib=certain_int(10_000),
             ),
         ),
-        extra_model_arguments={"require_attached_disks": True},
+        extra_model_arguments={
+            "require_attached_disks": True,
+            "require_local_disks": False,
+        },
     )[0]
     result = cap_plan.candidate_clusters.zonal[0]

@@ -192,6 +198,41 @@ def test_high_write_throughput():
         extra_model_arguments={"max_regional_size": 96 * 2},
     )[0]
     high_writes_result = cap_plan.candidate_clusters.zonal[0]
+    assert high_writes_result.instance.family not in ("m5", "r5")
     assert high_writes_result.count > 16
 
+    cluster_cost = cap_plan.candidate_clusters.annual_costs["cassandra.zonal-clusters"]
+    assert 125_000 < cluster_cost < 900_000
+
+    # We should require more than 4 tiering in order to meet this requirement
+    assert high_writes_result.cluster_params["cassandra.compaction.min_threshold"] > 4
+
+
+def test_high_write_throughput_ebs():
+    desires = CapacityDesires(
+        service_tier=1,
+        query_pattern=QueryPattern(
+            estimated_read_per_second=certain_int(1000),
+            estimated_write_per_second=certain_int(1_000_000),
+            # Really large writes
+            estimated_mean_write_size_bytes=certain_int(4096),
+        ),
+        data_shape=DataShape(
+            estimated_state_size_gib=certain_int(100_000),
+        ),
+    )
+
+    cap_plan = planner.plan_certain(
+        model_name="org.netflix.cassandra",
+        region="us-east-1",
+        desires=desires,
+        extra_model_arguments={
+            "max_regional_size": 96 * 2,
+            "require_local_disks": False,
+            "require_attached_disks": True,
+        },
+    )[0]
+    high_writes_result = cap_plan.candidate_clusters.zonal[0]
+    assert high_writes_result.instance.family in ("m5", "r5")
+    assert high_writes_result.count > 16

51 changes: 51 additions & 0 deletions tests/netflix/test_cassandra_uncertain.py
@@ -150,6 +150,57 @@ def test_worn_dataset():
         },
     )
 
+    lr = cap_plan.least_regret[0]
+    lr_cluster = lr.candidate_clusters.zonal[0]
+    assert 128 <= lr_cluster.count * lr_cluster.instance.cpu <= 512
+    assert (
+        250_000
+        <= lr.candidate_clusters.annual_costs["cassandra.zonal-clusters"]
+        < 1_000_000
+    )
+    assert not lr_cluster.instance.name.startswith(
+        "m5."
+    ) or lr_cluster.instance.name.startswith("r5.")
+    assert len(lr_cluster.attached_drives) == 0
+    assert lr.candidate_clusters.services[0].annual_cost > 5_000
+
+
+def test_worn_dataset_ebs():
+    """Assert that a write once read never (aka tracing) dataset uses
+    CPU and GP2 cloud drives to max ability. Paying for fast ephemeral storage
+    is silly when we're never reading from it.
+    """
+    worn_desire = CapacityDesires(
+        service_tier=1,
+        query_pattern=QueryPattern(
+            # Very Very few reads.
+            estimated_read_per_second=Interval(
+                low=1, mid=10, high=100, confidence=0.98
+            ),
+            # We think we're going to have around 1 million writes per second
+            estimated_write_per_second=Interval(
+                low=100_000, mid=1_000_000, high=2_000_000, confidence=0.98
+            ),
+        ),
+        # We think we're going to have around 200 TiB of data
+        data_shape=DataShape(
+            estimated_state_size_gib=Interval(
+                low=104800, mid=204800, high=404800, confidence=0.98
+            ),
+        ),
+    )
+    cap_plan = planner.plan(
+        model_name="org.netflix.cassandra",
+        region="us-east-1",
+        desires=worn_desire,
+        extra_model_arguments={
+            "max_regional_size": 200,
+            "copies_per_region": 2,
+            "require_local_disks": False,
+            "require_attached_disks": True,
+        },
+    )
+
     lr = cap_plan.least_regret[0]
     lr_cluster = lr.candidate_clusters.zonal[0]
     assert 128 <= lr_cluster.count * lr_cluster.instance.cpu <= 512