Commit

Add cpu utilization as a parameter in extra model arguments; if you have cpu usage already, use it directly
ramsrivatsak committed Oct 13, 2023
1 parent f0dde0d commit 2c09f54
Showing 4 changed files with 82 additions and 11 deletions.
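For context, the new knobs travel through extra_model_arguments. A minimal sketch of a caller (argument values mirror the new test_plan_certain below; the utilization figure would normally come from observed fleet CPU metrics):

from service_capacity_modeling.capacity_planner import planner

# desires is a CapacityDesires describing the workload, as built in the
# test below; the three extra arguments are the ones this commit adds.
cap_plan = planner.plan_certain(
    model_name="org.netflix.cassandra",
    region="us-east-1",
    desires=desires,
    extra_model_arguments={
        "required_cluster_size": 24,
        "current_instance_name": "i4i.8xlarge",
        "max_cpu_utilization": 14.19,
    },
)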
7 changes: 7 additions & 0 deletions service_capacity_modeling/capacity_planner.py
@@ -503,6 +503,13 @@ def _plan_certain(
if len(allowed_drives) == 0:
allowed_drives.update(hardware.drives.keys())

# Resolve the current instance type, if the caller provided one
current_instance_name: Optional[str] = extra_model_arguments.get(
    "current_instance_name", None
)
if current_instance_name is not None:
    for instance in hardware.instances.values():
        if instance.name == current_instance_name:
            # Swap the name for the resolved Instance object so that
            # downstream models receive the full hardware description
            extra_model_arguments["current_instance_name"] = instance
            break

plans = []
if model.run_hardware_simulation():
for instance in hardware.instances.values():
18 changes: 17 additions & 1 deletion service_capacity_modeling/models/org/netflix/cassandra.py
@@ -64,20 +64,26 @@ def _write_buffer_gib_zone(

def _estimate_cassandra_requirement(
instance: Instance,
current_instance: Optional[Instance],
required_cluster_size: Optional[int],
desires: CapacityDesires,
working_set: float,
reads_per_second: float,
max_rps_to_disk: int,
zones_per_region: int = 3,
copies_per_region: int = 3,
max_cpu_utilization: Optional[float] = None,
) -> CapacityRequirement:
"""Estimate the capacity required for one zone given a regional desire
The input desires should be the **regional** desire, and this function will
return the zonal capacity requirement
"""
if all([max_cpu_utilization, current_instance, required_cluster_size]):
    # The current fleet's CPU utilization is known, so size from it directly
    needed_cores = (current_instance.cpu * required_cluster_size) * (
        max_cpu_utilization / 20
    )
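    # e.g. with the values from the new test below: (32 vCPUs on
    # i4i.8xlarge * 24 nodes) * (14.19 / 20) ≈ 545 regional cores,
    # before the per-zone division further down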
else:
    # Keep half of the cores free for background work (compaction, backup, repair)
    needed_cores = sqrt_staffed_cores(desires) * 2
# Keep half of the bandwidth available for backup
needed_network_mbps = simple_network_mbps(desires) * 2

@@ -109,6 +115,7 @@ def _estimate_cassandra_requirement(
needed_cores = max(1, needed_cores // zones_per_region)
needed_disk = max(1, needed_disk // zones_per_region)
needed_memory = max(1, int(needed_memory // zones_per_region))
logger.debug(
"Need (cpu, mem, disk, working) = (%s, %s, %s, %f)",
needed_cores,
@@ -169,6 +176,7 @@ def _upsert_params(cluster, params):
# flake8: noqa: C901
def _estimate_cassandra_cluster_zonal(
instance: Instance,
current_instance: Optional[Instance],
drive: Drive,
context: RegionContext,
desires: CapacityDesires,
@@ -182,6 +190,7 @@ def _estimate_cassandra_cluster_zonal(
max_regional_size: int = 96,
max_write_buffer_percent: float = 0.25,
max_table_buffer_percent: float = 0.11,
max_cpu_utilization: Optional[float] = None,
) -> Optional[CapacityPlan]:

# Netflix Cassandra doesn't like to deploy on really small instances
Expand Down Expand Up @@ -234,12 +243,15 @@ def _estimate_cassandra_cluster_zonal(

requirement = _estimate_cassandra_requirement(
instance=instance,
current_instance=current_instance,
required_cluster_size=required_cluster_size,
desires=desires,
working_set=working_set,
reads_per_second=rps,
max_rps_to_disk=max_rps_to_disk,
zones_per_region=zones_per_region,
copies_per_region=copies_per_region,
max_cpu_utilization=max_cpu_utilization,
)

# Cassandra clusters should aim to be at least 2 nodes per zone to start
@@ -493,6 +505,8 @@ def capacity_plan(
max_table_buffer_percent: float = min(
0.5, extra_model_arguments.get("max_table_buffer_percent", 0.11)
)
max_cpu_utilization: Optional[float] = extra_model_arguments.get(
    "max_cpu_utilization", None
)
# The planner has already resolved this name to an Instance object
current_instance: Optional[Instance] = extra_model_arguments.get(
    "current_instance_name", None
)

# Adjust heap defaults for high write clusters
if (
@@ -504,6 +518,7 @@

return _estimate_cassandra_cluster_zonal(
instance=instance,
current_instance=current_instance,
drive=drive,
context=context,
desires=desires,
Expand All @@ -517,6 +532,7 @@ def capacity_plan(
max_local_disk_gib=max_local_disk_gib,
max_write_buffer_percent=max_write_buffer_percent,
max_table_buffer_percent=max_table_buffer_percent,
max_cpu_utilization=max_cpu_utilization,
)

@staticmethod
1 change: 1 addition & 0 deletions setup.py
@@ -14,6 +14,7 @@
"numpy",
'importlib_resources; python_version < "3.7"',
"isodate",
"pytest",
],
extras_require={
"aws": ["boto3"],
67 changes: 57 additions & 10 deletions tests/netflix/test_cassandra_uncertain.py
@@ -1,9 +1,11 @@
import json

from service_capacity_modeling.capacity_planner import planner
from service_capacity_modeling.interface import AccessPattern
from service_capacity_modeling.interface import CapacityDesires
from service_capacity_modeling.interface import DataShape
from service_capacity_modeling.interface import Interval
from service_capacity_modeling.interface import QueryPattern

uncertain_mid = CapacityDesires(
service_tier=1,
@@ -42,14 +44,14 @@ def test_uncertain_planning():
lr_cluster = lr.candidate_clusters.zonal[0]
assert 8 <= lr_cluster.count * lr_cluster.instance.cpu <= 64
assert (
    5_000 <= lr.candidate_clusters.annual_costs["cassandra.zonal-clusters"] < 45_000
)

sr = mid_plan.least_regret[1]
sr_cluster = sr.candidate_clusters.zonal[0]
assert 8 <= sr_cluster.count * sr_cluster.instance.cpu <= 64
assert (
    5_000 <= sr.candidate_clusters.annual_costs["cassandra.zonal-clusters"] < 45_000
)

tiny_plan = planner.plan(
@@ -61,7 +63,7 @@ def test_uncertain_planning():
lr_cluster = lr.candidate_clusters.zonal[0]
assert 2 <= lr_cluster.count * lr_cluster.instance.cpu < 16
assert (
    1_000 < lr.candidate_clusters.annual_costs["cassandra.zonal-clusters"] < 6_000
)


@@ -155,9 +157,9 @@ def test_worn_dataset():
lr_cluster = lr.candidate_clusters.zonal[0]
assert 128 <= lr_cluster.count * lr_cluster.instance.cpu <= 512
assert (
    250_000
    <= lr.candidate_clusters.annual_costs["cassandra.zonal-clusters"]
    < 1_000_000
)
assert lr_cluster.instance.name.startswith(
"m5."
@@ -193,11 +195,56 @@ def test_very_small_has_disk():
lr_cluster = lr.candidate_clusters.zonal[0]
assert 2 <= lr_cluster.count * lr_cluster.instance.cpu < 16
assert (
    1_000
    < lr.candidate_clusters.annual_costs["cassandra.zonal-clusters"]
    < 6_000
)
if lr_cluster.instance.drive is None:
assert sum(dr.size_gib for dr in lr_cluster.attached_drives) > 10
else:
assert lr_cluster.instance.drive.size_gib > 10


def test_plan_certain():
    """
    Use cpu utilization to determine instance types directly, as opposed to
    extrapolating it from the data shape.
    """
worn_desire = CapacityDesires(
service_tier=1,
query_pattern=QueryPattern(
access_pattern=AccessPattern.latency,
estimated_read_per_second=Interval(
low=234248, mid=351854, high=485906, confidence=0.98
),
estimated_write_per_second=Interval(
low=19841, mid=31198, high=37307, confidence=0.98
),
),
# We think we're going to have around 2.2 TiB of data
data_shape=DataShape(
estimated_state_size_gib=Interval(
low=2006.083, mid=2252.5, high=2480.41, confidence=0.98
),
estimated_compression_ratio=Interval(
low=1, mid=1, high=1, confidence=1
),
),
)
cap_plan = planner.plan_certain(
model_name="org.netflix.cassandra",
region="us-east-1",
num_results=3,
num_regions=4,
desires=worn_desire,
extra_model_arguments={
"required_cluster_size": 24,
"current_instance_name": "i4i.8xlarge",
"max_cpu_utilization": 14.194801291058118,
},
)

lr_clusters = cap_plan[0].candidate_clusters.zonal[0]
assert lr_clusters.count == 24
assert lr_clusters.instance.cpu < 32
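(The assertions pin down the intended behavior: with a known 24-node i4i.8xlarge fleet at roughly 14% CPU, the planner should keep the cluster at 24 nodes while recommending an instance type smaller than the current instance's 32 vCPUs.)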
