joey changes on new types (+4 squashed commits)
Squashed commits:
[592def6] consolidate current instance type (str), count (interval), cpu% (interval) into a new model object
[82dd4c5] required cluster type parameter is now per zone
[0ce2064] Moved instance type definition inside interface (joey comments addressed)
[352a776] Add cpu utilization as a parameter in extra model arguments; if cpu usage is already available, use it directly
ramsrivatsak authored and abersnaze committed Oct 19, 2023
1 parent 84dc834 commit fbc2aad
Showing 5 changed files with 94 additions and 12 deletions.
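
Taken together, the squashed commits let callers describe the cluster they run today and have the planner size from observed CPU rather than from the data shape alone. Below is a minimal sketch of the intended call, condensed from the test_plan_certain test added in this commit; the instance name, counts, and utilization values are abbreviated from that test, and query_pattern/data_shape are omitted here for brevity (they fall back to their defaults).

```python
from service_capacity_modeling.capacity_planner import planner
from service_capacity_modeling.interface import CapacityDesires
from service_capacity_modeling.interface import CurrentClusterCapacity
from service_capacity_modeling.interface import Interval

# Describe the cluster as it runs today; cpu_utilization is consumed
# directly instead of being extrapolated from the data shape.
desires = CapacityDesires(
    service_tier=1,
    current_cluster_capacity=CurrentClusterCapacity(
        cluster_instance_name="i4i.8xlarge",
        cluster_instance_count=Interval(low=8, mid=8, high=8, confidence=1),
        cpu_utilization=Interval(low=10.1, mid=13.2, high=14.2, confidence=1),
    ),
)

cap_plan = planner.plan_certain(
    model_name="org.netflix.cassandra",
    region="us-east-1",
    num_results=3,
    num_regions=4,
    desires=desires,
    extra_model_arguments={"required_cluster_size": 8},
)
```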
27 changes: 20 additions & 7 deletions service_capacity_modeling/capacity_planner.py
@@ -523,10 +523,21 @@ def _plan_certain(
) -> Sequence[CapacityPlan]:
extra_model_arguments = extra_model_arguments or {}
model = self._models[model_name]
lifecycles = lifecycles or self._default_lifecycles
instance_families = instance_families or []
drives = drives or []

hardware = self._shapes.region(region)

# Get the current instance object, if one exists
if desires.current_cluster_capacity:
desires.current_cluster_capacity.cluster_instance = hardware.instances[
desires.current_cluster_capacity.cluster_instance_name
]

plans = []
for instance, drive, context in self.generate_scenarios(
model, region, desires, num_regions, lifecycles, instance_families, drives
model, hardware, desires, num_regions, lifecycles, instance_families, drives
):
plan = model.capacity_plan(
instance=instance,
@@ -545,13 +556,15 @@ def _plan_certain(
return reduce_by_family(plans)[:num_results]

def generate_scenarios(
self, model, region, desires, num_regions, lifecycles, instance_families, drives
self,
model,
hardware,
desires,
num_regions,
lifecycles,
instance_families,
drives,
):
lifecycles = lifecycles or self._default_lifecycles
instance_families = instance_families or []
drives = drives or []

hardware = self._shapes.region(region)

context = RegionContext(
zones_in_region=hardware.zones_in_region,
11 changes: 10 additions & 1 deletion service_capacity_modeling/interface.py
@@ -18,7 +18,6 @@
from pydantic import BaseModel
from pydantic import Field


GIB_IN_BYTES = 1024 * 1024 * 1024
MIB_IN_BYTES = 1024 * 1024
MEGABIT_IN_BYTES = (1000 * 1000) / 8
@@ -621,6 +620,13 @@ class DataShape(ExcludeUnsetModel):
)


class CurrentClusterCapacity(ExcludeUnsetModel):
cluster_instance_name: str
cluster_instance: Optional[Instance]
cluster_instance_count: Interval
cpu_utilization: Interval


class CapacityDesires(ExcludeUnsetModel):
# How critical is this cluster, impacts how much "extra" we provision
# 0 = Critical to the product (Product does not function)
@@ -635,6 +641,9 @@ class CapacityDesires(ExcludeUnsetModel):
# What will the state look like
data_shape: DataShape = DataShape()

# What is the current hardware/system configuration of the cluster
current_cluster_capacity: Optional[CurrentClusterCapacity]

# When users are providing latency estimates, what is the typical
# instance core frequency we are comparing to. Databases use i3s a lot
# hence this default
16 changes: 14 additions & 2 deletions service_capacity_modeling/models/org/netflix/cassandra.py
@@ -68,6 +68,7 @@ def _estimate_cassandra_requirement(
working_set: float,
reads_per_second: float,
max_rps_to_disk: int,
required_cluster_size: Optional[int] = None,
zones_per_region: int = 3,
copies_per_region: int = 3,
) -> CapacityRequirement:
@@ -77,7 +78,18 @@ def _estimate_cassandra_requirement(
return the zonal capacity requirement
"""
# Keep half of the cores free for background work (compaction, backup, repair)
needed_cores = sqrt_staffed_cores(desires) * 2
if (
desires.current_cluster_capacity is not None
and desires.current_cluster_capacity.cluster_instance is not None
and required_cluster_size is not None
):
needed_cores = (
desires.current_cluster_capacity.cluster_instance.cpu
* required_cluster_size
* zones_per_region
) * (desires.current_cluster_capacity.cpu_utilization.high / 20)
else:
needed_cores = sqrt_staffed_cores(desires) * 2
# Keep half of the bandwidth available for backup
needed_network_mbps = simple_network_mbps(desires) * 2
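
For orientation, here is a worked pass through the new needed_cores branch using the figures from test_plan_certain below. The 32 vCPU figure is an assumption about the i4i.8xlarge hardware shape (it matches the AWS instance size), and reading the divisor of 20 as a 20% utilization target is an interpretation of the constant in the diff, not something stated in the commit.

```python
# Worked example of the new branch (not planner code): figures come from
# test_plan_certain below; 32 vCPUs per i4i.8xlarge node is an assumption
# about the hardware shape.
cpu_per_node = 32
required_cluster_size = 8      # nodes per zone, from extra_model_arguments
zones_per_region = 3
cpu_utilization_high = 14.194801291058118

deployed_cores = cpu_per_node * required_cluster_size * zones_per_region  # 768
needed_cores = deployed_cores * (cpu_utilization_high / 20)

# high / 20 is algebraically (high / 100) / 0.20: the observed busy cores
# (768 * 0.142 ~= 109) rescaled so the same load would sit near 20% CPU.
print(round(needed_cores))  # ~545 cores for the region
```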

@@ -183,7 +195,6 @@ def _estimate_cassandra_cluster_zonal(
max_write_buffer_percent: float = 0.25,
max_table_buffer_percent: float = 0.11,
) -> Optional[CapacityPlan]:

# Netflix Cassandra doesn't like to deploy on really small instances
if instance.cpu < 2 or instance.ram_gib < 14:
return None
@@ -238,6 +249,7 @@ def _estimate_cassandra_cluster_zonal(
working_set=working_set,
reads_per_second=rps,
max_rps_to_disk=max_rps_to_disk,
required_cluster_size=required_cluster_size,
zones_per_region=zones_per_region,
copies_per_region=copies_per_region,
)
51 changes: 50 additions & 1 deletion tests/netflix/test_cassandra.py
@@ -1,15 +1,17 @@
from service_capacity_modeling.capacity_planner import planner
from service_capacity_modeling.interface import AccessConsistency
from service_capacity_modeling.interface import AccessPattern
from service_capacity_modeling.interface import CapacityDesires
from service_capacity_modeling.interface import certain_float
from service_capacity_modeling.interface import certain_int
from service_capacity_modeling.interface import Consistency
from service_capacity_modeling.interface import CurrentClusterCapacity
from service_capacity_modeling.interface import DataShape
from service_capacity_modeling.interface import FixedInterval
from service_capacity_modeling.interface import GlobalConsistency
from service_capacity_modeling.interface import Interval
from service_capacity_modeling.interface import QueryPattern


small_but_high_qps = CapacityDesires(
service_tier=1,
query_pattern=QueryPattern(
@@ -301,3 +303,50 @@ def test_reduced_durability():
cheap_plan.candidate_clusters.zonal[0].cluster_params["cassandra.keyspace.rf"]
== 2
)


def test_plan_certain():
"""
Use cpu utilization to determine instance types directly, as opposed to
extrapolating them from the data shape.
"""
worn_desire = CapacityDesires(
service_tier=1,
current_cluster_capacity=CurrentClusterCapacity(
cluster_instance_name="i4i.8xlarge",
cluster_instance_count=Interval(low=8, mid=8, high=8, confidence=1),
cpu_utilization=Interval(
low=10.12, mid=13.2, high=14.194801291058118, confidence=1
),
),
query_pattern=QueryPattern(
access_pattern=AccessPattern(AccessPattern.latency),
estimated_read_per_second=Interval(
low=234248, mid=351854, high=485906, confidence=0.98
),
estimated_write_per_second=Interval(
low=19841, mid=31198, high=37307, confidence=0.98
),
),
# We think we're going to have around 2.2 TiB of data
data_shape=DataShape(
estimated_state_size_gib=Interval(
low=2006.083, mid=2252.5, high=2480.41, confidence=0.98
),
estimated_compression_ratio=Interval(low=1, mid=1, high=1, confidence=1),
),
)
cap_plan = planner.plan_certain(
model_name="org.netflix.cassandra",
region="us-east-1",
num_results=3,
num_regions=4,
desires=worn_desire,
extra_model_arguments={
"required_cluster_size": 8,
},
)

lr_clusters = cap_plan[0].candidate_clusters.zonal[0]
assert lr_clusters.count == 8
assert lr_clusters.instance.cpu == 16
1 change: 0 additions & 1 deletion tests/netflix/test_cassandra_uncertain.py
@@ -4,7 +4,6 @@
from service_capacity_modeling.interface import Interval
from service_capacity_modeling.interface import QueryPattern


uncertain_mid = CapacityDesires(
service_tier=1,
query_pattern=QueryPattern(
