diff --git a/service_capacity_modeling/capacity_planner.py b/service_capacity_modeling/capacity_planner.py index ce64ed3..68ec3c0 100644 --- a/service_capacity_modeling/capacity_planner.py +++ b/service_capacity_modeling/capacity_planner.py @@ -523,10 +523,21 @@ def _plan_certain( ) -> Sequence[CapacityPlan]: extra_model_arguments = extra_model_arguments or {} model = self._models[model_name] + lifecycles = lifecycles or self._default_lifecycles + instance_families = instance_families or [] + drives = drives or [] + + hardware = self._shapes.region(region) + + # Get current instance object if exists + if desires.current_cluster_capacity: + desires.current_cluster_capacity.cluster_instance = hardware.instances[ + desires.current_cluster_capacity.cluster_instance_name + ] plans = [] for instance, drive, context in self.generate_scenarios( - model, region, desires, num_regions, lifecycles, instance_families, drives + model, hardware, desires, num_regions, lifecycles, instance_families, drives ): plan = model.capacity_plan( instance=instance, @@ -545,13 +556,15 @@ def _plan_certain( return reduce_by_family(plans)[:num_results] def generate_scenarios( - self, model, region, desires, num_regions, lifecycles, instance_families, drives + self, + model, + hardware, + desires, + num_regions, + lifecycles, + instance_families, + drives, ): - lifecycles = lifecycles or self._default_lifecycles - instance_families = instance_families or [] - drives = drives or [] - - hardware = self._shapes.region(region) context = RegionContext( zones_in_region=hardware.zones_in_region, diff --git a/service_capacity_modeling/interface.py b/service_capacity_modeling/interface.py index 81c56bb..c4d66e6 100644 --- a/service_capacity_modeling/interface.py +++ b/service_capacity_modeling/interface.py @@ -18,7 +18,6 @@ from pydantic import BaseModel from pydantic import Field - GIB_IN_BYTES = 1024 * 1024 * 1024 MIB_IN_BYTES = 1024 * 1024 MEGABIT_IN_BYTES = (1000 * 1000) / 8 @@ -621,6 +620,13 @@ 
class DataShape(ExcludeUnsetModel): ) +class CurrentClusterCapacity(ExcludeUnsetModel): + cluster_instance_name: str + cluster_instance: Optional[Instance] + cluster_instance_count: Interval + cpu_utilization: Interval + + class CapacityDesires(ExcludeUnsetModel): # How critical is this cluster, impacts how much "extra" we provision # 0 = Critical to the product (Product does not function) @@ -635,6 +641,9 @@ class CapacityDesires(ExcludeUnsetModel): # What will the state look like data_shape: DataShape = DataShape() + # What is the current microarchitectural/system configuration of the system + current_cluster_capacity: Optional[CurrentClusterCapacity] + # When users are providing latency estimates, what is the typical # instance core frequency we are comparing to. Databases use i3s a lot # hence this default diff --git a/service_capacity_modeling/models/org/netflix/cassandra.py b/service_capacity_modeling/models/org/netflix/cassandra.py index 28c584f..19c32a1 100644 --- a/service_capacity_modeling/models/org/netflix/cassandra.py +++ b/service_capacity_modeling/models/org/netflix/cassandra.py @@ -68,6 +68,7 @@ def _estimate_cassandra_requirement( working_set: float, reads_per_second: float, max_rps_to_disk: int, + required_cluster_size: Optional[int] = None, zones_per_region: int = 3, copies_per_region: int = 3, ) -> CapacityRequirement: @@ -77,7 +78,18 @@ def _estimate_cassandra_requirement( return the zonal capacity requirement """ # Keep half of the cores free for background work (compaction, backup, repair) - needed_cores = sqrt_staffed_cores(desires) * 2 + if ( + desires.current_cluster_capacity is not None + and desires.current_cluster_capacity.cluster_instance is not None + and required_cluster_size is not None + ): + needed_cores = ( + desires.current_cluster_capacity.cluster_instance.cpu + * required_cluster_size + * zones_per_region + ) * (desires.current_cluster_capacity.cpu_utilization.high / 20) + else: + needed_cores = sqrt_staffed_cores(desires) * 2 
# Keep half of the bandwidth available for backup needed_network_mbps = simple_network_mbps(desires) * 2 @@ -183,7 +195,6 @@ def _estimate_cassandra_cluster_zonal( max_write_buffer_percent: float = 0.25, max_table_buffer_percent: float = 0.11, ) -> Optional[CapacityPlan]: - # Netflix Cassandra doesn't like to deploy on really small instances if instance.cpu < 2 or instance.ram_gib < 14: return None @@ -238,6 +249,7 @@ def _estimate_cassandra_cluster_zonal( working_set=working_set, reads_per_second=rps, max_rps_to_disk=max_rps_to_disk, + required_cluster_size=required_cluster_size, zones_per_region=zones_per_region, copies_per_region=copies_per_region, ) diff --git a/tests/netflix/test_cassandra.py b/tests/netflix/test_cassandra.py index d85e618..c463597 100644 --- a/tests/netflix/test_cassandra.py +++ b/tests/netflix/test_cassandra.py @@ -1,15 +1,17 @@ from service_capacity_modeling.capacity_planner import planner from service_capacity_modeling.interface import AccessConsistency +from service_capacity_modeling.interface import AccessPattern from service_capacity_modeling.interface import CapacityDesires from service_capacity_modeling.interface import certain_float from service_capacity_modeling.interface import certain_int from service_capacity_modeling.interface import Consistency +from service_capacity_modeling.interface import CurrentClusterCapacity from service_capacity_modeling.interface import DataShape from service_capacity_modeling.interface import FixedInterval from service_capacity_modeling.interface import GlobalConsistency +from service_capacity_modeling.interface import Interval from service_capacity_modeling.interface import QueryPattern - small_but_high_qps = CapacityDesires( service_tier=1, query_pattern=QueryPattern( @@ -301,3 +303,50 @@ def test_reduced_durability(): cheap_plan.candidate_clusters.zonal[0].cluster_params["cassandra.keyspace.rf"] == 2 ) + + +def test_plan_certain(): + """ + Use cpu utilization to determine instance types directly as 
opposed to + extrapolating it from the Data Shape + """ + worn_desire = CapacityDesires( + service_tier=1, + current_cluster_capacity=CurrentClusterCapacity( + cluster_instance_name="i4i.8xlarge", + cluster_instance_count=Interval(low=8, mid=8, high=8, confidence=1), + cpu_utilization=Interval( + low=10.12, mid=13.2, high=14.194801291058118, confidence=1 + ), + ), + query_pattern=QueryPattern( + access_pattern=AccessPattern(AccessPattern.latency), + estimated_read_per_second=Interval( + low=234248, mid=351854, high=485906, confidence=0.98 + ), + estimated_write_per_second=Interval( + low=19841, mid=31198, high=37307, confidence=0.98 + ), + ), + # We think we're going to have around 2 TiB of data + data_shape=DataShape( + estimated_state_size_gib=Interval( + low=2006.083, mid=2252.5, high=2480.41, confidence=0.98 + ), + estimated_compression_ratio=Interval(low=1, mid=1, high=1, confidence=1), + ), + ) + cap_plan = planner.plan_certain( + model_name="org.netflix.cassandra", + region="us-east-1", + num_results=3, + num_regions=4, + desires=worn_desire, + extra_model_arguments={ + "required_cluster_size": 8, + }, + ) + + lr_clusters = cap_plan[0].candidate_clusters.zonal[0] + assert lr_clusters.count == 8 + assert lr_clusters.instance.cpu == 16 diff --git a/tests/netflix/test_cassandra_uncertain.py b/tests/netflix/test_cassandra_uncertain.py index eedf91c..ce93dfb 100644 --- a/tests/netflix/test_cassandra_uncertain.py +++ b/tests/netflix/test_cassandra_uncertain.py @@ -4,7 +4,6 @@ from service_capacity_modeling.interface import Interval from service_capacity_modeling.interface import QueryPattern - uncertain_mid = CapacityDesires( service_tier=1, query_pattern=QueryPattern(