diff --git a/service_capacity_modeling/capacity_planner.py b/service_capacity_modeling/capacity_planner.py
index ce64ed3..a5318ae 100644
--- a/service_capacity_modeling/capacity_planner.py
+++ b/service_capacity_modeling/capacity_planner.py
@@ -24,6 +24,7 @@
 from service_capacity_modeling.interface import certain_float
 from service_capacity_modeling.interface import DataShape
 from service_capacity_modeling.interface import Drive
+from service_capacity_modeling.interface import Hardware
 from service_capacity_modeling.interface import Instance
 from service_capacity_modeling.interface import Interval
 from service_capacity_modeling.interface import interval
@@ -182,6 +183,21 @@ def model_desires_percentiles(
     return results, d
 
 
+def _set_instance_objects(
+    desires: CapacityDesires,
+    hardware: Hardware,
+):
+    if desires.current_clusters:
+        for zonal_cluster_capacity in desires.current_clusters.zonal:
+            zonal_cluster_capacity.cluster_instance = hardware.instances[
+                zonal_cluster_capacity.cluster_instance_name
+            ]
+        for regional_cluster_capacity in desires.current_clusters.regional:
+            regional_cluster_capacity.cluster_instance = hardware.instances[
+                regional_cluster_capacity.cluster_instance_name
+            ]
+
+
 def _allow_instance(
     instance: Instance,
     allowed_names: Sequence[str],
@@ -575,6 +591,9 @@ def generate_scenarios(
     if len(allowed_drives) == 0:
         allowed_drives.update(hardware.drives.keys())
 
+    # Set current instance objects if they exist
+    _set_instance_objects(desires, hardware)
+
     if model.run_hardware_simulation():
         for instance in hardware.instances.values():
             if not _allow_instance(
diff --git a/service_capacity_modeling/interface.py b/service_capacity_modeling/interface.py
index 81c56bb..fc71d67 100644
--- a/service_capacity_modeling/interface.py
+++ b/service_capacity_modeling/interface.py
@@ -18,7 +18,6 @@
 from pydantic import BaseModel
 from pydantic import Field
 
-
 GIB_IN_BYTES = 1024 * 1024 * 1024
 MIB_IN_BYTES = 1024 * 1024
 MEGABIT_IN_BYTES = (1000 * 1000) / 8
@@ -621,6 +620,29 @@ class DataShape(ExcludeUnsetModel):
     )
 
 
+class CurrentClusterCapacity(ExcludeUnsetModel):
+    cluster_instance_name: str
+    cluster_instance: Optional[Instance] = None
+    cluster_instance_count: Interval
+    cpu_utilization: Interval
+
+
+# For services that are provisioned by zone (e.g. Cassandra, EVCache)
+class CurrentZoneClusterCapacity(CurrentClusterCapacity):
+    pass
+
+
+# For services that are provisioned regionally (e.g. Java services, RDS, etc.)
+class CurrentRegionClusterCapacity(CurrentClusterCapacity):
+    pass
+
+
+class CurrentClusters(ExcludeUnsetModel):
+    zonal: Sequence[CurrentZoneClusterCapacity] = []
+    regional: Sequence[CurrentRegionClusterCapacity] = []
+    services: Sequence[ServiceCapacity] = []
+
+
 class CapacityDesires(ExcludeUnsetModel):
     # How critical is this cluster, impacts how much "extra" we provision
     # 0 = Critical to the product (Product does not function)
@@ -635,6 +657,9 @@
     # What will the state look like
     data_shape: DataShape = DataShape()
 
+    # What is the current configuration of the system, if it is already deployed
+    current_clusters: Optional[CurrentClusters] = None
+
     # When users are providing latency estimates, what is the typical
     # instance core frequency we are comparing to. Databases use i3s a lot
     # hence this default
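With the interface changes above, a caller can now describe the cluster that is
already running, and generate_scenarios() resolves each cluster_instance_name
to a full Instance object via _set_instance_objects(). A minimal sketch of
populating the new models (the instance name and numbers are illustrative,
mirroring the test further down):

    from service_capacity_modeling.interface import (
        CurrentClusters,
        CurrentZoneClusterCapacity,
        Interval,
    )

    current = CurrentClusters(
        zonal=[
            CurrentZoneClusterCapacity(
                cluster_instance_name="i4i.8xlarge",
                cluster_instance_count=Interval(low=8, mid=8, high=8, confidence=1),
                cpu_utilization=Interval(low=10.0, mid=13.0, high=14.2, confidence=1),
            )
        ]
    )
    # generate_scenarios() later fills in, per zonal/regional entry:
    #   capacity.cluster_instance = hardware.instances[capacity.cluster_instance_name]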
diff --git a/service_capacity_modeling/models/org/netflix/cassandra.py b/service_capacity_modeling/models/org/netflix/cassandra.py
index 28c584f..a778293 100644
--- a/service_capacity_modeling/models/org/netflix/cassandra.py
+++ b/service_capacity_modeling/models/org/netflix/cassandra.py
@@ -68,6 +68,7 @@ def _estimate_cassandra_requirement(
     working_set: float,
     reads_per_second: float,
     max_rps_to_disk: int,
+    required_cluster_size: Optional[int] = None,
     zones_per_region: int = 3,
     copies_per_region: int = 3,
 ) -> CapacityRequirement:
@@ -76,8 +77,28 @@
     The input desires should be the **regional** desire, and this function will
     return the zonal capacity requirement
     """
-    # Keep half of the cores free for background work (compaction, backup, repair)
-    needed_cores = sqrt_staffed_cores(desires) * 2
+    current_capacity = (
+        None
+        if desires.current_clusters is None
+        else desires.current_clusters.zonal[0]
+        if len(desires.current_clusters.zonal)
+        else desires.current_clusters.regional[0]
+    )
+    # Keep half of the cores free for background work (compaction, backup, repair).
+    # Currently, zones and regions are configured in a homogeneous manner. Hence,
+    # we just take any one of the current cluster configurations
+    if (
+        current_capacity
+        and current_capacity.cluster_instance
+        and required_cluster_size is not None
+    ):
+        needed_cores = (
+            current_capacity.cluster_instance.cpu
+            * required_cluster_size
+            * zones_per_region
+        ) * (current_capacity.cpu_utilization.high / 20)
+    else:
+        needed_cores = sqrt_staffed_cores(desires) * 2
     # Keep half of the bandwidth available for backup
     needed_network_mbps = simple_network_mbps(desires) * 2
 
@@ -183,7 +204,6 @@
     max_write_buffer_percent: float = 0.25,
     max_table_buffer_percent: float = 0.11,
 ) -> Optional[CapacityPlan]:
-
     # Netflix Cassandra doesn't like to deploy on really small instances
     if instance.cpu < 2 or instance.ram_gib < 14:
         return None
@@ -238,6 +258,7 @@
         working_set=working_set,
         reads_per_second=rps,
         max_rps_to_disk=max_rps_to_disk,
+        required_cluster_size=required_cluster_size,
         zones_per_region=zones_per_region,
         copies_per_region=copies_per_region,
     )
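The new needed_cores path sizes from observed load instead of the query-rate
heuristic: the total currently provisioned cores (per-instance vCPUs x pinned
cluster size x zones) are scaled by peak CPU utilization against what reads as
a ~20% utilization target, consistent with keeping half the cores free for
background work on top of headroom. A worked example with the numbers from
test_plan_certain below, assuming i4i.8xlarge is modeled with 32 vCPUs:

    cpu = 32                     # current_capacity.cluster_instance.cpu
    required_cluster_size = 8    # pinned via extra_model_arguments
    zones_per_region = 3
    cpu_utilization_high = 14.194801291058118

    needed_cores = (cpu * required_cluster_size * zones_per_region) * (
        cpu_utilization_high / 20
    )
    # 768 * 0.7097... ~= 545 cores across the region, before the function
    # reduces this to the per-zone requirement its docstring promises
    print(round(needed_cores))  # 545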
diff --git a/tests/netflix/test_cassandra.py b/tests/netflix/test_cassandra.py
index d85e618..ba20089 100644
--- a/tests/netflix/test_cassandra.py
+++ b/tests/netflix/test_cassandra.py
@@ -1,15 +1,18 @@
 from service_capacity_modeling.capacity_planner import planner
 from service_capacity_modeling.interface import AccessConsistency
+from service_capacity_modeling.interface import AccessPattern
 from service_capacity_modeling.interface import CapacityDesires
 from service_capacity_modeling.interface import certain_float
 from service_capacity_modeling.interface import certain_int
 from service_capacity_modeling.interface import Consistency
+from service_capacity_modeling.interface import CurrentClusterCapacity
+from service_capacity_modeling.interface import CurrentClusters
 from service_capacity_modeling.interface import DataShape
 from service_capacity_modeling.interface import FixedInterval
 from service_capacity_modeling.interface import GlobalConsistency
+from service_capacity_modeling.interface import Interval
 from service_capacity_modeling.interface import QueryPattern
-
 
 small_but_high_qps = CapacityDesires(
     service_tier=1,
     query_pattern=QueryPattern(
@@ -301,3 +304,52 @@ def test_reduced_durability():
         cheap_plan.candidate_clusters.zonal[0].cluster_params["cassandra.keyspace.rf"]
         == 2
     )
+
+
+def test_plan_certain():
+    """
+    Use cpu utilization to determine instance types directly, as opposed to
+    extrapolating it from the Data Shape
+    """
+    cluster_capacity = CurrentClusterCapacity(
+        cluster_instance_name="i4i.8xlarge",
+        cluster_instance_count=Interval(low=8, mid=8, high=8, confidence=1),
+        cpu_utilization=Interval(
+            low=10.12, mid=13.2, high=14.194801291058118, confidence=1
+        ),
+    )
+
+    worn_desire = CapacityDesires(
+        service_tier=1,
+        current_clusters=CurrentClusters(zonal=[cluster_capacity]),
+        query_pattern=QueryPattern(
+            access_pattern=AccessPattern(AccessPattern.latency),
+            estimated_read_per_second=Interval(
+                low=234248, mid=351854, high=485906, confidence=0.98
+            ),
+            estimated_write_per_second=Interval(
+                low=19841, mid=31198, high=37307, confidence=0.98
+            ),
+        ),
+        # We think we're going to have around 2.2 TiB of data
+        data_shape=DataShape(
+            estimated_state_size_gib=Interval(
+                low=2006.083, mid=2252.5, high=2480.41, confidence=0.98
+            ),
+            estimated_compression_ratio=Interval(low=1, mid=1, high=1, confidence=1),
+        ),
+    )
+    cap_plan = planner.plan_certain(
+        model_name="org.netflix.cassandra",
+        region="us-east-1",
+        num_results=3,
+        num_regions=4,
+        desires=worn_desire,
+        extra_model_arguments={
+            "required_cluster_size": 8,
+        },
+    )
+
+    lr_clusters = cap_plan[0].candidate_clusters.zonal[0]
+    assert lr_clusters.count == 8
+    assert lr_clusters.instance.cpu == 16
diff --git a/tests/netflix/test_cassandra_uncertain.py b/tests/netflix/test_cassandra_uncertain.py
index eedf91c..ce93dfb 100644
--- a/tests/netflix/test_cassandra_uncertain.py
+++ b/tests/netflix/test_cassandra_uncertain.py
@@ -4,7 +4,6 @@
 from service_capacity_modeling.interface import Interval
 from service_capacity_modeling.interface import QueryPattern
-
 
 uncertain_mid = CapacityDesires(
     service_tier=1,
     query_pattern=QueryPattern(
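To inspect the alternative plans requested via num_results=3, a hypothetical
read-back loop (attribute names follow the assertions in test_plan_certain;
Instance exposing a name field is an assumption here):

    for plan in cap_plan:
        zonal = plan.candidate_clusters.zonal[0]
        print(zonal.instance.name, zonal.count)  # e.g. "i4i.4xlarge", 8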