
Commit

Merge pull request #74 from Netflix-Skunkworks/ramsrivatsak/cpu-utilization-cass

Add cpu utilization as a parameter for the capacity planner to consume
ramsrivatsa authored Oct 26, 2023
2 parents a7caf1d + de475e0 commit f819b9d
Showing 5 changed files with 122 additions and 6 deletions.
service_capacity_modeling/capacity_planner.py (19 additions, 0 deletions)
@@ -24,6 +24,7 @@
 from service_capacity_modeling.interface import certain_float
 from service_capacity_modeling.interface import DataShape
 from service_capacity_modeling.interface import Drive
+from service_capacity_modeling.interface import Hardware
 from service_capacity_modeling.interface import Instance
 from service_capacity_modeling.interface import Interval
 from service_capacity_modeling.interface import interval
@@ -182,6 +183,21 @@ def model_desires_percentiles(
     return results, d


+def _set_instance_objects(
+    desires: CapacityDesires,
+    hardware: Hardware,
+):
+    if desires.current_clusters:
+        for zonal_cluster_capacity in desires.current_clusters.zonal:
+            zonal_cluster_capacity.cluster_instance = hardware.instances[
+                zonal_cluster_capacity.cluster_instance_name
+            ]
+        for regional_cluster_capacity in desires.current_clusters.regional:
+            regional_cluster_capacity.cluster_instance = hardware.instances[
+                regional_cluster_capacity.cluster_instance_name
+            ]
+
+
 def _allow_instance(
     instance: Instance,
     allowed_names: Sequence[str],
@@ -575,6 +591,9 @@ def generate_scenarios(
     if len(allowed_drives) == 0:
         allowed_drives.update(hardware.drives.keys())

+    # Set current instance object if exists
+    _set_instance_objects(desires, hardware)
+
     if model.run_hardware_simulation():
         for instance in hardware.instances.values():
             if not _allow_instance(
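For intuition, the hydration step added above is just a name-to-object lookup performed once per plan, before hardware simulation. Below is a self-contained toy sketch of the same idea, using stand-in dataclasses rather than the library's real Hardware/Instance types; the names and the 32-vCPU figure are assumptions for illustration only:

```python
from dataclasses import dataclass
from typing import Dict, Optional


@dataclass
class ToyInstance:
    name: str
    cpu: int


@dataclass
class ToyClusterCapacity:
    cluster_instance_name: str
    cluster_instance: Optional[ToyInstance] = None


# Stand-in for hardware.instances: a name -> shape lookup table
instances: Dict[str, ToyInstance] = {
    "i4i.8xlarge": ToyInstance(name="i4i.8xlarge", cpu=32),
}

cluster = ToyClusterCapacity(cluster_instance_name="i4i.8xlarge")
# The same lookup _set_instance_objects performs for each current cluster
cluster.cluster_instance = instances[cluster.cluster_instance_name]
assert cluster.cluster_instance.cpu == 32
```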
service_capacity_modeling/interface.py (26 additions, 1 deletion)
@@ -18,7 +18,6 @@
 from pydantic import BaseModel
 from pydantic import Field

-
 GIB_IN_BYTES = 1024 * 1024 * 1024
 MIB_IN_BYTES = 1024 * 1024
 MEGABIT_IN_BYTES = (1000 * 1000) / 8
@@ -621,6 +620,29 @@ class DataShape(ExcludeUnsetModel):
     )


+class CurrentClusterCapacity(ExcludeUnsetModel):
+    cluster_instance_name: str
+    cluster_instance: Optional[Instance] = None
+    cluster_instance_count: Interval
+    cpu_utilization: Interval
+
+
+# For services that are provisioned by zone (e.g. Cassandra, EVCache)
+class CurrentZoneClusterCapacity(CurrentClusterCapacity):
+    pass
+
+
+# For services that are provisioned regionally (e.g. Java services, RDS, etc ..)
+class CurrentRegionClusterCapacity(CurrentClusterCapacity):
+    pass
+
+
+class CurrentClusters(ExcludeUnsetModel):
+    zonal: Sequence[CurrentZoneClusterCapacity] = []
+    regional: Sequence[CurrentRegionClusterCapacity] = []
+    services: Sequence[ServiceCapacity] = []
+
+
 class CapacityDesires(ExcludeUnsetModel):
     # How critical is this cluster, impacts how much "extra" we provision
     # 0 = Critical to the product (Product does not function)
@@ -635,6 +657,9 @@
     # What will the state look like
     data_shape: DataShape = DataShape()

+    # What is the current microarchitectural/system configuration of the system
+    current_clusters: Optional[CurrentClusters] = None
+
     # When users are providing latency estimates, what is the typical
     # instance core frequency we are comparing to. Databases use i3s a lot
     # hence this default
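Taken together, the new interface models let callers describe the cluster they are running today. A minimal sketch of how they might be populated (values are illustrative; cluster_instance is left unset because the planner's new _set_instance_objects resolves it from the hardware shapes at planning time):

```python
from service_capacity_modeling.interface import CapacityDesires
from service_capacity_modeling.interface import CurrentClusters
from service_capacity_modeling.interface import CurrentZoneClusterCapacity
from service_capacity_modeling.interface import Interval

# An 8-node zonal cluster on i4i.8xlarge running at roughly 10-14% CPU
current = CurrentZoneClusterCapacity(
    cluster_instance_name="i4i.8xlarge",
    cluster_instance_count=Interval(low=8, mid=8, high=8, confidence=1),
    cpu_utilization=Interval(low=10.0, mid=13.0, high=14.0, confidence=1),
)

desires = CapacityDesires(
    service_tier=1,
    current_clusters=CurrentClusters(zonal=[current]),
)
```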
service_capacity_modeling/models/org/netflix/cassandra.py (24 additions, 3 deletions)
@@ -68,6 +68,7 @@ def _estimate_cassandra_requirement(
     working_set: float,
     reads_per_second: float,
     max_rps_to_disk: int,
+    required_cluster_size: Optional[int] = None,
     zones_per_region: int = 3,
     copies_per_region: int = 3,
 ) -> CapacityRequirement:
@@ -76,8 +77,28 @@
     The input desires should be the **regional** desire, and this function will
     return the zonal capacity requirement
     """
-    # Keep half of the cores free for background work (compaction, backup, repair)
-    needed_cores = sqrt_staffed_cores(desires) * 2
+    current_capacity = (
+        None
+        if desires.current_clusters is None
+        else desires.current_clusters.zonal[0]
+        if len(desires.current_clusters.zonal)
+        else desires.current_clusters.regional[0]
+    )
+    # Keep half of the cores free for background work (compaction, backup, repair).
+    # Currently, zones and regions are configured in a homogeneous manner. Hence,
+    # we just take any one of the current cluster configurations.
+    if (
+        current_capacity
+        and current_capacity.cluster_instance
+        and required_cluster_size is not None
+    ):
+        needed_cores = (
+            current_capacity.cluster_instance.cpu
+            * required_cluster_size
+            * zones_per_region
+        ) * (current_capacity.cpu_utilization.high / 20)
+    else:
+        needed_cores = sqrt_staffed_cores(desires) * 2
     # Keep half of the bandwidth available for backup
     needed_network_mbps = simple_network_mbps(desires) * 2

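The utilization branch above sizes CPU from what the cluster actually uses: the total currently provisioned cores in the region, scaled by high-percentile CPU utilization over what reads as a 20% utilization target. A worked sketch with the numbers from the test below; the 32-vCPU count for i4i.8xlarge is an assumption about the hardware shapes, not something stated in this diff:

```python
cpu_per_node = 32             # assumed vCPUs of the current instance type
required_cluster_size = 8     # nodes per zone, pinned by the caller
zones_per_region = 3
cpu_utilization_high = 14.19  # observed high CPU utilization, in percent

# Regional cores needed: provisioned cores rescaled from the observed
# peak utilization to the 20 (percent) denominator in the new branch
needed_cores = (cpu_per_node * required_cluster_size * zones_per_region) * (
    cpu_utilization_high / 20
)
print(round(needed_cores))  # ~545 cores for the region
```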
@@ -183,7 +204,6 @@ def _estimate_cassandra_cluster_zonal(
     max_write_buffer_percent: float = 0.25,
     max_table_buffer_percent: float = 0.11,
 ) -> Optional[CapacityPlan]:
-
     # Netflix Cassandra doesn't like to deploy on really small instances
     if instance.cpu < 2 or instance.ram_gib < 14:
         return None
@@ -238,6 +258,7 @@ def _estimate_cassandra_cluster_zonal(
         working_set=working_set,
         reads_per_second=rps,
         max_rps_to_disk=max_rps_to_disk,
+        required_cluster_size=required_cluster_size,
         zones_per_region=zones_per_region,
         copies_per_region=copies_per_region,
     )
tests/netflix/test_cassandra.py (53 additions, 1 deletion)
@@ -1,15 +1,18 @@
 from service_capacity_modeling.capacity_planner import planner
 from service_capacity_modeling.interface import AccessConsistency
 from service_capacity_modeling.interface import AccessPattern
 from service_capacity_modeling.interface import CapacityDesires
 from service_capacity_modeling.interface import certain_float
 from service_capacity_modeling.interface import certain_int
 from service_capacity_modeling.interface import Consistency
+from service_capacity_modeling.interface import CurrentClusterCapacity
+from service_capacity_modeling.interface import CurrentClusters
 from service_capacity_modeling.interface import DataShape
 from service_capacity_modeling.interface import FixedInterval
 from service_capacity_modeling.interface import GlobalConsistency
 from service_capacity_modeling.interface import Interval
 from service_capacity_modeling.interface import QueryPattern


 small_but_high_qps = CapacityDesires(
     service_tier=1,
     query_pattern=QueryPattern(
@@ -301,3 +304,52 @@ def test_reduced_durability():
         cheap_plan.candidate_clusters.zonal[0].cluster_params["cassandra.keyspace.rf"]
         == 2
     )
+
+
+def test_plan_certain():
+    """
+    Use cpu utilization to determine instance types directly as opposed to
+    extrapolating it from the Data Shape
+    """
+    cluster_capacity = CurrentClusterCapacity(
+        cluster_instance_name="i4i.8xlarge",
+        cluster_instance_count=Interval(low=8, mid=8, high=8, confidence=1),
+        cpu_utilization=Interval(
+            low=10.12, mid=13.2, high=14.194801291058118, confidence=1
+        ),
+    )
+
+    worn_desire = CapacityDesires(
+        service_tier=1,
+        current_clusters=CurrentClusters(zonal=[cluster_capacity]),
+        query_pattern=QueryPattern(
+            access_pattern=AccessPattern(AccessPattern.latency),
+            estimated_read_per_second=Interval(
+                low=234248, mid=351854, high=485906, confidence=0.98
+            ),
+            estimated_write_per_second=Interval(
+                low=19841, mid=31198, high=37307, confidence=0.98
+            ),
+        ),
+        # We think we're going to have around 2 TiB of data
+        data_shape=DataShape(
+            estimated_state_size_gib=Interval(
+                low=2006.083, mid=2252.5, high=2480.41, confidence=0.98
+            ),
+            estimated_compression_ratio=Interval(low=1, mid=1, high=1, confidence=1),
+        ),
+    )
+    cap_plan = planner.plan_certain(
+        model_name="org.netflix.cassandra",
+        region="us-east-1",
+        num_results=3,
+        num_regions=4,
+        desires=worn_desire,
+        extra_model_arguments={
+            "required_cluster_size": 8,
+        },
+    )
+
+    lr_clusters = cap_plan[0].candidate_clusters.zonal[0]
+    assert lr_clusters.count == 8
+    assert lr_clusters.instance.cpu == 16
tests/netflix/test_cassandra_uncertain.py (0 additions, 1 deletion)
@@ -4,7 +4,6 @@
 from service_capacity_modeling.interface import Interval
 from service_capacity_modeling.interface import QueryPattern

-
 uncertain_mid = CapacityDesires(
     service_tier=1,
     query_pattern=QueryPattern(
