Merge pull request #256 from twitter/jbaxter/2024_08_12

Multi-group models, Support NMRDueToStableCRHTime, & more
twitter · Aug 12, 2024 · 956d8bd · 956d8bd
2 parents 779c728 + 9cf9458
commit 956d8bd
Show file tree

Hide file tree

Showing 8 changed files with 540 additions and 165 deletions.
diff --git a/sourcecode/scoring/constants.py b/sourcecode/scoring/constants.py
@@ -3,7 +3,7 @@
 from enum import Enum
 import os
 import time
-from typing import Dict, Optional
+from typing import Dict, Optional, Set
 
 import numpy as np
 import pandas as pd
@@ -34,12 +34,17 @@
 intervalHalfWidth = 0.3
 
 # Max flip rates
-prescoringAllUnlockedNotesMaxCrhChurn = 0.04
-finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.03
+prescoringAllUnlockedNotesMaxCrhChurn = 0.2
+prescoringAllNotesCreatedThreeToThirteenDaysAgoMaxChurn = 0.06
+finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.05
 finalNotesWithNewRatingsMaxNewCrhChurn = 0.80
 finalNotesWithNewRatingsMaxOldCrhChurn = 0.25
 finalNotesThatJustFlippedStatusMaxCrhChurn = 1e8
 finalNotesThatFlippedRecentlyMaxCrhChurn = 1e8
+# TODO(jiansongc): adjust these 2 below
+finalNotesNmrDueToMinStableCrhTimeMaxOldCrhChurn = 1.0
+finalNotesNmrDueToMinStableCrhTimeMaxNewCrhChurn = 1.0
+
 
 # Data Filenames
 scoredNotesOutputPath = "scoredNotes.tsv"
@@ -59,17 +64,14 @@
 authorTopNotHelpfulTagValues = "authorTopNotHelpfulTagValues"
 modelingPopulationKey = "modelingPopulation"
 modelingGroupKey = "modelingGroup"
+modelingMultiGroupKey = "modelingMultiGroup"
 numberOfTimesEarnedOutKey = "numberOfTimesEarnedOut"
 defaultIndexKey = "index"
 
 # Scoring Groups
-coreGroups = {1, 2, 3, 6, 8, 9, 10, 11, 13, 14, 19, 21, 25}
-expansionGroups = (
- # Divide into 3 grouping aggregates to prepare for multi-group models,
- # and a 4th group containing leftovers
- {0, 15, 17, 24, 29, 30} | {4, 5, 7, 12, 26} | {27} | {16, 20, 22, 23, 28}
-)
-expansionPlusGroups = {18}
+coreGroups: Set[int] = {1, 2, 3, 6, 8, 9, 10, 11, 13, 14, 19, 21, 25}
+expansionGroups: Set[int] = {0, 4, 5, 7, 12, 16, 18, 20, 22, 23, 24, 26, 27, 28}
+expansionPlusGroups: Set[int] = {15, 17, 29, 30}
 
 # TSV Values
 notHelpfulValueTsv = "NOT_HELPFUL"
@@ -193,6 +195,14 @@ def rater_factor_key(i):
 groupRaterFactor1Key = "groupRaterFactor1"
 groupInternalActiveRulesKey = "groupActiveRules"
 groupNumFinalRoundRatingsKey = "groupNumFinalRoundRatings"
+# MultiGroup Model
+multiGroupNoteInterceptKey = "multiGroupNoteIntercept"
+multiGroupNoteFactor1Key = "multiGroupNoteFactor1"
+multiGroupRatingStatusKey = "multiGroupRatingStatus"
+multiGroupRaterInterceptKey = "multiGroupRaterIntercept"
+multiGroupRaterFactor1Key = "multiGroupRaterFactor1"
+multiGroupInternalActiveRulesKey = "multiGroupActiveRules"
+multiGroupNumFinalRoundRatingsKey = "multiGroupNumFinalRoundRatings"
 # Topic Model
 topicNoteInterceptKey = "topicNoteIntercept"
 topicNoteFactor1Key = "topicNoteFactor1"
@@ -445,6 +455,12 @@ def rater_factor_key(i):
 currentDecidedByKey = "currentDecidedBy"
 currentModelingGroupKey = "currentModelingGroup"
 timestampMillisOfMostRecentStatusChangeKey = "timestampMillisOfMostRecentStatusChange"
+currentMultiGroupStatusKey = "currentMultiGroupStatus"
+currentModelingMultiGroupKey = "currentModelingMultiGroup"
+timestampMillisOfNmrDueToMinStableCrhTimeKey = "timestampMillisOfNmrDueToMinStableCrhTime"
+updatedTimestampMillisOfNmrDueToMinStableCrhTimeKey = (
+ "updatedTimestampMillisOfNmrDueToMinStableCrhTime"
+)
 
 noteStatusHistoryTSVColumnsAndTypes = [
  (noteIdKey, np.int64),
@@ -465,12 +481,22 @@ def rater_factor_key(i):
  (currentDecidedByKey, "category"),
  (currentModelingGroupKey, np.double), # TODO: int
  (timestampMillisOfMostRecentStatusChangeKey, np.double), # double because nullable.
+ (timestampMillisOfNmrDueToMinStableCrhTimeKey, np.double), # double because nullable.
+ (currentMultiGroupStatusKey, "category"),
+ (currentModelingMultiGroupKey, np.double), # TODO: int
 ]
 noteStatusHistoryTSVColumns = [col for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
 noteStatusHistoryTSVTypes = [dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
 noteStatusHistoryTSVTypeMapping = {
  col: dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes
 }
+# TODO(jiansongc): clean up after new column is in production.
+noteStatusHistoryTSVColumnsOld = noteStatusHistoryTSVColumns[:-1]
+noteStatusHistoryTSVColumnsAndTypesOld = noteStatusHistoryTSVColumnsAndTypes[:-1]
+noteStatusHistoryTSVTypeMappingOld = {
+ col: dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypesOld
+}
+
 
 # Earn In + Earn Out
 enrollmentState = "enrollmentState"
@@ -587,6 +613,8 @@ def rater_factor_key(i):
  {
  coverageNoteInterceptMinKey,
  coverageNoteInterceptMaxKey,
+ groupNoteInterceptMinKey,
+ groupNoteInterceptMaxKey,
  }
 )
 
@@ -610,58 +638,64 @@ def rater_factor_key(i):
  (noteIdKey, np.int64),
  (coreNoteInterceptKey, np.double),
  (coreNoteFactor1Key, np.double),
- (finalRatingStatusKey, str),
- (firstTagKey, str),
- (secondTagKey, str),
+ (finalRatingStatusKey, "category"),
+ (firstTagKey, "category"),
+ (secondTagKey, "category"),
  # Note that this column was formerly named "activeRules" and the name is now
  # updated to "coreActiveRules". The data values remain the compatible,
  # but the new column only contains rules that ran when deciding status based on
  # the core model.
- (coreActiveRulesKey, str),
- (activeFilterTagsKey, str),
- (classificationKey, str),
+ (coreActiveRulesKey, "category"),
+ (activeFilterTagsKey, "category"),
+ (classificationKey, "category"),
  (createdAtMillisKey, np.int64),
- (coreRatingStatusKey, str),
- (metaScorerActiveRulesKey, str),
- (decidedByKey, str),
+ (coreRatingStatusKey, "category"),
+ (metaScorerActiveRulesKey, "category"),
+ (decidedByKey, "category"),
  (expansionNoteInterceptKey, np.double),
  (expansionNoteFactor1Key, np.double),
- (expansionRatingStatusKey, str),
+ (expansionRatingStatusKey, "category"),
  (coverageNoteInterceptKey, np.double),
  (coverageNoteFactor1Key, np.double),
- (coverageRatingStatusKey, str),
+ (coverageRatingStatusKey, "category"),
  (coreNoteInterceptMinKey, np.double),
  (coreNoteInterceptMaxKey, np.double),
- (expansionNoteInterceptMinKey, np.double),
- (expansionNoteInterceptMaxKey, np.double),
- (coverageNoteInterceptMinKey, np.double),
- (coverageNoteInterceptMaxKey, np.double),
+ (expansionNoteInterceptMinKey, "category"), # category because always nan
+ (expansionNoteInterceptMaxKey, "category"), # category because always nan
+ (coverageNoteInterceptMinKey, "category"), # category because always nan
+ (coverageNoteInterceptMaxKey, "category"), # category because always nan
  (groupNoteInterceptKey, np.double),
  (groupNoteFactor1Key, np.double),
- (groupRatingStatusKey, str),
- (groupNoteInterceptMaxKey, np.double),
- (groupNoteInterceptMinKey, np.double),
+ (groupRatingStatusKey, "category"),
+ (groupNoteInterceptMaxKey, "category"), # category because always nan
+ (groupNoteInterceptMinKey, "category"), # category because always nan
  (modelingGroupKey, np.float64),
  (numRatingsKey, np.int64),
  (timestampMillisOfNoteCurrentLabelKey, np.double),
  (expansionPlusNoteInterceptKey, np.double),
  (expansionPlusNoteFactor1Key, np.double),
- (expansionPlusRatingStatusKey, str),
+ (expansionPlusRatingStatusKey, "category"),
  (topicNoteInterceptKey, np.double),
  (topicNoteFactor1Key, np.double),
- (topicRatingStatusKey, str),
- (noteTopicKey, str),
+ (topicRatingStatusKey, "category"),
+ (noteTopicKey, "category"),
  (topicNoteConfidentKey, pd.BooleanDtype()),
- (expansionInternalActiveRulesKey, str),
- (expansionPlusInternalActiveRulesKey, str),
- (groupInternalActiveRulesKey, str),
- (topicInternalActiveRulesKey, str),
+ (expansionInternalActiveRulesKey, "category"),
+ (expansionPlusInternalActiveRulesKey, "category"),
+ (groupInternalActiveRulesKey, "category"),
+ (topicInternalActiveRulesKey, "category"),
  (coreNumFinalRoundRatingsKey, np.double), # double because nullable.
  (expansionNumFinalRoundRatingsKey, np.double), # double because nullable.
  (expansionPlusNumFinalRoundRatingsKey, np.double), # double because nullable.
  (groupNumFinalRoundRatingsKey, np.double), # double because nullable.
  (topicNumFinalRoundRatingsKey, np.double), # double because nullable.
- (rescoringActiveRulesKey, str),
+ (rescoringActiveRulesKey, "category"),
+ (multiGroupNoteInterceptKey, np.double),
+ (multiGroupNoteFactor1Key, np.double),
+ (multiGroupRatingStatusKey, str),
+ (modelingMultiGroupKey, np.float64),
+ (multiGroupInternalActiveRulesKey, str),
+ (multiGroupNumFinalRoundRatingsKey, np.double), # double because nullable.
 ]
 noteModelOutputTSVColumns = [col for (col, dtype) in noteModelOutputTSVColumnsAndTypes]
 noteModelOutputTSVTypeMapping = {col: dtype for (col, dtype) in noteModelOutputTSVColumnsAndTypes}
@@ -733,6 +767,9 @@ def rater_factor_key(i):
  (expansionRaterFactor1Key, np.double),
  (expansionPlusRaterInterceptKey, np.double),
  (expansionPlusRaterFactor1Key, np.double),
+ (multiGroupRaterInterceptKey, np.double),
+ (multiGroupRaterFactor1Key, np.double),
+ (modelingMultiGroupKey, np.float64),
 ]
 raterModelOutputTSVColumns = [col for (col, dtype) in raterModelOutputTSVColumnsAndTypes]
 raterModelOutputTSVTypeMapping = {col: dtype for (col, dtype) in raterModelOutputTSVColumnsAndTypes}
@@ -781,6 +818,8 @@ def rater_factor_key(i):
 inputPathsTSVColumns = [col for (col, _) in inputPathsTSVColumnsAndTypes]
 inputPathsTSVTypeMapping = {col: dtype for (col, dtype) in inputPathsTSVColumnsAndTypes}
 
+timestampMinuteOfFinalScoringOutput = "timestampMinuteOfFinalScoringOutput"
+
 
 @contextmanager
 def time_block(label):
@@ -888,6 +927,8 @@ class RescoringRuleID(Enum):
  NOTES_FLIPPED_PREVIOUS_RUN = 3
  NEW_NOTES_NOT_RESCORED_RECENTLY_ENOUGH = 4
  RECENTLY_FLIPPED_NOTES_NOT_RESCORED_RECENTLY_ENOUGH = 5
+ NMR_DUE_TO_MIN_STABLE_CRH_TIME = 6
+ NOTES_CREATED_SOMEWHAT_RECENTLY = 7
 
 
 @dataclass

diff --git a/sourcecode/scoring/enums.py b/sourcecode/scoring/enums.py
@@ -13,6 +13,7 @@ class Scorers(Enum):
  MFExpansionPlusScorer = auto()
  ReputationScorer = auto()
  MFTopicScorer = auto()
+ MFMultiGroupScorer = auto()
 
 
 class Topics(Enum):

diff --git a/sourcecode/scoring/mf_group_scorer.py b/sourcecode/scoring/mf_group_scorer.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Set, Tuple
 
 from . import constants as c
 from .mf_base_scorer import MFBaseScorer, coalesce_columns
@@ -13,7 +13,7 @@
 trialScoringGroup = 14
 
 # Mapping of how many threads to assign to each group scorer
-_groupScorerParalleism = {
+groupScorerParalleism = {
  # Group model 13 is larger and benefits from more threads.
  # Others can default to 4.
  13: 8
@@ -32,8 +32,6 @@ def coalesce_group_model_scored_notes(scoredNotes: pd.DataFrame) -> pd.DataFrame
  c.groupNoteInterceptKey,
  c.groupNoteFactor1Key,
  c.groupRatingStatusKey,
- c.groupNoteInterceptMaxKey,
- c.groupNoteInterceptMinKey,
  c.modelingGroupKey,
  c.groupInternalActiveRulesKey,
  c.groupNumFinalRoundRatingsKey,
@@ -59,9 +57,9 @@ def coalesce_group_model_helpfulness_scores(helpfulnessScores: pd.DataFrame) ->
 class MFGroupScorer(MFBaseScorer):
  def __init__(
  self,
- groupNumber: int,
+ includedGroups: Set[int],
+ groupId: int,
  seed: Optional[int] = None,
- pseudoraters: Optional[bool] = False,
  groupThreshold: float = 0.8,
  saveIntermediateState: bool = False,
  userFactorLambda=None,
@@ -86,6 +84,7 @@ def __init__(
  tagConsensusHarassmentHelpfulRatingPenalty: int = 10,
  tagFilterPercentile: int = 95,
  incorrectFilterThreshold: float = 2.5,
+ threads: int = 4,
  ) -> None:
  """Configure MFGroupScorer object.
 
@@ -104,14 +103,14 @@ def __init__(
  for the model to be active
  """
  super().__init__(
- includedGroups={groupNumber},
+ includedGroups=includedGroups,
  includeUnassigned=False,
  captureThreshold=groupThreshold,
  seed=seed,
- pseudoraters=pseudoraters,
+ pseudoraters=False,
  useStableInitialization=False,
  saveIntermediateState=saveIntermediateState,
- threads=_groupScorerParalleism.get(groupNumber, 4),
+ threads=threads,
  userFactorLambda=userFactorLambda,
  noteFactorLambda=noteFactorLambda,
  userInterceptLambda=userInterceptLambda,
@@ -135,31 +134,30 @@ def __init__(
  tagFilterPercentile=tagFilterPercentile,
  incorrectFilterThreshold=incorrectFilterThreshold,
  )
- assert groupNumber > 0, "groupNumber must be positive. 0 is reserved for unassigned."
- assert groupNumber <= groupScorerCount, "groupNumber exceeds maximum expected groups."
- self._groupNumber = groupNumber
- self._groupNoteInterceptKey = f"{c.groupNoteInterceptKey}_{self._groupNumber}"
- self._groupNoteFactor1Key = f"{c.groupNoteFactor1Key}_{self._groupNumber}"
- self._groupRatingStatusKey = f"{c.groupRatingStatusKey}_{self._groupNumber}"
- self._groupNoteInterceptMaxKey = f"{c.groupNoteInterceptMaxKey}_{self._groupNumber}"
- self._groupNoteInterceptMinKey = f"{c.groupNoteInterceptMinKey}_{self._groupNumber}"
- self._groupInternalActiveRulesKey = f"{c.groupInternalActiveRulesKey}_{self._groupNumber}"
- self._groupNumFinalRoundRatingsKey = f"{c.groupNumFinalRoundRatingsKey}_{self._groupNumber}"
- self._groupRaterInterceptKey = f"{c.groupRaterInterceptKey}_{self._groupNumber}"
- self._groupRaterFactor1Key = f"{c.groupRaterFactor1Key}_{self._groupNumber}"
- self._modelingGroupKey = f"{c.modelingGroupKey}_{self._groupNumber}"
+ assert groupId > 0, "groupNumber must be positive. 0 is reserved for unassigned."
+ self._groupId = groupId
+ self._init_column_names()
+
+ def _init_column_names(self):
+ """Initialize column names based on prefixes and groupId."""
+ self._groupNoteInterceptKey = f"{c.groupNoteInterceptKey}_{self._groupId}"
+ self._groupNoteFactor1Key = f"{c.groupNoteFactor1Key}_{self._groupId}"
+ self._groupRatingStatusKey = f"{c.groupRatingStatusKey}_{self._groupId}"
+ self._groupInternalActiveRulesKey = f"{c.groupInternalActiveRulesKey}_{self._groupId}"
+ self._groupNumFinalRoundRatingsKey = f"{c.groupNumFinalRoundRatingsKey}_{self._groupId}"
+ self._groupRaterInterceptKey = f"{c.groupRaterInterceptKey}_{self._groupId}"
+ self._groupRaterFactor1Key = f"{c.groupRaterFactor1Key}_{self._groupId}"
+ self._modelingGroupKey = f"{c.modelingGroupKey}_{self._groupId}"
 
  def get_name(self):
- return f"MFGroupScorer_{self._groupNumber}"
+ return f"MFGroupScorer_{self._groupId}"
 
  def _get_note_col_mapping(self) -> Dict[str, str]:
  """Returns a dict mapping default note column names to custom names for a specific model."""
  return {
  c.internalNoteInterceptKey: self._groupNoteInterceptKey,
  c.internalNoteFactor1Key: self._groupNoteFactor1Key,
  c.internalRatingStatusKey: self._groupRatingStatusKey,
- c.noteInterceptMinKey: self._groupNoteInterceptMinKey,
- c.noteInterceptMaxKey: self._groupNoteInterceptMaxKey,
  c.internalActiveRulesKey: self._groupInternalActiveRulesKey,
  c.numFinalRoundRatingsKey: self._groupNumFinalRoundRatingsKey,
  c.lowDiligenceNoteInterceptKey: c.lowDiligenceLegacyNoteInterceptKey,
@@ -179,8 +177,6 @@ def get_scored_notes_cols(self) -> List[str]:
  self._groupNoteInterceptKey,
  self._groupNoteFactor1Key,
  self._groupRatingStatusKey,
- self._groupNoteInterceptMaxKey,
- self._groupNoteInterceptMinKey,
  self._groupInternalActiveRulesKey,
  self._modelingGroupKey,
  self._groupNumFinalRoundRatingsKey,
@@ -205,6 +201,8 @@ def _get_dropped_note_cols(self) -> List[str]:
  [
  c.activeFilterTagsKey,
  c.ratingWeightKey,
+ c.noteInterceptMinKey,
+ c.noteInterceptMaxKey,
  ]
  + c.notHelpfulTagsAdjustedColumns
  + c.notHelpfulTagsAdjustedRatioColumns
@@ -261,9 +259,9 @@ def _postprocess_output(
  ),
  how="left",
  )
- userScores = userScores[userScores[c.modelingGroupKey] == self._groupNumber]
+ userScores = userScores[userScores[c.modelingGroupKey].isin(self._includedGroups)]
  userScores = userScores.drop(columns=c.modelingGroupKey)
  # Set the modelingGroupKey column in each output
- noteScores[self._modelingGroupKey] = self._groupNumber
- userScores[self._modelingGroupKey] = self._groupNumber
+ noteScores[self._modelingGroupKey] = self._groupId
+ userScores[self._modelingGroupKey] = self._groupId
  return noteScores, userScores