Skip to content

Commit

Permalink
Merge pull request #256 from twitter/jbaxter/2024_08_12
Browse files Browse the repository at this point in the history
Multi-group models, Support NMRDueToStableCRHTime, & more
  • Loading branch information
jbaxter authored Aug 12, 2024
2 parents 779c728 + 9cf9458 commit 956d8bd
Show file tree
Hide file tree
Showing 8 changed files with 540 additions and 165 deletions.
113 changes: 77 additions & 36 deletions sourcecode/scoring/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from enum import Enum
import os
import time
from typing import Dict, Optional
from typing import Dict, Optional, Set

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -34,12 +34,17 @@
intervalHalfWidth = 0.3

# Max flip rates
prescoringAllUnlockedNotesMaxCrhChurn = 0.04
finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.03
prescoringAllUnlockedNotesMaxCrhChurn = 0.2
prescoringAllNotesCreatedThreeToThirteenDaysAgoMaxChurn = 0.06
finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.05
finalNotesWithNewRatingsMaxNewCrhChurn = 0.80
finalNotesWithNewRatingsMaxOldCrhChurn = 0.25
finalNotesThatJustFlippedStatusMaxCrhChurn = 1e8
finalNotesThatFlippedRecentlyMaxCrhChurn = 1e8
# TODO(jiansongc): adjust these 2 below
finalNotesNmrDueToMinStableCrhTimeMaxOldCrhChurn = 1.0
finalNotesNmrDueToMinStableCrhTimeMaxNewCrhChurn = 1.0


# Data Filenames
scoredNotesOutputPath = "scoredNotes.tsv"
Expand All @@ -59,17 +64,14 @@
authorTopNotHelpfulTagValues = "authorTopNotHelpfulTagValues"
modelingPopulationKey = "modelingPopulation"
modelingGroupKey = "modelingGroup"
modelingMultiGroupKey = "modelingMultiGroup"
numberOfTimesEarnedOutKey = "numberOfTimesEarnedOut"
defaultIndexKey = "index"

# Scoring Groups
coreGroups = {1, 2, 3, 6, 8, 9, 10, 11, 13, 14, 19, 21, 25}
expansionGroups = (
# Divide into 3 grouping aggregates to prepare for multi-group models,
# and a 4th group containing leftovers
{0, 15, 17, 24, 29, 30} | {4, 5, 7, 12, 26} | {27} | {16, 20, 22, 23, 28}
)
expansionPlusGroups = {18}
coreGroups: Set[int] = {1, 2, 3, 6, 8, 9, 10, 11, 13, 14, 19, 21, 25}
expansionGroups: Set[int] = {0, 4, 5, 7, 12, 16, 18, 20, 22, 23, 24, 26, 27, 28}
expansionPlusGroups: Set[int] = {15, 17, 29, 30}

# TSV Values
notHelpfulValueTsv = "NOT_HELPFUL"
Expand Down Expand Up @@ -193,6 +195,14 @@ def rater_factor_key(i):
groupRaterFactor1Key = "groupRaterFactor1"
groupInternalActiveRulesKey = "groupActiveRules"
groupNumFinalRoundRatingsKey = "groupNumFinalRoundRatings"
# MultiGroup Model
multiGroupNoteInterceptKey = "multiGroupNoteIntercept"
multiGroupNoteFactor1Key = "multiGroupNoteFactor1"
multiGroupRatingStatusKey = "multiGroupRatingStatus"
multiGroupRaterInterceptKey = "multiGroupRaterIntercept"
multiGroupRaterFactor1Key = "multiGroupRaterFactor1"
multiGroupInternalActiveRulesKey = "multiGroupActiveRules"
multiGroupNumFinalRoundRatingsKey = "multiGroupNumFinalRoundRatings"
# Topic Model
topicNoteInterceptKey = "topicNoteIntercept"
topicNoteFactor1Key = "topicNoteFactor1"
Expand Down Expand Up @@ -445,6 +455,12 @@ def rater_factor_key(i):
currentDecidedByKey = "currentDecidedBy"
currentModelingGroupKey = "currentModelingGroup"
timestampMillisOfMostRecentStatusChangeKey = "timestampMillisOfMostRecentStatusChange"
currentMultiGroupStatusKey = "currentMultiGroupStatus"
currentModelingMultiGroupKey = "currentModelingMultiGroup"
timestampMillisOfNmrDueToMinStableCrhTimeKey = "timestampMillisOfNmrDueToMinStableCrhTime"
updatedTimestampMillisOfNmrDueToMinStableCrhTimeKey = (
"updatedTimestampMillisOfNmrDueToMinStableCrhTime"
)

noteStatusHistoryTSVColumnsAndTypes = [
(noteIdKey, np.int64),
Expand All @@ -465,12 +481,22 @@ def rater_factor_key(i):
(currentDecidedByKey, "category"),
(currentModelingGroupKey, np.double), # TODO: int
(timestampMillisOfMostRecentStatusChangeKey, np.double), # double because nullable.
(timestampMillisOfNmrDueToMinStableCrhTimeKey, np.double), # double because nullable.
(currentMultiGroupStatusKey, "category"),
(currentModelingMultiGroupKey, np.double), # TODO: int
]
noteStatusHistoryTSVColumns = [col for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
noteStatusHistoryTSVTypes = [dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
noteStatusHistoryTSVTypeMapping = {
col: dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes
}
# TODO(jiansongc): clean up after new column is in production.
noteStatusHistoryTSVColumnsOld = noteStatusHistoryTSVColumns[:-1]
noteStatusHistoryTSVColumnsAndTypesOld = noteStatusHistoryTSVColumnsAndTypes[:-1]
noteStatusHistoryTSVTypeMappingOld = {
col: dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypesOld
}


# Earn In + Earn Out
enrollmentState = "enrollmentState"
Expand Down Expand Up @@ -587,6 +613,8 @@ def rater_factor_key(i):
{
coverageNoteInterceptMinKey,
coverageNoteInterceptMaxKey,
groupNoteInterceptMinKey,
groupNoteInterceptMaxKey,
}
)

Expand All @@ -610,58 +638,64 @@ def rater_factor_key(i):
(noteIdKey, np.int64),
(coreNoteInterceptKey, np.double),
(coreNoteFactor1Key, np.double),
(finalRatingStatusKey, str),
(firstTagKey, str),
(secondTagKey, str),
(finalRatingStatusKey, "category"),
(firstTagKey, "category"),
(secondTagKey, "category"),
# Note that this column was formerly named "activeRules" and the name is now
# updated to "coreActiveRules". The data values remain the compatible,
# but the new column only contains rules that ran when deciding status based on
# the core model.
(coreActiveRulesKey, str),
(activeFilterTagsKey, str),
(classificationKey, str),
(coreActiveRulesKey, "category"),
(activeFilterTagsKey, "category"),
(classificationKey, "category"),
(createdAtMillisKey, np.int64),
(coreRatingStatusKey, str),
(metaScorerActiveRulesKey, str),
(decidedByKey, str),
(coreRatingStatusKey, "category"),
(metaScorerActiveRulesKey, "category"),
(decidedByKey, "category"),
(expansionNoteInterceptKey, np.double),
(expansionNoteFactor1Key, np.double),
(expansionRatingStatusKey, str),
(expansionRatingStatusKey, "category"),
(coverageNoteInterceptKey, np.double),
(coverageNoteFactor1Key, np.double),
(coverageRatingStatusKey, str),
(coverageRatingStatusKey, "category"),
(coreNoteInterceptMinKey, np.double),
(coreNoteInterceptMaxKey, np.double),
(expansionNoteInterceptMinKey, np.double),
(expansionNoteInterceptMaxKey, np.double),
(coverageNoteInterceptMinKey, np.double),
(coverageNoteInterceptMaxKey, np.double),
(expansionNoteInterceptMinKey, "category"), # category because always nan
(expansionNoteInterceptMaxKey, "category"), # category because always nan
(coverageNoteInterceptMinKey, "category"), # category because always nan
(coverageNoteInterceptMaxKey, "category"), # category because always nan
(groupNoteInterceptKey, np.double),
(groupNoteFactor1Key, np.double),
(groupRatingStatusKey, str),
(groupNoteInterceptMaxKey, np.double),
(groupNoteInterceptMinKey, np.double),
(groupRatingStatusKey, "category"),
(groupNoteInterceptMaxKey, "category"), # category because always nan
(groupNoteInterceptMinKey, "category"), # category because always nan
(modelingGroupKey, np.float64),
(numRatingsKey, np.int64),
(timestampMillisOfNoteCurrentLabelKey, np.double),
(expansionPlusNoteInterceptKey, np.double),
(expansionPlusNoteFactor1Key, np.double),
(expansionPlusRatingStatusKey, str),
(expansionPlusRatingStatusKey, "category"),
(topicNoteInterceptKey, np.double),
(topicNoteFactor1Key, np.double),
(topicRatingStatusKey, str),
(noteTopicKey, str),
(topicRatingStatusKey, "category"),
(noteTopicKey, "category"),
(topicNoteConfidentKey, pd.BooleanDtype()),
(expansionInternalActiveRulesKey, str),
(expansionPlusInternalActiveRulesKey, str),
(groupInternalActiveRulesKey, str),
(topicInternalActiveRulesKey, str),
(expansionInternalActiveRulesKey, "category"),
(expansionPlusInternalActiveRulesKey, "category"),
(groupInternalActiveRulesKey, "category"),
(topicInternalActiveRulesKey, "category"),
(coreNumFinalRoundRatingsKey, np.double), # double because nullable.
(expansionNumFinalRoundRatingsKey, np.double), # double because nullable.
(expansionPlusNumFinalRoundRatingsKey, np.double), # double because nullable.
(groupNumFinalRoundRatingsKey, np.double), # double because nullable.
(topicNumFinalRoundRatingsKey, np.double), # double because nullable.
(rescoringActiveRulesKey, str),
(rescoringActiveRulesKey, "category"),
(multiGroupNoteInterceptKey, np.double),
(multiGroupNoteFactor1Key, np.double),
(multiGroupRatingStatusKey, str),
(modelingMultiGroupKey, np.float64),
(multiGroupInternalActiveRulesKey, str),
(multiGroupNumFinalRoundRatingsKey, np.double), # double because nullable.
]
noteModelOutputTSVColumns = [col for (col, dtype) in noteModelOutputTSVColumnsAndTypes]
noteModelOutputTSVTypeMapping = {col: dtype for (col, dtype) in noteModelOutputTSVColumnsAndTypes}
Expand Down Expand Up @@ -733,6 +767,9 @@ def rater_factor_key(i):
(expansionRaterFactor1Key, np.double),
(expansionPlusRaterInterceptKey, np.double),
(expansionPlusRaterFactor1Key, np.double),
(multiGroupRaterInterceptKey, np.double),
(multiGroupRaterFactor1Key, np.double),
(modelingMultiGroupKey, np.float64),
]
raterModelOutputTSVColumns = [col for (col, dtype) in raterModelOutputTSVColumnsAndTypes]
raterModelOutputTSVTypeMapping = {col: dtype for (col, dtype) in raterModelOutputTSVColumnsAndTypes}
Expand Down Expand Up @@ -781,6 +818,8 @@ def rater_factor_key(i):
inputPathsTSVColumns = [col for (col, _) in inputPathsTSVColumnsAndTypes]
inputPathsTSVTypeMapping = {col: dtype for (col, dtype) in inputPathsTSVColumnsAndTypes}

timestampMinuteOfFinalScoringOutput = "timestampMinuteOfFinalScoringOutput"


@contextmanager
def time_block(label):
Expand Down Expand Up @@ -888,6 +927,8 @@ class RescoringRuleID(Enum):
NOTES_FLIPPED_PREVIOUS_RUN = 3
NEW_NOTES_NOT_RESCORED_RECENTLY_ENOUGH = 4
RECENTLY_FLIPPED_NOTES_NOT_RESCORED_RECENTLY_ENOUGH = 5
NMR_DUE_TO_MIN_STABLE_CRH_TIME = 6
NOTES_CREATED_SOMEWHAT_RECENTLY = 7


@dataclass
Expand Down
1 change: 1 addition & 0 deletions sourcecode/scoring/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class Scorers(Enum):
MFExpansionPlusScorer = auto()
ReputationScorer = auto()
MFTopicScorer = auto()
MFMultiGroupScorer = auto()


class Topics(Enum):
Expand Down
58 changes: 28 additions & 30 deletions sourcecode/scoring/mf_group_scorer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Set, Tuple

from . import constants as c
from .mf_base_scorer import MFBaseScorer, coalesce_columns
Expand All @@ -13,7 +13,7 @@
trialScoringGroup = 14

# Mapping of how many threads to assign to each group scorer
_groupScorerParalleism = {
groupScorerParalleism = {
# Group model 13 is larger and benefits from more threads.
# Others can default to 4.
13: 8
Expand All @@ -32,8 +32,6 @@ def coalesce_group_model_scored_notes(scoredNotes: pd.DataFrame) -> pd.DataFrame
c.groupNoteInterceptKey,
c.groupNoteFactor1Key,
c.groupRatingStatusKey,
c.groupNoteInterceptMaxKey,
c.groupNoteInterceptMinKey,
c.modelingGroupKey,
c.groupInternalActiveRulesKey,
c.groupNumFinalRoundRatingsKey,
Expand All @@ -59,9 +57,9 @@ def coalesce_group_model_helpfulness_scores(helpfulnessScores: pd.DataFrame) ->
class MFGroupScorer(MFBaseScorer):
def __init__(
self,
groupNumber: int,
includedGroups: Set[int],
groupId: int,
seed: Optional[int] = None,
pseudoraters: Optional[bool] = False,
groupThreshold: float = 0.8,
saveIntermediateState: bool = False,
userFactorLambda=None,
Expand All @@ -86,6 +84,7 @@ def __init__(
tagConsensusHarassmentHelpfulRatingPenalty: int = 10,
tagFilterPercentile: int = 95,
incorrectFilterThreshold: float = 2.5,
threads: int = 4,
) -> None:
"""Configure MFGroupScorer object.
Expand All @@ -104,14 +103,14 @@ def __init__(
for the model to be active
"""
super().__init__(
includedGroups={groupNumber},
includedGroups=includedGroups,
includeUnassigned=False,
captureThreshold=groupThreshold,
seed=seed,
pseudoraters=pseudoraters,
pseudoraters=False,
useStableInitialization=False,
saveIntermediateState=saveIntermediateState,
threads=_groupScorerParalleism.get(groupNumber, 4),
threads=threads,
userFactorLambda=userFactorLambda,
noteFactorLambda=noteFactorLambda,
userInterceptLambda=userInterceptLambda,
Expand All @@ -135,31 +134,30 @@ def __init__(
tagFilterPercentile=tagFilterPercentile,
incorrectFilterThreshold=incorrectFilterThreshold,
)
assert groupNumber > 0, "groupNumber must be positive. 0 is reserved for unassigned."
assert groupNumber <= groupScorerCount, "groupNumber exceeds maximum expected groups."
self._groupNumber = groupNumber
self._groupNoteInterceptKey = f"{c.groupNoteInterceptKey}_{self._groupNumber}"
self._groupNoteFactor1Key = f"{c.groupNoteFactor1Key}_{self._groupNumber}"
self._groupRatingStatusKey = f"{c.groupRatingStatusKey}_{self._groupNumber}"
self._groupNoteInterceptMaxKey = f"{c.groupNoteInterceptMaxKey}_{self._groupNumber}"
self._groupNoteInterceptMinKey = f"{c.groupNoteInterceptMinKey}_{self._groupNumber}"
self._groupInternalActiveRulesKey = f"{c.groupInternalActiveRulesKey}_{self._groupNumber}"
self._groupNumFinalRoundRatingsKey = f"{c.groupNumFinalRoundRatingsKey}_{self._groupNumber}"
self._groupRaterInterceptKey = f"{c.groupRaterInterceptKey}_{self._groupNumber}"
self._groupRaterFactor1Key = f"{c.groupRaterFactor1Key}_{self._groupNumber}"
self._modelingGroupKey = f"{c.modelingGroupKey}_{self._groupNumber}"
assert groupId > 0, "groupNumber must be positive. 0 is reserved for unassigned."
self._groupId = groupId
self._init_column_names()

def _init_column_names(self):
"""Initialize column names based on prefixes and groupId."""
self._groupNoteInterceptKey = f"{c.groupNoteInterceptKey}_{self._groupId}"
self._groupNoteFactor1Key = f"{c.groupNoteFactor1Key}_{self._groupId}"
self._groupRatingStatusKey = f"{c.groupRatingStatusKey}_{self._groupId}"
self._groupInternalActiveRulesKey = f"{c.groupInternalActiveRulesKey}_{self._groupId}"
self._groupNumFinalRoundRatingsKey = f"{c.groupNumFinalRoundRatingsKey}_{self._groupId}"
self._groupRaterInterceptKey = f"{c.groupRaterInterceptKey}_{self._groupId}"
self._groupRaterFactor1Key = f"{c.groupRaterFactor1Key}_{self._groupId}"
self._modelingGroupKey = f"{c.modelingGroupKey}_{self._groupId}"

def get_name(self):
return f"MFGroupScorer_{self._groupNumber}"
return f"MFGroupScorer_{self._groupId}"

def _get_note_col_mapping(self) -> Dict[str, str]:
"""Returns a dict mapping default note column names to custom names for a specific model."""
return {
c.internalNoteInterceptKey: self._groupNoteInterceptKey,
c.internalNoteFactor1Key: self._groupNoteFactor1Key,
c.internalRatingStatusKey: self._groupRatingStatusKey,
c.noteInterceptMinKey: self._groupNoteInterceptMinKey,
c.noteInterceptMaxKey: self._groupNoteInterceptMaxKey,
c.internalActiveRulesKey: self._groupInternalActiveRulesKey,
c.numFinalRoundRatingsKey: self._groupNumFinalRoundRatingsKey,
c.lowDiligenceNoteInterceptKey: c.lowDiligenceLegacyNoteInterceptKey,
Expand All @@ -179,8 +177,6 @@ def get_scored_notes_cols(self) -> List[str]:
self._groupNoteInterceptKey,
self._groupNoteFactor1Key,
self._groupRatingStatusKey,
self._groupNoteInterceptMaxKey,
self._groupNoteInterceptMinKey,
self._groupInternalActiveRulesKey,
self._modelingGroupKey,
self._groupNumFinalRoundRatingsKey,
Expand All @@ -205,6 +201,8 @@ def _get_dropped_note_cols(self) -> List[str]:
[
c.activeFilterTagsKey,
c.ratingWeightKey,
c.noteInterceptMinKey,
c.noteInterceptMaxKey,
]
+ c.notHelpfulTagsAdjustedColumns
+ c.notHelpfulTagsAdjustedRatioColumns
Expand Down Expand Up @@ -261,9 +259,9 @@ def _postprocess_output(
),
how="left",
)
userScores = userScores[userScores[c.modelingGroupKey] == self._groupNumber]
userScores = userScores[userScores[c.modelingGroupKey].isin(self._includedGroups)]
userScores = userScores.drop(columns=c.modelingGroupKey)
# Set the modelingGroupKey column in each output
noteScores[self._modelingGroupKey] = self._groupNumber
userScores[self._modelingGroupKey] = self._groupNumber
noteScores[self._modelingGroupKey] = self._groupId
userScores[self._modelingGroupKey] = self._groupId
return noteScores, userScores
Loading

0 comments on commit 956d8bd

Please sign in to comment.