Skip to content

Commit

Permalink
Collapse verbs: create_final_text_units (#1143)
Browse files: browse the repository at this point in the history.
* Load default config in verb tests

* Load proper workflow config

* Collapse text unit pre-embedding steps

* Format

* Update smoke tests

* Semver

* Format

* Merge join* subflows into create_final_text_units

* Remove join_text_units_to_covariate_ids

* Format

* Remove join_text_units_to_entity_ids

* Remove join_text_units_to_relationship_ids

* Clean up merges and aggregations

* Remove unnecessary cast
  • Loading branch information
natoverse authored Sep 17, 2024
1 parent f7f96c3 commit a473265
Show file tree
Hide file tree
Showing 23 changed files with 274 additions and 500 deletions.
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20240917004639047120.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Collapse create_final_text_units."
}
30 changes: 1 addition & 29 deletions graphrag/index/create_pipeline_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,6 @@
create_final_relationships,
create_final_text_units,
create_summarized_entities,
join_text_units_to_covariate_ids,
join_text_units_to_entity_ids,
join_text_units_to_relationship_ids,
)

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -113,7 +110,7 @@ def create_pipeline_config(settings: GraphRagConfig, verbose=False) -> PipelineC
if verbose:
_log_llm_settings(settings)

skip_workflows = _determine_skip_workflows(settings)
skip_workflows = settings.skip_workflows
embedded_fields = _get_embedded_fields(settings)
covariates_enabled = (
settings.claim_extraction.enabled
Expand Down Expand Up @@ -152,16 +149,6 @@ def _get_embedded_fields(settings: GraphRagConfig) -> set[str]:
raise ValueError(msg)


def _determine_skip_workflows(settings: GraphRagConfig) -> list[str]:
    """Return the list of workflows to skip for the given settings.

    Starts from ``settings.skip_workflows``. If ``create_final_covariates``
    is being skipped and ``join_text_units_to_covariate_ids`` is not already
    listed, the latter is added to the result as well.

    Args:
        settings: Configuration whose ``skip_workflows`` seeds the result.

    Returns:
        A new list of workflow names; ``settings.skip_workflows`` itself is
        left unmodified.
    """
    # Work on a copy: appending to settings.skip_workflows directly would
    # silently mutate the caller's configuration object as a side effect.
    skip_workflows = list(settings.skip_workflows)
    if (
        create_final_covariates in skip_workflows
        and join_text_units_to_covariate_ids not in skip_workflows
    ):
        skip_workflows.append(join_text_units_to_covariate_ids)
    return skip_workflows


def _log_llm_settings(settings: GraphRagConfig) -> None:
log.info(
"Using LLM Config %s",
Expand Down Expand Up @@ -229,21 +216,6 @@ def _text_unit_workflows(
},
},
),
PipelineWorkflowReference(
name=join_text_units_to_entity_ids,
),
PipelineWorkflowReference(
name=join_text_units_to_relationship_ids,
),
*(
[
PipelineWorkflowReference(
name=join_text_units_to_covariate_ids,
)
]
if covariates_enabled
else []
),
PipelineWorkflowReference(
name=create_final_text_units,
config={
Expand Down
22 changes: 1 addition & 21 deletions graphrag/index/workflows/default_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,24 +85,7 @@
from .v1.create_summarized_entities import (
workflow_name as create_summarized_entities,
)
from .v1.join_text_units_to_covariate_ids import (
build_steps as join_text_units_to_covariate_ids_steps,
)
from .v1.join_text_units_to_covariate_ids import (
workflow_name as join_text_units_to_covariate_ids,
)
from .v1.join_text_units_to_entity_ids import (
build_steps as join_text_units_to_entity_ids_steps,
)
from .v1.join_text_units_to_entity_ids import (
workflow_name as join_text_units_to_entity_ids,
)
from .v1.join_text_units_to_relationship_ids import (
build_steps as join_text_units_to_relationship_ids_steps,
)
from .v1.join_text_units_to_relationship_ids import (
workflow_name as join_text_units_to_relationship_ids,
)


default_workflows: WorkflowDefinitions = {
create_base_extracted_entities: build_create_base_extracted_entities_steps,
Expand All @@ -118,7 +101,4 @@
create_final_entities: build_create_final_entities_steps,
create_final_communities: build_create_final_communities_steps,
create_summarized_entities: build_create_summarized_entities_steps,
join_text_units_to_entity_ids: join_text_units_to_entity_ids_steps,
join_text_units_to_covariate_ids: join_text_units_to_covariate_ids_steps,
join_text_units_to_relationship_ids: join_text_units_to_relationship_ids_steps,
}
103 changes: 11 additions & 92 deletions graphrag/index/workflows/v1/create_final_text_units.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,103 +28,22 @@ def build_steps(
is not None
)

others = [
"workflow:create_final_entities",
"workflow:create_final_relationships",
]
if covariates_enabled:
others.append("workflow:create_final_covariates")

return [
{
"verb": "select",
"args": {"columns": ["id", "chunk", "document_ids", "n_tokens"]},
"input": {"source": "workflow:create_base_text_units"},
},
{
"id": "pre_entity_join",
"verb": "rename",
"args": {
"columns": {
"chunk": "text",
},
},
},
# Expand the TextUnits with EntityIDs
{
"id": "pre_relationship_join",
"verb": "join",
"args": {
"on": ["id", "id"],
"strategy": "left outer",
},
"input": {
"source": "pre_entity_join",
"others": ["workflow:join_text_units_to_entity_ids"],
},
},
# Expand the TextUnits with RelationshipIDs
{
"id": "pre_covariate_join",
"verb": "join",
"verb": "create_final_text_units_pre_embedding",
"args": {
"on": ["id", "id"],
"strategy": "left outer",
"covariates_enabled": covariates_enabled,
},
"input": {
"source": "pre_relationship_join",
"others": ["workflow:join_text_units_to_relationship_ids"],
},
},
# Expand the TextUnits with CovariateIDs
{
"enabled": covariates_enabled,
"verb": "join",
"args": {
"on": ["id", "id"],
"strategy": "left outer",
},
"input": {
"source": "pre_covariate_join",
"others": ["workflow:join_text_units_to_covariate_ids"],
},
},
# Mash the entities and relationships into arrays
{
"verb": "aggregate_override",
"args": {
"groupby": ["id"], # from the join above
"aggregations": [
{
"column": "text",
"operation": "any",
"to": "text",
},
{
"column": "n_tokens",
"operation": "any",
"to": "n_tokens",
},
{
"column": "document_ids",
"operation": "any",
"to": "document_ids",
},
{
"column": "entity_ids",
"operation": "any",
"to": "entity_ids",
},
{
"column": "relationship_ids",
"operation": "any",
"to": "relationship_ids",
},
*(
[]
if not covariates_enabled
else [
{
"column": "covariate_ids",
"operation": "any",
"to": "covariate_ids",
}
]
),
],
"source": "workflow:create_base_text_units",
"others": others,
},
},
# Text-Embed after final aggregations
Expand Down
41 changes: 0 additions & 41 deletions graphrag/index/workflows/v1/join_text_units_to_covariate_ids.py

This file was deleted.

42 changes: 0 additions & 42 deletions graphrag/index/workflows/v1/join_text_units_to_entity_ids.py

This file was deleted.

43 changes: 0 additions & 43 deletions graphrag/index/workflows/v1/join_text_units_to_relationship_ids.py

This file was deleted.

8 changes: 2 additions & 6 deletions graphrag/index/workflows/v1/subflows/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,8 @@

"""The Indexing Engine workflows -> subflows package root."""

from .join_text_units_to_covariate_ids import join_text_units_to_covariate_ids
from .join_text_units_to_entity_ids import join_text_units_to_entity_ids
from .join_text_units_to_relationship_ids import join_text_units_to_relationship_ids
from .create_final_text_units_pre_embedding import create_final_text_units_pre_embedding

__all__ = [
"join_text_units_to_covariate_ids",
"join_text_units_to_entity_ids",
"join_text_units_to_relationship_ids",
"create_final_text_units_pre_embedding",
]
Loading

0 comments on commit a473265

Please sign in to comment.