Skip to content

Commit

Permalink
Clean and organize run index code (#1090)
Browse files Browse the repository at this point in the history
* Create entrypoint for cli and api (#1067)

* Add cli and api entrypoints for update index

* Semver

* Update docs

* Run tests on feature branch main

* Better /main handling in tests

* Clean and organize run index code

* Ruff fix

* Pyright fix

* Format fixes

* Pyright fix

* Format

* Fix integ tests

* Fix ruff

* Reorganize and clean up
  • Loading branch information
AlonsoGuevara authored Sep 5, 2024
1 parent 2d45ece commit 044516f
Show file tree
Hide file tree
Showing 20 changed files with 742 additions and 488 deletions.
10 changes: 7 additions & 3 deletions .github/workflows/python-ci.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
name: Python CI
on:
push:
branches: [main]
branches:
- "**/main" # Matches branches like feature/main
- "main" # Matches the main branch
pull_request:
branches: [main]
branches:
- "**/main"
- "main"

permissions:
contents: read
Expand Down Expand Up @@ -72,4 +76,4 @@ jobs:
- name: Unit Test
run: |
poetry run poe test_unit
poetry run poe test_unit
8 changes: 6 additions & 2 deletions .github/workflows/python-integration-tests.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
name: Python Integration Tests
on:
push:
branches: [main]
branches:
- "**/main" # Matches branches like feature/main
- "main" # Matches the main branch
pull_request:
branches: [main]
branches:
- "**/main"
- "main"

permissions:
contents: read
Expand Down
9 changes: 6 additions & 3 deletions .github/workflows/python-notebook-tests.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
name: Python Notebook Tests
on:
push:
branches: [main]
branches:
- "**/main" # Matches branches like feature/main
- "main" # Matches the main branch
pull_request:
branches: [main]
branches:
- "**/main"
- "main"

permissions:
contents: read
Expand Down Expand Up @@ -64,7 +68,6 @@ jobs:
poetry run python -m pip install gensim
poetry install
- name: Notebook Test
run: |
poetry run poe test_notebook
8 changes: 6 additions & 2 deletions .github/workflows/python-smoke-tests.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
name: Python Smoke Tests
on:
push:
branches: [main]
branches:
- "**/main" # Matches branches like feature/main
- "main" # Matches the main branch
pull_request:
branches: [main]
branches:
- "**/main"
- "main"

permissions:
contents: read
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/spellcheck.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on:
branches: [main]
pull_request:
paths:
- '**/*'
- "**/*"
jobs:
spellcheck:
runs-on: ubuntu-latest
Expand Down
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20240830181135475287.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Add entrypoints for incremental indexing"
}
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20240903205022597458.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Clean up and organize run index code"
}
13 changes: 13 additions & 0 deletions graphrag/index/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,25 @@
help="Skip any preflight validation. Useful when running no LLM steps.",
action="store_true",
)
parser.add_argument(
"--update-index",
help="Update a given index run id, leveraging previous outputs and applying new indexes.",
# Only required if config is not defined
required=False,
default=None,
type=str,
)
args = parser.parse_args()

if args.resume and args.update_index:
msg = "Cannot resume and update a run at the same time."
raise ValueError(msg)

index_cli(
root_dir=args.root,
verbose=args.verbose or False,
resume=args.resume,
update_index_id=args.update_index,
memprofile=args.memprofile or False,
nocache=args.nocache or False,
reporter=args.reporter,
Expand Down
20 changes: 14 additions & 6 deletions graphrag/index/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
Backwards compatibility is not guaranteed at this time.
"""

from pathlib import Path

from graphrag.config import CacheType, GraphRagConfig

from .cache.noop_pipeline_cache import NoopPipelineCache
Expand All @@ -24,8 +22,10 @@

async def build_index(
config: GraphRagConfig,
run_id: str,
memory_profile: bool,
run_id: str = "",
is_resume_run: bool = False,
is_update_run: bool = False,
memory_profile: bool = False,
progress_reporter: ProgressReporter | None = None,
emit: list[str] | None = None,
) -> list[PipelineRunResult]:
Expand All @@ -37,6 +37,10 @@ async def build_index(
The configuration.
run_id : str
The run id. Creates an output directory with this name.
is_resume_run : bool default=False
Whether to resume a previous index run.
is_update_run : bool default=False
Whether to update a previous index run.
memory_profile : bool
Whether to enable memory profiling.
progress_reporter : ProgressReporter | None default=None
Expand All @@ -50,7 +54,10 @@ async def build_index(
list[PipelineRunResult]
The list of pipeline run results
"""
resume = Path(config.storage.base_dir).exists()
if is_resume_run and is_update_run:
msg = "Cannot resume and update a run at the same time."
raise ValueError(msg)

pipeline_config = create_pipeline_config(config)
pipeline_cache = (
NoopPipelineCache() if config.cache.type == CacheType.none is None else None
Expand All @@ -63,7 +70,8 @@ async def build_index(
cache=pipeline_cache,
progress_reporter=progress_reporter,
emit=([TableEmitterType(e) for e in emit] if emit is not None else None),
is_resume_run=resume,
is_resume_run=is_resume_run,
is_update_run=is_update_run,
):
outputs.append(output)
if progress_reporter:
Expand Down
15 changes: 9 additions & 6 deletions graphrag/index/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def index_cli(
init: bool,
verbose: bool,
resume: str | None,
update_index_id: str | None,
memprofile: bool,
nocache: bool,
reporter: str | None,
Expand All @@ -112,7 +113,7 @@ def index_cli(
"""Run the pipeline with the given config."""
progress_reporter = load_progress_reporter(reporter or "rich")
info, error, success = _logger(progress_reporter)
run_id = resume or time.strftime("%Y%m%d-%H%M%S")
run_id = resume or update_index_id or time.strftime("%Y%m%d-%H%M%S")

if init:
_initialize_project_at(root_dir, progress_reporter)
Expand Down Expand Up @@ -152,11 +153,13 @@ def index_cli(

outputs = asyncio.run(
build_index(
config,
run_id,
memprofile,
progress_reporter,
pipeline_emit,
config=config,
run_id=run_id,
is_resume_run=bool(resume),
is_update_run=bool(update_index_id),
memory_profile=memprofile,
progress_reporter=progress_reporter,
emit=pipeline_emit,
)
)
encountered_errors = any(
Expand Down
Loading

0 comments on commit 044516f

Please sign in to comment.