diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..df49486 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,22 @@ +FROM debian:bookworm + +ENV DEBIAN_FRONTEND="noninteractive" + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + python3-full \ + python3-pip \ + python3-packaging \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY requirements.txt ./ +RUN pip install --break-system-packages -r requirements.txt + +COPY tests/requirements.txt tests/requirements.txt +RUN python3 -m pip install --break-system-packages -r tests/requirements.txt + +COPY . ./ + +RUN python3 -m pip install --break-system-packages . \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..15d1cb0 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,14 @@ +version: '3.9' + +services: + tests: + image: lightning-universe/lightning-hivemind:latest + command: pytest tests/ -v + tty: true + stdin_open: true + build: + context: . 
+ dockerfile: Dockerfile + volumes: + - ./src:/app/src + - ./tests:/app/tests \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 865c6b7..8311e75 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ lightning >=2.0.0 -hivemind >=1.1.0, <=1.1.5; sys_platform == 'linux' +hivemind >=1.1.0, <=1.1.10.post2; sys_platform == 'linux' pydantic <2.0.0 # fixme: lift when resolved diff --git a/src/lightning_hivemind/strategy.py b/src/lightning_hivemind/strategy.py index ad16928..ad6f9ae 100644 --- a/src/lightning_hivemind/strategy.py +++ b/src/lightning_hivemind/strategy.py @@ -113,6 +113,10 @@ class HivemindStrategy(Strategy): bootstrap_timeout: after one of peers responds, await other peers for at most this many seconds + use_relay: enable circuit relay functionality in libp2p (see https://docs.libp2p.io/concepts/nat/circuit-relay/) + + use_auto_relay: look for libp2p relays to become reachable if we are behind NAT/firewall + **optimizer_kwargs: kwargs are passed to the :class:`hivemind.Optimizer` class. 
""" @@ -139,6 +143,8 @@ def __init__( use_ipfs: bool = False, wait_timeout: int = 3, bootstrap_timeout: Optional[float] = None, + use_relay: bool = True, + use_auto_relay: bool = False, **optimizer_kwargs: Any, ): if platform.system() != "Linux": @@ -177,9 +183,11 @@ def __init__( initial_peers=initial_peers, host_maddrs=host_maddrs if host_maddrs is not None else ["/ip4/0.0.0.0/tcp/0", "/ip4/0.0.0.0/udp/0/quic"], use_ipfs=use_ipfs, + ensure_bootstrap_success=bool(not use_ipfs), wait_timeout=wait_timeout, bootstrap_timeout=bootstrap_timeout, - ensure_bootstrap_success=bool(not use_ipfs), + use_relay=use_relay, + use_auto_relay=use_auto_relay ) visible_addresses = [ diff --git a/tests/test_strategy.py b/tests/test_strategy.py index de6497e..e7e72ad 100644 --- a/tests/test_strategy.py +++ b/tests/test_strategy.py @@ -69,6 +69,28 @@ def configure_optimizers(self): ) trainer.fit(model) +@mock.patch.dict(os.environ, {"HIVEMIND_MEMORY_SHARING_STRATEGY": "file_descriptor"}, clear=True) +def test_ipfs_integration(): + class TestModel(BoringModel): + def on_before_backward(self, loss: Tensor) -> None: + scheduler = self.trainer.lr_scheduler_configs[0].scheduler + assert isinstance(scheduler, HiveMindScheduler) + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + return [optimizer], [torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)] + + model = TestModel() + trainer = Trainer( + strategy=HivemindStrategy( + target_batch_size=1, + use_ipfs=True, + use_relay=True, + use_auto_relay=True + ), + fast_dev_run=True, + ) + trainer.fit(model) @mock.patch.dict( os.environ, @@ -139,7 +161,7 @@ def test_raise_exception_no_batch_size(mock__extract_batch_size): [(True, True, True), (False, True, False)], ) def test_warn_if_argument_passed(delay_grad_averaging, delay_state_averaging, delay_optimizer_step): - """Ensure that valid combination of HiveMind delay arguments warn if scheduler isn't passed in as a function.""" + """Ensure 
that valid combination of HiveMind delay arguments warn if scheduler isn't passed in as a function.""" model = BoringModel() trainer = Trainer( strategy=HivemindStrategy(