diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml index 6c1247123c..b60f219e16 100644 --- a/.circleci/unittest/linux/scripts/environment.yml +++ b/.circleci/unittest/linux/scripts/environment.yml @@ -5,7 +5,6 @@ dependencies: - codecov - pip - pip: - - clang-format - dataclasses - nltk - requests diff --git a/.circleci/unittest/linux/scripts/run_style_checks.sh b/.circleci/unittest/linux/scripts/run_style_checks.sh index acfade5349..c44c1d6554 100755 --- a/.circleci/unittest/linux/scripts/run_style_checks.sh +++ b/.circleci/unittest/linux/scripts/run_style_checks.sh @@ -20,9 +20,9 @@ if [ "${status}" -ne 0 ]; then fi printf "\x1b[34mRunning clang-format: " -clang-format --version +./clang-format --version printf "\x1b[0m\n" -git-clang-format origin/master +git-clang-format --binary ./clang-format origin/master git diff --exit-code status=$? exit_status="$((exit_status+status))" diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh index f64c4f49d8..d4f5457906 100755 --- a/.circleci/unittest/linux/scripts/setup_env.sh +++ b/.circleci/unittest/linux/scripts/setup_env.sh @@ -14,6 +14,11 @@ env_dir="${root_dir}/env" cd "${root_dir}" +case "$(uname -s)" in + Darwin*) os=MacOSX;; + *) os=Linux +esac + # 1. Install conda at ./conda if [ ! -d "${conda_dir}" ]; then printf "* Installing conda\n" @@ -32,6 +37,11 @@ conda activate "${env_dir}" # 3. Install Conda dependencies printf "* Installing dependencies (except PyTorch)\n" conda env update --file "${this_dir}/environment.yml" --prune +if [ "${os}" == Linux ] ; then + clangformat_path="${root_dir}/clang-format" + curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o "${clangformat_path}" + chmod +x "${clangformat_path}" +fi # 4. 
Download printf "* Downloading SpaCy English models\n" diff --git a/README.rst b/README.rst index a69be1e1e1..06d5a3ddc9 100644 --- a/README.rst +++ b/README.rst @@ -15,8 +15,11 @@ This repository consists of: * `torchtext.data <#data>`_: Generic data loaders, abstractions, and iterators for text (including vocabulary and word vectors) * `torchtext.datasets <#datasets>`_: Pre-built loaders for common NLP datasets -Note: we are currently re-designing the torchtext library to make it more compatible with pytorch (e.g. ``torch.utils.data``). Several datasets have been written with the new abstractions in `torchtext.experimental `_ folder. We also created an issue to discuss the new abstraction, and users are welcome to leave feedback `link `_. +Note: we are currently re-designing the torchtext library to make it more compatible with pytorch (e.g. ``torch.utils.data``). Several datasets have been written with the new abstractions in `torchtext.experimental `_ folder. We also created an issue to discuss the new abstraction, and users are welcome to leave feedback `link `_. These prototype building blocks and datasets in the experimental folder are available in the nightly release only. The nightly packages are accessible via Pip and Conda for Windows, Mac, and Linux. For example, Linux users can install the nightly wheels with the following command:: + pip install --pre torch torchtext -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html + +For more detailed instructions, please refer to `Install PyTorch `_. It should be noted that the new building blocks are still under development, and the APIs have not been solidified. Installation ============ @@ -28,15 +31,17 @@ We recommend Anaconda as Python package management system. 
Please refer to `pyto :widths: 10, 10, 10 nightly build, master, 3.6+ - 1.5, 0.5, 3.5+ - 1.4, 0.4, "2.7, 3.5+" + 1.7, 0.8, 3.6+ + 1.6, 0.7, 3.6+ + 1.5, 0.6, 3.5+ + 1.4, 0.5, "2.7, 3.5+" 0.4 and below, 0.2.3, "2.7, 3.5+" -Using conda;:: +Using conda:: conda install -c pytorch torchtext -Using pip;:: +Using pip:: pip install torchtext @@ -64,7 +69,13 @@ To build torchtext from source, you need ``git``, ``CMake`` and C++11 compiler s git clone https://github.com/pytorch/text torchtext cd torchtext git submodule update --init --recursive + + # Linux python setup.py clean install + + # OSX + MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py clean install + # or ``python setup.py develop`` if you are making modifications. **Note** diff --git a/setup.py b/setup.py index 7efa31dbc4..381b617596 100644 --- a/setup.py +++ b/setup.py @@ -82,7 +82,7 @@ def run(self): license='BSD', install_requires=[ - 'tqdm', 'requests', 'torch', 'numpy', 'sentencepiece' + 'tqdm', 'requests', 'torch', 'numpy' ], python_requires='>=3.5', classifiers=[ diff --git a/test/data/test_functional.py b/test/data/test_functional.py index f14a46c1cc..8074859070 100644 --- a/test/data/test_functional.py +++ b/test/data/test_functional.py @@ -4,7 +4,6 @@ import uuid import unittest -import sentencepiece as spm import torch from torchtext.data.functional import ( generate_sp_model, @@ -38,11 +37,8 @@ def test_generate_sp_model(self): model_prefix = os.path.join(dir_name, f'spm_user_{uuid.uuid4()}') model_file = f'{model_prefix}.model' generate_sp_model(data_path, vocab_size=23456, model_prefix=model_prefix) - - sp_user = spm.SentencePieceProcessor() - sp_user.Load(model_file) - - self.assertEqual(len(sp_user), 23456) + sp_model = load_sp_model(model_file) + self.assertEqual(sp_model.GetPieceSize(), 23456) def test_sentencepiece_numericalizer(self): test_sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'