Commit
Merge pull request #38 from nathan-weinberg/spellcheck
ci: add spellchecker
russellb authored Apr 29, 2024
2 parents c4c6292 + aa3e7c2 commit fbf3719
Showing 6 changed files with 186 additions and 2 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/spellcheck.yml
@@ -0,0 +1,22 @@
# SPDX-License-Identifier: Apache-2.0

name: Spellcheck

on:
  pull_request:
    branches: [main]
    paths:
      - '**.md'

permissions:
  contents: read

jobs:
  spellcheck:
    name: Spellcheck (en_US)
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Spellcheck
        uses: rojopolis/spellcheck-github-actions@0.35.0
3 changes: 3 additions & 0 deletions .gitignore
@@ -1 +1,4 @@
.DS_Store

# Spelling
dictionary.dic
122 changes: 122 additions & 0 deletions .spellcheck-en-custom.txt
@@ -0,0 +1,122 @@
# make spellcheck-sort
# Please keep this file sorted:
Abhishek
Akash
AMDGPU
arge
arXiv
backend
backends
benchmarking
Bhandwaldar
CLI
cli
Colab
compositional
Conda
Containerfile
cpp
cuBLAS
CUDA
dataset
DCO
dev
ditaa
dr
Dropdown
env
Eval
Excalidraw
Finetuning
GFX
GGUF
GGUFs
GiB
Gmail
gpu
hipBLAS
ilab
impactful
Inferencing
instructlab
ISA
JIT
Jupyter
KAGGLE
Kaggle
Kaggle's
Kai
Kubernetes
lignment
LLM
llms
LLVM
lora
md
Mergify
Merlinite
mimimum
Miniforge
Mixtral
MLX
mlx
NVidia
Nvidia
orchestrator
ots
Pareja
PEFT
PlantUML
PNG
Podman
pre
preprint
PR's
pyenv
PyPI
PyTorch
qlora
quantized
Quantizing
Radeon
RDNA
README
rebase
repo
ROCm
RTX
RX
safetensors
Salawu
SDG
sexualized
SHA
Shivchander
Signoff
Srivastava
subdirectory
Sudalairaj
Taj
tatsu
TBD
Tesla
th
th
tl
tox
triager
Triagers
triager's
triagers
unquantized
USM
venv
watsonx
wikisql
WSL
xcode
XNACK
XT
XTX
Xu
YAML
29 changes: 29 additions & 0 deletions .spellcheck.yml
@@ -0,0 +1,29 @@

# SPDX-License-Identifier: Apache-2.0

matrix:
- name: markdown
  aspell:
    lang: en
    d: en_US
  camel-case: true
  mode: markdown
  sources:
  - "**/*.md|!REVIEWERS.md|!build/**|!.tox/**"
  dictionary:
    wordlists:
    - .spellcheck-en-custom.txt
  pipeline:
  - pyspelling.filters.context:
      context_visible_first: true
      escapes: '\\[\\`~]'
      delimiters:
      # Ignore multiline content between fences (fences can have 3 or more back ticks)
      # ```language
      # content
      # ```
      - open: '(?s)^(?P<open> *`{3,}).*?$'
        close: '^(?P=open)$'
      # Ignore text between inline back ticks
      - open: '(?P<open>`+)'
        close: '(?P=open)'
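The two delimiter pairs in the config above tell pyspelling's context filter to skip fenced and inline code before spellchecking. As a rough sketch of what they ignore (the combined single-pattern regexes and the `strip_code` helper are illustrative assumptions, not the actual pyspelling implementation, which applies `open` and `close` separately):

```python
import re

# Combined versions of the config's open/close pairs, for illustration only.
# The 'open' group captures the exact fence (indent plus 3+ backticks), and
# (?P=open) requires the closing fence to match it exactly.
FENCED = re.compile(r'(?sm)^(?P<open> *`{3,}).*?\n(?P=open)$')
INLINE = re.compile(r'(?P<open>`+).*?(?P=open)')

def strip_code(markdown: str) -> str:
    """Remove fenced and inline code so only prose reaches the spellchecker."""
    without_fences = FENCED.sub('', markdown)
    return INLINE.sub('', without_fences)

doc = "Run `pyspelling --config .spellcheck.yml` to check.\n```\nmisspeled\n```\n"
print(strip_code(doc))  # code spans are gone; only the prose remains
```

This is why a deliberate misspelling inside a code fence or inline backticks does not trip the CI check.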
8 changes: 8 additions & 0 deletions Makefile
@@ -2,6 +2,14 @@
help:
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-18s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)

.PHONY: spellcheck
spellcheck: .spellcheck.yml
	pyspelling --config $<

.PHONY: spellcheck-sort
spellcheck-sort: .spellcheck-en-custom.txt
	sort -d -f -o $< $<

#
# If you want to see the full commands, run:
# NOISY_BUILD=y make
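The `spellcheck-sort` target keeps the custom wordlist in `sort -d -f` order: dictionary order (only blanks and alphanumerics considered) with case folded. A minimal Python approximation, assuming ASCII wordlist entries (the `spellcheck_sort` helper is hypothetical, not part of the repo, and GNU sort's locale handling may differ):

```python
import re

def spellcheck_sort(lines):
    """Sort wordlist lines roughly like `sort -d -f`."""
    def key(line):
        # -d: keep only blanks and alphanumerics; -f: fold case.
        return re.sub(r'[^A-Za-z0-9 ]', '', line).casefold()
    return sorted(lines, key=key)

words = ["Kaggle's", "Kaggle", "Kai", "KAGGLE"]
print(spellcheck_sort(words))
```

Case variants of the same word sort together (stably, in input order), and the apostrophe in `Kaggle's` is ignored for ordering, matching how the committed wordlist is arranged.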
4 changes: 2 additions & 2 deletions docs/huggingface-publish-strategy.md
@@ -16,13 +16,13 @@ We will be publishing two different kinds of model families - Merlinite and Granite

The Merlinite model family is based off the [Mistral](https://mistral.ai/) model family and uses the [Large-scale Alignment for chatBots (LAB)](https://arxiv.org/abs/2403.01081) alignment. You can read more about it [here](https://huggingface.co/instructlab/merlinite-7b-lab).

-The InstructLab organzation will be publishing a community version of the Merlinite 7B size model, in both unquantized and 4-bit quantized GGUF form.
+The InstructLab organization will be publishing a community version of the Merlinite 7B size model, in both unquantized and 4-bit quantized GGUF form.

## Granite

The Granite model family is the [foundational model family](https://www.ibm.com/downloads/cas/X9W4O6BM) for the IBM watsonx AI platform, designed for usage in a business environment. You can read more about it [here](https://huggingface.co/instructlab/granite-7b-lab).

-The InstructLab organzation will be publishing a community version of the Granite 7B size model using the [Large-scale Alignment for chatBots (LAB)](https://arxiv.org/abs/2403.01081) alignment, in both unquantized and 4-bit quantized GGUF form.
+The InstructLab organization will be publishing a community version of the Granite 7B size model using the [Large-scale Alignment for chatBots (LAB)](https://arxiv.org/abs/2403.01081) alignment, in both unquantized and 4-bit quantized GGUF form.

## Naming Scheme

