manulera · manulera · Oct 31, 2023 · Oct 31, 2023 · Oct 31, 2023 · Oct 31, 2023
diff --git a/.dockerignore b/.dockerignore
@@ -2,4 +2,7 @@
 examples/
 .github/
 .venv/
-.git/
+.git/
+Lab_strains/
+notebooks/
+toml_example/
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -12,46 +12,50 @@ jobs:
         uses: actions/setup-python@v1
         with:
           python-version: 3.9
+      - name: Install Poetry
+        uses: snok/install-poetry@v1
+        with:
+          virtualenvs-create: false
+          installer-parallel: true
+          version: 1.2.2
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
-          pip install poetry
-          poetry config virtualenvs.create false
           poetry install --no-dev
+          sh install_local_dependency.sh
       # Before running the test you have to download the tags!
       - name: Run tests
         run: |
             python get_data/get_fpbase_data.py allele_components/tags_fpbase.toml
             cd genestorian_module/test
             python -m unittest
 
- # Update docker image when committing to master branch if tests pass
-  # push_to_registry:
-  #   name: Push Docker image to Docker Hub
-  #   runs-on: ubuntu-latest
-  #   needs: test
-  #   if: github.ref == 'refs/heads/master'
-  #   steps:
-  #     - name: Check out the repo
-  #       uses: actions/checkout@v3
+  # Update the Docker image when committing to the master branch, if the tests pass
+  push_to_registry:
+    name: Push Docker image to Docker Hub
+    runs-on: ubuntu-latest
+    needs: test
+    if: github.ref == 'refs/heads/master'
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v3
 
-  #     - name: Log in to Docker Hub
-  #       uses: docker/login-action@v2
-  #       with:
-  #         username: ${{ secrets.DOCKER_USERNAME }}
-  #         password: ${{ secrets.DOCKER_PASSWORD }}
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
 
-  #     - name: Extract metadata (tags, labels) for Docker
-  #       id: meta
-  #       uses: docker/metadata-action@v2
-  #       with:
-  #         images: genestorian_refinement_pipeline
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v2
+        with:
+          images: genestorian_data_refinement
 
-  #     - name: Build and push Docker images
-  #       uses: docker/build-push-action@v3.1.1
+      - name: Build and push Docker images
+        uses: docker/build-push-action@v3.1.1
 
-  #       with:
-  #         context: .
-  #         push: true
-  #         tags: manulera/genestorian_refinement_pipeline:latest
-  #         labels: ${{ steps.meta.outputs.labels }}
+        with:
+          context: .
+          push: true
+          tags: manulera/genestorian_data_refinement:latest
+          labels: ${{ steps.meta.outputs.labels }}
diff --git a/Dockerfile b/Dockerfile
@@ -2,15 +2,16 @@ FROM python:3.9
 
 WORKDIR /pipeline
 
-RUN pip install poetry 
-RUN pip install nltk
-RUN pip install toml
-
-COPY ./ /pipeline/
-
+RUN pip install poetry
+# Install packages without creating a virtualenv
 RUN poetry config virtualenvs.create false
-RUN poetry install --without dev
-RUN poetry shell
 
-COPY . /pipeline
+COPY ./ /pipeline
+
+# Poetry is not pinned above: `--no-dev` is deprecated since Poetry 1.2 and removed in 2.0
+RUN poetry install --without dev
+# Editable install of the local genestorian_module package
+RUN sh install_local_dependency.sh
 
+# docker_start.sh runs get_fpbase_data.py and then serves api:app with uvicorn
+CMD ["sh", "docker_start.sh"]
diff --git a/Lab_strains/Chen_lab/format.py b/Lab_strains/Chen_lab/format.py
@@ -1,5 +1,5 @@
 #%%
-from genestorian_module import excel_to_tsv
+from genestorian_module.read_and_write import excel_to_tsv
 excel_to_tsv('Chen lab strains.xlsx', ['Strain', 'Genotype'], 'strains.tsv')
 
 # %%
diff --git a/Lab_strains/dey_lab/format.py b/Lab_strains/dey_lab/format.py
@@ -1,5 +1,5 @@
 #%%
-from genestorian_module import excel_to_tsv
+from genestorian_module.read_and_write import excel_to_tsv
 excel_to_tsv('Manu_Strains.xlsx', ['Sample Name', 'Genotype'], 'strains.tsv')
 
 
diff --git a/Lab_strains/lilin_lab/format.py b/Lab_strains/lilin_lab/format.py
@@ -1,5 +1,5 @@
 #%%
-from genestorian_module import excel_to_tsv
+from genestorian_module.read_and_write import excel_to_tsv
 import pandas as pd
 
 read_file = pd.read_excel('DY-export-2.xlsx', na_filter=False)

diff --git a/Lab_strains/rincon_lab/format.py b/Lab_strains/rincon_lab/format.py
@@ -1,5 +1,5 @@
-#%%
-from genestorian_module import excel_to_tsv
+# %%
+from genestorian_module.read_and_write import excel_to_tsv
 import pandas as pd
 
 read_file = pd.read_excel('SR Strain List.xlsx', na_filter=False)

diff --git a/Lab_strains/subramaniam_lab/format.py b/Lab_strains/subramaniam_lab/format.py
@@ -1,4 +1,4 @@
-#%%
-from genestorian_module import excel_to_tsv
+# %%
+from genestorian_module.read_and_write import excel_to_tsv
 excel_to_tsv('MBYstrains-01.xlsx', ['STRAINS', 'GENOTYPE'], 'strains.tsv')
 # %%
diff --git a/Lab_strains/tran_lab/format.py b/Lab_strains/tran_lab/format.py
@@ -1,5 +1,5 @@
 # %%
-from genestorian_module import excel_to_tsv
+from genestorian_module.read_and_write import excel_to_tsv
 import pandas as pd
 
 read_file = pd.read_excel('pombe strains_20210413.xlsx', na_filter=False)

diff --git a/Lab_strains/zanders_lab/format.py b/Lab_strains/zanders_lab/format.py
@@ -1,10 +1,10 @@
-#%%
-from genestorian_module import excel_to_tsv
+# %%
+from genestorian_module.read_and_write import excel_to_tsv
 import pandas as pd
 
 read_file = pd.read_excel('yeastJune 2021.xlsx', na_filter=False)
 read_file['SEZY'] = read_file['SEZY'].astype(str)
-read_file['strain_id']= 'SEZY' + read_file['SEZY']
+read_file['strain_id'] = 'SEZY' + read_file['SEZY']
 read_file.to_excel('post_processed.xlsx')
 # %%
 excel_to_tsv('post_processed.xlsx', ['strain_id', 'genotype'], 'strains.tsv')

diff --git a/api.py b/api.py
@@ -2,8 +2,8 @@
 from nltk.tree import ParentedTree, TreePrettyPrinter
 from starlette.responses import HTMLResponse, FileResponse
 from pydantic import BaseModel
-from genestorian_module.genestorian_module.build_nltk_tags import build_nltk_tag
-from genestorian_module.genestorian_module.build_nltk_trees import apply_pseudo_grammar, post_process_pseudo_grammar
+from genestorian_module.build_nltk_tags import build_nltk_tag
+from genestorian_module.build_nltk_trees import apply_pseudo_grammar, post_process_pseudo_grammar
 import json
 
 

diff --git a/docker_start.sh b/docker_start.sh
@@ -0,0 +1,4 @@
+# Download the fpbase tags file before starting the API
+python get_data/get_fpbase_data.py allele_components/tags_fpbase.toml
+# exec replaces the shell with uvicorn, so it receives the container's stop signals directly
+exec uvicorn api:app --host 0.0.0.0 --port 80
diff --git a/genestorian_module/genestorian_module.egg-info/SOURCES.txt b/genestorian_module/genestorian_module.egg-info/SOURCES.txt
@@ -2,6 +2,7 @@ setup.py
 genestorian_module/__init__.py
 genestorian_module/build_nltk_tags.py
 genestorian_module/build_nltk_trees.py
+genestorian_module/read_and_write.py
 genestorian_module/replace_feature.py
 genestorian_module/summary_nltk_tags.py
 genestorian_module/third_version_pipeline.py

diff --git a/genestorian_module/genestorian_module/__init__.py b/genestorian_module/genestorian_module/__init__.py
@@ -1,48 +0,0 @@
-import pandas as pd
-
-
-def excel_to_tsv(excel_file, read_cols, tsv_file):
-    '''Extracts genotype and strain id from excel file to tsv file
-
-            Parameter:
-                excel_file(path to file): path to the excel file
-                read_cols(list) : list of coloumn names to be read
-                tsv_file(path): path to tsv file
-
-            Returns:
-                None'''
-    #read_cols = ['strain_id/Sample Name', 'genotype']
-    read_file = pd.read_excel(excel_file, usecols=read_cols, na_filter=False)
-    read_file = read_file.rename(
-        columns={read_cols[0]: 'strain_id', read_cols[1]: 'genotype'})
-
-    read_file['strain_id'] = read_file['strain_id'].astype(str)
-    read_file['genotype'] = read_file['genotype'].astype(str)
-
-    inconsistent_char_list = ['‚àÜ0', 'Œî']
-
-    for i in range(len(read_file['genotype'])):
-        for inconsistent_char in inconsistent_char_list:
-            if inconsistent_char in read_file['genotype'][i]:
-                read_file['genotype'][i] = read_file['genotype'][i].replace(
-                    inconsistent_char, 'Δ')
-
-    read_file.to_csv(tsv_file, sep='\t', index=False)
-    return None
-
-
-def read_strains_tsv(tsv_file):
-    '''
-    Reads the genotype and strain_id coloumn from strain.tsv file
-
-        Parameter:
-            tsv_file(path): path to strains.tsv 
-
-        Return:
-            data(pandas dataframe): pandas dataframe where columns are strain_id and genotype 
-    '''
-    data = pd.read_csv(tsv_file, sep='\t', na_filter=False)
-    data['genotype'] = data['genotype'].astype(str)
-    data['strain_id'] = data['strain_id'].astype(str)
-    data['genotype'] = data['genotype'].str.lower()
-    return data

diff --git a/genestorian_module/genestorian_module/build_nltk_tags.py b/genestorian_module/genestorian_module/build_nltk_tags.py
@@ -1,5 +1,4 @@
-from genestorian_module.genestorian_module.replace_feature import (build_feature_dict,
-                                                                   build_strain_list)
+from genestorian_module.replace_feature import build_feature_dict, build_strain_list
 import re
 import json
 import sys

diff --git a/genestorian_module/genestorian_module/read_and_write.py b/genestorian_module/genestorian_module/read_and_write.py
@@ -0,0 +1,52 @@
+import pandas as pd
+
+
+def excel_to_tsv(excel_file, read_cols, tsv_file):
+    '''Extracts genotype and strain id from excel file to tsv file
+
+            Parameter:
+                excel_file(path to file): path to the excel file
+                read_cols(list): names of the two columns to read; the first
+                    is written as 'strain_id' and the second as 'genotype'
+                tsv_file(path): path to tsv file
+
+            Returns:
+                None'''
+    #read_cols = ['strain_id/Sample Name', 'genotype']
+    read_file = pd.read_excel(excel_file, usecols=read_cols, na_filter=False)
+    read_file = read_file.rename(
+        columns={read_cols[0]: 'strain_id', read_cols[1]: 'genotype'})
+
+    read_file['strain_id'] = read_file['strain_id'].astype(str)
+    read_file['genotype'] = read_file['genotype'].astype(str)
+
+    # Character sequences replaced by 'Δ' below (they look like mis-encoded deltas)
+    inconsistent_char_list = ['‚àÜ0', 'Œî']
+
+    # Column-wise literal replacement: avoids the chained assignment
+    # read_file['genotype'][i] = ..., which pandas warns about and which has
+    # no effect when copy-on-write is enabled
+    for inconsistent_char in inconsistent_char_list:
+        read_file['genotype'] = read_file['genotype'].str.replace(
+            inconsistent_char, 'Δ', regex=False)
+
+    read_file.to_csv(tsv_file, sep='\t', index=False)
+    return None
+
+
+def read_strains_tsv(tsv_file):
+    '''
+    Reads the genotype and strain_id columns from a strains.tsv file
+
+        Parameter:
+            tsv_file(path): path to strains.tsv
+
+        Return:
+            data(pandas dataframe): dataframe with the columns strain_id and
+                genotype as strings, genotype in lower case
+    '''
+    data = pd.read_csv(tsv_file, sep='\t', na_filter=False)
+    data['genotype'] = data['genotype'].astype(str)
+    data['strain_id'] = data['strain_id'].astype(str)
+    data['genotype'] = data['genotype'].str.lower()
+    return data
diff --git a/genestorian_module/genestorian_module/replace_feature.py b/genestorian_module/genestorian_module/replace_feature.py
@@ -1,5 +1,5 @@
 import toml
-from genestorian_module.genestorian_module import read_strains_tsv
+from genestorian_module.read_and_write import read_strains_tsv
 import re
 
 

diff --git a/genestorian_module/genestorian_module/third_version_pipeline.py b/genestorian_module/genestorian_module/third_version_pipeline.py
@@ -1,7 +1,6 @@
-from genestorian_module import read_strains_tsv
+from genestorian_module.read_and_write import read_strains_tsv
 from genestorian_module.replace_feature import build_feature_dict
 import re
-import json
 from operator import itemgetter
 
 

diff --git a/genestorian_module/setup.py b/genestorian_module/setup.py
@@ -1,2 +1,5 @@
 from setuptools import setup, find_packages
-setup(name='genestorian_module', packages=find_packages())
+setup(
+    name='genestorian_module',
+    packages=find_packages(),
+)
diff --git a/install_local_dependency.sh b/install_local_dependency.sh
@@ -0,0 +1,4 @@
+# Until this issue is addressed: https://github.com/python-poetry/poetry/issues/8597
+# Stop if the directory is missing, so pip never runs `install -e .` in the wrong directory
+cd genestorian_module || exit 1
+poetry run pip install -e .
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		python get_data/get_fpbase_data.py allele_components/tags_fpbase.toml
		uvicorn api:app --host 0.0.0.0 --port 80