WIP: Upload and display sequencing results

rosalindfranklininstitute · Oct 4, 2023 · e15c97e · e15c97e
1 parent 5924f5c
commit e15c97e
Show file tree

Hide file tree

Showing 10 changed files with 1,590 additions and 511 deletions.
diff --git a/backend/Pipfile.lock b/backend/Pipfile.lock
diff --git a/backend/antigenapi/bioinformatics.py b/backend/antigenapi/bioinformatics.py
@@ -0,0 +1,110 @@
+import itertools
+import os
+import sys
+import zipfile
+
+import vquest.config
+import vquest.vq
+
+START_CODON = "ATG"
+
+
+def file_name_to_sequence_name(fn):
+    """Extract the sequence identifier from the file name (Ray Owens' method)."""
+    # Use whole filename as sequence name, as requested by Lauren
+    return fn[:-4] if fn.endswith(".seq") else fn
+
+
+def trim_sequence(seq):
+    """Trim the sequence after start codon, if present."""
+    try:
+        return seq[seq.index(START_CODON) + len(START_CODON) :]
+    except ValueError:
+        return ""
+
+
+def _load_sequences_zip(zip_file):
+    seq_data = {}
+    with zipfile.ZipFile(zip_file, "r") as zip_ref:
+        for fn in zip_ref.namelist():
+            if not fn.endswith(".seq"):
+                continue
+
+            # Convert the file name to a short sequence identifier
+            seq_name = os.path.basename(file_name_to_sequence_name(fn))
+
+            # Read the .seq file
+            with zip_ref.open(fn, "r") as f:
+                seq = f.read().decode("utf-8")
+
+            # Trim the sequence
+            seq = trim_sequence(seq)
+
+            # Add to dictionary of sequences, if a start codon was present
+            if seq:
+                seq_data[seq_name] = seq
+
+    return seq_data
+
+
+def load_sequences(directory_or_zip):
+    """Load a set of sequences from .seq files from a dictionary or .zip file."""
+    if os.path.isfile(directory_or_zip):
+        return _load_sequences_zip(directory_or_zip)
+
+    seq_data = {}
+    for fn in os.listdir(directory_or_zip):
+        if fn.endswith(".seq"):
+            # Convert the file name to a short sequence identifier
+            seq_name = file_name_to_sequence_name(fn)
+            # Read the .seq file
+            with open(os.path.join(directory_or_zip, fn), "r") as f:
+                seq = f.read()
+            # Trim the sequence
+            seq = trim_sequence(seq)
+            # Add to dictionary of sequences, if a start codon was present
+            if seq:
+                seq_data[seq_name] = seq
+
+    return seq_data
+
+
+def _chunks(data, size=None):
+    """Split a dict into multiple dicts of specified max size (iterator)."""
+    if size is None:
+        size = sys.maxsize
+    it = iter(data)
+    for _ in range(0, len(data), size):
+        yield {k: data[k] for k in itertools.islice(it, size)}
+
+
+def as_fasta_files(seq_data, max_file_size=50):
+    """Convert a dictionary of sequence names and data to FASTA format files.
+
+    max_file_size specifies the maximum number of sequences in each file
+    (For IMGT, this is 50)
+    """
+    fasta_files = []
+    for seq_data_chunk in _chunks(seq_data, max_file_size):
+        fasta_files.append(
+            "\n".join([f"> {name}\n{seq}" for name, seq in seq_data_chunk.items()])
+        )
+    return fasta_files
+
+
+def run_vquest(fasta_data, species="alpaca", receptor="IG"):
+    """Run vquest bioinformatics on a set of fasta files."""
+    conf = vquest.config.DEFAULTS.copy()
+    conf["inputType"] = "inline"
+    conf["species"] = species
+    conf["receptorOrLocusType"] = receptor
+    conf["sequences"] = fasta_data
+
+    # Set vquest logging to DEBUG
+    import logging
+
+    from vquest import LOGGER
+
+    LOGGER.setLevel(logging.DEBUG)
+
+    return vquest.vq.vquest(conf)
diff --git a/backend/antigenapi/migrations/0005_sequencingrunresults_and_more.py b/backend/antigenapi/migrations/0005_sequencingrunresults_and_more.py
@@ -0,0 +1,61 @@
+# Generated by Django 4.2.5 on 2023-10-03 20:40
+
+import django.db.models.deletion
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+        ("antigenapi", "0004_remove_sequencingrun_results_date_and_more"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="SequencingRunResults",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("seq", models.PositiveSmallIntegerField()),
+                (
+                    "seqres_file",
+                    models.FileField(upload_to="uploads/sequencingresults/"),
+                ),
+                (
+                    "parameters_file",
+                    models.FileField(upload_to="uploads/sequencingresults/"),
+                ),
+                ("airr_file", models.FileField(upload_to="uploads/sequencingresults/")),
+                ("added_date", models.DateTimeField(auto_now_add=True)),
+                (
+                    "added_by",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.PROTECT,
+                        to=settings.AUTH_USER_MODEL,
+                    ),
+                ),
+                (
+                    "sequencing_run",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.PROTECT,
+                        to="antigenapi.sequencingrun",
+                    ),
+                ),
+            ],
+        ),
+        migrations.AddConstraint(
+            model_name="sequencingrunresults",
+            constraint=models.UniqueConstraint(
+                fields=("sequencing_run", "seq"), name="unique_seqrun_seq"
+            ),
+        ),
+    ]
diff --git a/backend/antigenapi/models.py b/backend/antigenapi/models.py
@@ -164,10 +164,34 @@ class SequencingRun(Model):
     added_date = DateTimeField(auto_now_add=True)
 
 
+class SequencingRunResults(Model):
+    """A results file for a sequencing run."""
+
+    sequencing_run = ForeignKey(SequencingRun, on_delete=PROTECT)
+    seq: int = PositiveSmallIntegerField()
+    seqres_file: File = FileField(
+        upload_to="uploads/sequencingresults/",
+    )
+    parameters_file: File = FileField(
+        upload_to="uploads/sequencingresults/",
+    )
+    airr_file: File = FileField(
+        upload_to="uploads/sequencingresults/",
+    )
+    added_by = ForeignKey(settings.AUTH_USER_MODEL, on_delete=PROTECT)
+    added_date = DateTimeField(auto_now_add=True)
+
+    class Meta:  # noqa: D106
+        constraints = [
+            UniqueConstraint(fields=["sequencing_run", "seq"], name="unique_seqrun_seq")
+        ]
+
+
 auditlog.register(Project)
 auditlog.register(Llama)
 auditlog.register(Antigen)
 auditlog.register(Cohort, m2m_fields={"projects", "antigens"})
 auditlog.register(Library)
 auditlog.register(ElisaPlate)
 auditlog.register(SequencingRun)
+auditlog.register(SequencingRunResults)
diff --git a/backend/antigenapi/views.py b/backend/antigenapi/views.py
@@ -5,13 +5,17 @@
 from tempfile import NamedTemporaryFile
 from wsgiref.util import FileWrapper
 
+import numpy as np
 import openpyxl
+import pandas as pd
 from auditlog.models import LogEntry
+from django.conf import settings
 from django.contrib.contenttypes.models import ContentType
+from django.core.files import File
 from django.db import transaction
 from django.db.models.deletion import ProtectedError
 from django.db.utils import IntegrityError
-from django.http import Http404, HttpResponse
+from django.http import Http404, HttpResponse, JsonResponse
 from rest_framework import status
 from rest_framework.decorators import action
 from rest_framework.response import Response
@@ -25,6 +29,7 @@
 )
 from rest_framework.viewsets import ModelViewSet
 
+from antigenapi.bioinformatics import as_fasta_files, load_sequences, run_vquest
 from antigenapi.models import (
     Antigen,
     Cohort,
@@ -35,6 +40,7 @@
     PlateLocations,
     Project,
     SequencingRun,
+    SequencingRunResults,
 )
 from antigenapi.utils.uniprot import get_protein
 
@@ -379,10 +385,21 @@ class Meta:  # noqa: D106
         fields = ("plate", "location")
 
 
+class SequencingRunResultSerializer(ModelSerializer):
+    """A serializer for sequencing run results."""
+
+    added_by = StringRelatedField()
+
+    class Meta:  # noqa: D106
+        model = SequencingRunResults
+        fields = ("seq", "added_by", "added_date")
+
+
 class SequencingRunSerializer(ModelSerializer):
     """A serializer for sequencing runs."""
 
     added_by = StringRelatedField()
+    sequencingrunresults_set = SequencingRunResultSerializer(many=True, required=False)
 
     def validate_wells(self, data):
         """Check JSONField for wells is valid."""
@@ -562,3 +579,134 @@ def download_submission_xlsx(self, request, pk, submission_idx):
             f'sr{pk}_{submission_idx}.xlsx"'
         )
         return response
+
+    @action(
+        detail=True,
+        methods=["PUT"],
+        name="Upload sequencing run results file (.zip).",
+        url_path="resultsfile/(?P<submission_idx>[0-9]+)",
+    )
+    def upload_sequencing_run_results(self, request, pk, submission_idx):
+        """Upload sequencing run results file (.zip)."""
+        results_file = request.data["file"]
+
+        # TODO: Validate results file in more detail
+        if not results_file.name.endswith(".zip"):
+            raise ValidationError("file", "Results file should be a .zip file")
+
+        # TODO: Validate submission_idx
+
+        # Run bioinformatics using .zip file
+        print("Extracting zip file...")
+        seq_data = load_sequences(results_file.temporary_file_path())
+
+        # Convert to FASTA in-memory (vquest api handles chunking to
+        # max 50 seqs per file)
+        fasta_file = as_fasta_files(seq_data, max_file_size=None)[0]
+
+        # Submit to vquest
+        print("Running vquest...")
+        vquest_results = run_vquest(fasta_file)
+
+        parameters_file_data = vquest_results["Parameters.txt"]
+        vquest_airr_data = vquest_results["vquest_airr.tsv"]
+
+        base_filename = f"SequencingResults_{pk}_{submission_idx}"
+
+        # Create SequencingRunResults object
+        with open(
+            os.path.join(
+                settings.MEDIA_ROOT,
+                SequencingRunResults.parameters_file.field.upload_to,
+                f"{base_filename}_vquestparams.txt",
+            ),
+            "w+",
+        ) as parameters_file, open(
+            os.path.join(
+                settings.MEDIA_ROOT,
+                SequencingRunResults.airr_file.field.upload_to,
+                f"{base_filename}_vquestairr.tsv",
+            ),
+            "w+",
+        ) as airr_file:
+            parameters_file.write(parameters_file_data)
+            parameters_file_wrapped = File(parameters_file)
+            airr_file.write(vquest_airr_data)
+            airr_file_wrapped = File(airr_file)
+
+            SequencingRunResults.objects.update_or_create(
+                sequencing_run=SequencingRun.objects.get(pk=int(pk)),
+                seq=submission_idx,
+                defaults={
+                    "added_by": request.user,
+                    "seqres_file": results_file,
+                    "parameters_file": parameters_file_wrapped,
+                    "airr_file": airr_file_wrapped,
+                },
+            )
+
+        return JsonResponse(
+            SequencingRunSerializer(SequencingRun.objects.get(pk=int(pk))).data
+        )
+
+    @action(
+        detail=True,
+        methods=["GET"],
+        name="Get sequencing results.",
+        url_path="results",
+    )
+    def upload_sequencing_run_results(self, request, pk):
+        """Get sequencing results."""
+        IMPORTANT_COLUMNS = (
+            "sequence_id",
+            "productive",
+            "stop_codon",
+            "fwr1_aa",
+            "cdr1_aa",
+            "fwr2_aa",
+            "cdr2_aa",
+            "fwr3_aa",
+            "cdr3_aa",
+        )
+
+        results = SequencingRunResults.objects.filter(
+            sequencing_run_id=int(pk)
+        ).order_by("seq")
+
+        import io
+
+        csvs = []
+        for r in results:
+            # Clean up the CSVs! They seem to have an extra tab in some cases.
+            buffer = io.StringIO(
+                "\n".join(
+                    line.strip()
+                    for line in r.airr_file.read().decode("utf8").split("\n")
+                )
+            )
+            df = pd.read_csv(buffer, sep="\t", header=0, usecols=IMPORTANT_COLUMNS)
+            csvs.append(df)
+        df = pd.concat(csvs)
+
+        # Get number of matches per CDR3 sequence
+        cdr3_counts = df["cdr3_aa"].value_counts()
+        cdr3_counts.name = "cdr3_aa_count"
+        df = df.merge(cdr3_counts, on="cdr3_aa", how="left")
+
+        # Sort as required - Productive, number of CDR3 matches,
+        # then CDR3 itself, then sequence ID
+        df = df.sort_values(
+            by=["productive", "cdr3_aa_count", "cdr3_aa", "sequence_id"],
+            ascending=[False, False, True, True],
+        )
+
+        # Ensure we have the right columns in the right order
+        df = df.loc[:, IMPORTANT_COLUMNS]
+
+        # Set index and replace NaN with None (null in JSON)
+        df = df.replace({np.nan: None})
+
+        # Indicator to show when cdr3 has changed from previous row
+        df["new_cdr3"] = df["cdr3_aa"].shift(1).ne(df["cdr3_aa"])
+
+        return JsonResponse({"records": df.to_dict(orient="records")})