Skip to content

Commit

Permalink
WIP: Upload and display sequencing results
Browse files Browse the repository at this point in the history
  • Loading branch information
alubbock committed Oct 4, 2023
1 parent 5924f5c commit e15c97e
Show file tree
Hide file tree
Showing 10 changed files with 1,590 additions and 511 deletions.
1,498 changes: 1,009 additions & 489 deletions backend/Pipfile.lock

Large diffs are not rendered by default.

110 changes: 110 additions & 0 deletions backend/antigenapi/bioinformatics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import itertools
import os
import sys
import zipfile

import vquest.config
import vquest.vq

START_CODON = "ATG"


def file_name_to_sequence_name(fn):
    """Extract the sequence identifier from the file name (Ray Owens' method)."""
    # The whole file name (minus any trailing ".seq" extension) is used as
    # the sequence name, as requested by Lauren.
    if fn.endswith(".seq"):
        return fn[: -len(".seq")]
    return fn


def trim_sequence(seq):
    """Return the part of *seq* after the first start codon, or "" if absent."""
    codon_pos = seq.find(START_CODON)
    if codon_pos < 0:
        # No start codon anywhere in the sequence.
        return ""
    return seq[codon_pos + len(START_CODON) :]


def _load_sequences_zip(zip_file):
    """Load trimmed sequences from .seq files inside a .zip archive."""
    sequences = {}
    with zipfile.ZipFile(zip_file, "r") as archive:
        for member in archive.namelist():
            # Only .seq files carry sequence data; skip everything else.
            if not member.endswith(".seq"):
                continue

            # Short sequence identifier derived from the member's base name.
            seq_name = os.path.basename(file_name_to_sequence_name(member))

            # Read and decode the .seq file contents.
            with archive.open(member, "r") as handle:
                raw_seq = handle.read().decode("utf-8")

            # Keep only the portion after the start codon; trim_sequence
            # returns "" when no start codon is present, so such entries
            # are dropped.
            trimmed = trim_sequence(raw_seq)
            if trimmed:
                sequences[seq_name] = trimmed

    return sequences


def load_sequences(directory_or_zip):
    """Load a set of sequences from .seq files in a directory or .zip file.

    Args:
        directory_or_zip: Path to a directory containing ``.seq`` files, or
            to a ``.zip`` archive of them.

    Returns:
        dict mapping sequence name to the sequence trimmed after its start
        codon; files without a start codon are omitted.
    """
    if os.path.isfile(directory_or_zip):
        return _load_sequences_zip(directory_or_zip)

    seq_data = {}
    for fn in os.listdir(directory_or_zip):
        if fn.endswith(".seq"):
            # Convert the file name to a short sequence identifier
            seq_name = file_name_to_sequence_name(fn)
            # Read the .seq file; decode as UTF-8 for consistency with the
            # .zip code path (which decodes utf-8 explicitly)
            with open(
                os.path.join(directory_or_zip, fn), "r", encoding="utf-8"
            ) as f:
                seq = f.read()
            # Trim the sequence after the start codon
            seq = trim_sequence(seq)
            # Add to dictionary of sequences, if a start codon was present
            if seq:
                seq_data[seq_name] = seq

    return seq_data


def _chunks(data, size=None):
"""Split a dict into multiple dicts of specified max size (iterator)."""
if size is None:
size = sys.maxsize
it = iter(data)
for _ in range(0, len(data), size):
yield {k: data[k] for k in itertools.islice(it, size)}


def as_fasta_files(seq_data, max_file_size=50):
"""Convert a dictionary of sequence names and data to FASTA format files.
max_file_size specifies the maximum number of sequences in each file
(For IMGT, this is 50)
"""
fasta_files = []
for seq_data_chunk in _chunks(seq_data, max_file_size):
fasta_files.append(
"\n".join([f"> {name}\n{seq}" for name, seq in seq_data_chunk.items()])
)
return fasta_files


def run_vquest(fasta_data, species="alpaca", receptor="IG"):
    """Run vquest bioinformatics on a set of fasta files.

    Args:
        fasta_data: FASTA-format sequence data, submitted to V-QUEST inline.
        species: Species name passed to V-QUEST (default "alpaca").
        receptor: Receptor or locus type passed to V-QUEST (default "IG").

    Returns:
        Whatever ``vquest.vq.vquest`` returns — callers in this codebase
        index it by output file name (e.g. "Parameters.txt"); confirm the
        exact shape against the vquest package documentation.
    """
    # Start from the library's default configuration and override only the
    # fields we need.
    conf = vquest.config.DEFAULTS.copy()
    conf["inputType"] = "inline"
    conf["species"] = species
    conf["receptorOrLocusType"] = receptor
    conf["sequences"] = fasta_data

    # Set vquest logging to DEBUG
    # NOTE(review): debugging aid — forces verbose logging on every call;
    # consider making this configurable before production use.
    import logging

    from vquest import LOGGER

    LOGGER.setLevel(logging.DEBUG)

    return vquest.vq.vquest(conf)
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Generated by Django 4.2.5 on 2023-10-03 20:40

import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models


class Migration(migrations.Migration):
    """Create the SequencingRunResults model (generated by Django 4.2.5).

    Adds one results row per (sequencing run, submission index), enforced
    unique by the "unique_seqrun_seq" constraint.
    """

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ("antigenapi", "0004_remove_sequencingrun_results_date_and_more"),
    ]

    operations = [
        migrations.CreateModel(
            name="SequencingRunResults",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # Submission index within the sequencing run
                ("seq", models.PositiveSmallIntegerField()),
                (
                    "seqres_file",
                    models.FileField(upload_to="uploads/sequencingresults/"),
                ),
                (
                    "parameters_file",
                    models.FileField(upload_to="uploads/sequencingresults/"),
                ),
                ("airr_file", models.FileField(upload_to="uploads/sequencingresults/")),
                ("added_date", models.DateTimeField(auto_now_add=True)),
                (
                    "added_by",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.PROTECT,
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
                (
                    "sequencing_run",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.PROTECT,
                        to="antigenapi.sequencingrun",
                    ),
                ),
            ],
        ),
        # One results record per (sequencing_run, seq) pair.
        migrations.AddConstraint(
            model_name="sequencingrunresults",
            constraint=models.UniqueConstraint(
                fields=("sequencing_run", "seq"), name="unique_seqrun_seq"
            ),
        ),
    ]
24 changes: 24 additions & 0 deletions backend/antigenapi/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,34 @@ class SequencingRun(Model):
added_date = DateTimeField(auto_now_add=True)


class SequencingRunResults(Model):
    """A results file for a sequencing run."""

    # Parent run; PROTECT prevents deleting a run that has results attached.
    sequencing_run = ForeignKey(SequencingRun, on_delete=PROTECT)
    # Submission index within the run (unique together with sequencing_run).
    seq: int = PositiveSmallIntegerField()
    # Raw uploaded results archive (.zip of .seq files).
    seqres_file: File = FileField(
        upload_to="uploads/sequencingresults/",
    )
    # V-QUEST "Parameters.txt" output for this submission.
    parameters_file: File = FileField(
        upload_to="uploads/sequencingresults/",
    )
    # V-QUEST "vquest_airr.tsv" output (AIRR-format TSV) for this submission.
    airr_file: File = FileField(
        upload_to="uploads/sequencingresults/",
    )
    # Audit fields: uploading user and upload timestamp.
    added_by = ForeignKey(settings.AUTH_USER_MODEL, on_delete=PROTECT)
    added_date = DateTimeField(auto_now_add=True)

    class Meta:  # noqa: D106
        constraints = [
            UniqueConstraint(fields=["sequencing_run", "seq"], name="unique_seqrun_seq")
        ]


# Register models with django-auditlog so instance changes are recorded.
auditlog.register(Project)
auditlog.register(Llama)
auditlog.register(Antigen)
auditlog.register(Cohort, m2m_fields={"projects", "antigens"})
auditlog.register(Library)
auditlog.register(ElisaPlate)
auditlog.register(SequencingRun)
auditlog.register(SequencingRunResults)
150 changes: 149 additions & 1 deletion backend/antigenapi/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@
from tempfile import NamedTemporaryFile
from wsgiref.util import FileWrapper

import numpy as np
import openpyxl
import pandas as pd
from auditlog.models import LogEntry
from django.conf import settings
from django.contrib.contenttypes.models import ContentType
from django.core.files import File
from django.db import transaction
from django.db.models.deletion import ProtectedError
from django.db.utils import IntegrityError
from django.http import Http404, HttpResponse
from django.http import Http404, HttpResponse, JsonResponse
from rest_framework import status
from rest_framework.decorators import action
from rest_framework.response import Response
Expand All @@ -25,6 +29,7 @@
)
from rest_framework.viewsets import ModelViewSet

from antigenapi.bioinformatics import as_fasta_files, load_sequences, run_vquest
from antigenapi.models import (
Antigen,
Cohort,
Expand All @@ -35,6 +40,7 @@
PlateLocations,
Project,
SequencingRun,
SequencingRunResults,
)
from antigenapi.utils.uniprot import get_protein

Expand Down Expand Up @@ -379,10 +385,21 @@ class Meta: # noqa: D106
fields = ("plate", "location")


class SequencingRunResultSerializer(ModelSerializer):
    """A serializer for sequencing run results."""

    # Render the uploading user as a string rather than a primary key.
    added_by = StringRelatedField()

    class Meta:  # noqa: D106
        model = SequencingRunResults
        # Only metadata is exposed; the file fields themselves are not
        # serialized here.
        fields = ("seq", "added_by", "added_date")


class SequencingRunSerializer(ModelSerializer):
"""A serializer for sequencing runs."""

added_by = StringRelatedField()
sequencingrunresults_set = SequencingRunResultSerializer(many=True, required=False)

def validate_wells(self, data):
"""Check JSONField for wells is valid."""
Expand Down Expand Up @@ -562,3 +579,134 @@ def download_submission_xlsx(self, request, pk, submission_idx):
f'sr{pk}_{submission_idx}.xlsx"'
)
return response

@action(
    detail=True,
    methods=["PUT"],
    name="Upload sequencing run results file (.zip).",
    url_path="resultsfile/(?P<submission_idx>[0-9]+)",
)
def upload_sequencing_run_results(self, request, pk, submission_idx):
    """Upload sequencing run results file (.zip).

    Extracts .seq files from the uploaded archive, runs V-QUEST on the
    sequences, and stores the upload plus the V-QUEST outputs
    (Parameters.txt, vquest_airr.tsv) as a SequencingRunResults record
    keyed on (sequencing_run, seq).

    NOTE(review): keep this method's name unique within the ViewSet —
    DRF registers one route per method name, so a duplicate silently
    shadows an earlier action.
    """
    results_file = request.data["file"]

    # TODO: Validate results file in more detail
    if not results_file.name.endswith(".zip"):
        raise ValidationError("file", "Results file should be a .zip file")

    # TODO: Validate submission_idx

    # Run bioinformatics using .zip file
    # TODO(review): replace print() debugging with logging before production.
    print("Extracting zip file...")
    # NOTE(review): temporary_file_path() assumes the upload was spooled to
    # disk (TemporaryFileUploadHandler) — confirm behavior for small uploads.
    seq_data = load_sequences(results_file.temporary_file_path())

    # Convert to FASTA in-memory (vquest api handles chunking to
    # max 50 seqs per file)
    fasta_file = as_fasta_files(seq_data, max_file_size=None)[0]

    # Submit to vquest
    print("Running vquest...")
    vquest_results = run_vquest(fasta_file)

    parameters_file_data = vquest_results["Parameters.txt"]
    vquest_airr_data = vquest_results["vquest_airr.tsv"]

    base_filename = f"SequencingResults_{pk}_{submission_idx}"

    # Create SequencingRunResults object
    # Write the V-QUEST outputs under MEDIA_ROOT and wrap them as Django
    # File objects so they can be assigned to the model's FileFields.
    with open(
        os.path.join(
            settings.MEDIA_ROOT,
            SequencingRunResults.parameters_file.field.upload_to,
            f"{base_filename}_vquestparams.txt",
        ),
        "w+",
    ) as parameters_file, open(
        os.path.join(
            settings.MEDIA_ROOT,
            SequencingRunResults.airr_file.field.upload_to,
            f"{base_filename}_vquestairr.tsv",
        ),
        "w+",
    ) as airr_file:
        parameters_file.write(parameters_file_data)
        parameters_file_wrapped = File(parameters_file)
        airr_file.write(vquest_airr_data)
        airr_file_wrapped = File(airr_file)

        # update_or_create lets re-uploads for the same (run, seq) replace
        # earlier results instead of violating the unique constraint.
        SequencingRunResults.objects.update_or_create(
            sequencing_run=SequencingRun.objects.get(pk=int(pk)),
            seq=submission_idx,
            defaults={
                "added_by": request.user,
                "seqres_file": results_file,
                "parameters_file": parameters_file_wrapped,
                "airr_file": airr_file_wrapped,
            },
        )

    # Return the refreshed run, including its results set.
    return JsonResponse(
        SequencingRunSerializer(SequencingRun.objects.get(pk=int(pk))).data
    )

@action(
    detail=True,
    methods=["GET"],
    name="Get sequencing results.",
    url_path="results",
)
def get_sequencing_run_results(self, request, pk):
    """Get sequencing results for a sequencing run.

    Concatenates the V-QUEST AIRR files from every submission of the run,
    counts duplicate CDR3 amino-acid sequences, and returns the records
    sorted by productivity, CDR3 frequency, CDR3 and sequence id.

    NOTE(review): renamed from ``upload_sequencing_run_results`` — the old
    name collided with the PUT action above, so the later definition
    shadowed the upload endpoint entirely.
    """
    import io

    # Columns surfaced to the front end, in display order.
    IMPORTANT_COLUMNS = (
        "sequence_id",
        "productive",
        "stop_codon",
        "fwr1_aa",
        "cdr1_aa",
        "fwr2_aa",
        "cdr2_aa",
        "fwr3_aa",
        "cdr3_aa",
    )

    results = SequencingRunResults.objects.filter(
        sequencing_run_id=int(pk)
    ).order_by("seq")

    csvs = []
    for r in results:
        # Clean up the TSVs! They seem to have an extra tab in some cases.
        buffer = io.StringIO(
            "\n".join(
                line.strip()
                for line in r.airr_file.read().decode("utf8").split("\n")
            )
        )
        df = pd.read_csv(buffer, sep="\t", header=0, usecols=IMPORTANT_COLUMNS)
        csvs.append(df)

    if not csvs:
        # No results uploaded yet — return an empty record set rather than
        # letting pd.concat raise ValueError on an empty list.
        return JsonResponse({"records": []})

    df = pd.concat(csvs)

    # Get number of matches per CDR3 sequence
    cdr3_counts = df["cdr3_aa"].value_counts()
    cdr3_counts.name = "cdr3_aa_count"
    df = df.merge(cdr3_counts, on="cdr3_aa", how="left")

    # Sort as required - Productive, number of CDR3 matches,
    # then CDR3 itself, then sequence ID
    df = df.sort_values(
        by=["productive", "cdr3_aa_count", "cdr3_aa", "sequence_id"],
        ascending=[False, False, True, True],
    )

    # Ensure we have the right columns in the right order
    df = df.loc[:, IMPORTANT_COLUMNS]

    # Replace NaN with None (null in JSON)
    df = df.replace({np.nan: None})

    # Indicator to show when cdr3 has changed from previous row
    df["new_cdr3"] = df["cdr3_aa"].shift(1).ne(df["cdr3_aa"])

    return JsonResponse({"records": df.to_dict(orient="records")})
Loading

0 comments on commit e15c97e

Please sign in to comment.