Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion medcat-trainer/webapp/.dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,15 @@ __pycache__/
*.md
.pytest_cache
.mypy_cache
node_modules/

# Frontend — rebuilt in the frontend-builder stage
frontend/node_modules/
frontend/dist/
frontend/coverage/

# Backend tests — not needed in the production image
api/**/tests/

# User-uploaded models and datasets — mounted at /home/api/media in compose
api/media/*
!api/media/.keep
81 changes: 46 additions & 35 deletions medcat-trainer/webapp/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,62 +1,73 @@
FROM python:3.12
# -----------------------------------------------------------------------------
# Stage 1: Build frontend assets (Node toolchain discarded after this stage)
# -----------------------------------------------------------------------------
FROM node:20-bookworm-slim AS frontend-builder

# Update and upgrade everything
RUN apt-get update -y && \
apt-get upgrade -y

# Install vim, as it's annoying not to have an editor
RUN apt-get install -y vim

# install supervisor
RUN apt-get install -y supervisor
WORKDIR /build
COPY frontend/package.json frontend/package-lock.json ./
RUN --mount=type=cache,target=/root/.npm \
npm ci --prefer-offline --no-audit --no-fund

# install gettext for envsubst (used to generate runtime config)
RUN apt-get install -y gettext
COPY frontend/ ./
# CI test-frontend already runs type-check; build-only avoids a second vue-tsc pass.
# No sourcemaps in the image — saves ~25MB+ and build I/O.
RUN NODE_OPTIONS=--max-old-space-size=4096 \
npm run build-only -- --sourcemap false

# install cron - and remove any default tabs
RUN apt-get install -y cron && which cron && rm -rf /etc/cron.*/*
# -----------------------------------------------------------------------------
# Stage 2: Install Python deps (Rust/build tools discarded after this stage)
# -----------------------------------------------------------------------------
FROM python:3.12-bookworm AS python-builder

# Get node and npm
RUN apt install -y nodejs && apt install -y npm
RUN apt-get update -y && \
apt-get install -y --no-install-recommends build-essential curl && \
rm -rf /var/lib/apt/lists/*

# Install Rust - for the tokenizers dependency in medcat.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

# Copy dependency files first for better layer caching
WORKDIR /home/frontend
COPY frontend/package.json frontend/package-lock.json ./
RUN npm install

# Install uv and Python dependencies
WORKDIR /home
COPY pyproject.toml uv.lock* ./
# Install dependencies using a buildkit cache mount for speed on repeat
COPY pyproject.toml uv.lock ./
RUN --mount=type=cache,target=/root/.cache/uv \
pip install uv && \
pip install --no-cache-dir uv && \
uv sync --frozen --cache-dir=/root/.cache/uv --no-install-project --extra observability

# Ensure venv has pip (uv venvs don't include it; spacy download needs it)
RUN uv run python -m ensurepip --upgrade

# Download spaCy models (only requires spaCy, not application code)
ARG SPACY_MODELS="en_core_web_md"
RUN for SPACY_MODEL in ${SPACY_MODELS}; do uv run python -m spacy download ${SPACY_MODEL}; done

# Copy rest of project
# -----------------------------------------------------------------------------
# Stage 3: Runtime image — no Node, npm, Rust, or frontend devDependencies
# -----------------------------------------------------------------------------
FROM python:3.12-bookworm

RUN apt-get update -y && \
apt-get install -y --no-install-recommends \
vim \
supervisor \
gettext \
cron \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /etc/cron.*/*

RUN pip install --no-cache-dir uv

WORKDIR /home
COPY ./ .
COPY pyproject.toml uv.lock ./
COPY --from=python-builder /home/.venv /home/.venv
COPY api ./api
COPY scripts ./scripts
COPY templates ./templates
COPY --from=frontend-builder /build/dist ./frontend/dist

# Build frontend
WORKDIR /home/frontend
RUN npm run build
# MEDIA_ROOT is a runtime volume; ensure the directory exists without baking in local uploads
RUN mkdir -p /home/api/media

# copy backup crontab and chmod scripts
RUN chmod u+x /home/scripts/entry.sh && \
chmod u+x /home/scripts/crontab && cp /home/scripts/crontab /etc/crontab && \
chmod a+x /home/scripts/run.sh && \
chmod a+x /home/scripts/run-bg-process.sh && \
chmod a+x /home/scripts/nginx-entrypoint.sh

WORKDIR /home/api/

91 changes: 91 additions & 0 deletions medcat-trainer/webapp/api/api/tests/_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""Shared helpers for backend tests.

These utilities make it easier to construct lightweight model fixtures
without triggering MedCAT model loading or expensive dataset parsing.
"""

import os
import tempfile
from contextlib import contextmanager

import pandas as pd

from django.contrib.auth.models import User

from .. import signals as api_signals
from ..models import (
ConceptDB,
Dataset,
Document,
Entity,
ProjectAnnotateEntities,
Vocabulary,
)


@contextmanager
def dataset_signals_disconnected():
    """Temporarily disconnect the Dataset ``post_save`` / ``pre_save`` receivers.

    Useful in unit tests that want to insert a Dataset row without triggering
    `dataset_from_file` which expects a CSV/XLSX on disk with the right schema.

    Only receivers that were actually connected on entry are reconnected on
    exit. This keeps the context manager safe to nest: an inner block finds
    nothing to disconnect and therefore does not re-enable the signals while
    an outer block is still active. It also never connects a receiver that
    was not connected beforehand.
    """
    from django.db.models.signals import post_save, pre_save

    candidates = (
        (post_save, api_signals.save_dataset),
        (pre_save, api_signals.pre_save_dataset),
    )
    detached = []
    try:
        for signal, receiver in candidates:
            # Signal.disconnect() returns True only when a receiver was removed.
            if signal.disconnect(receiver, sender=Dataset):
                detached.append((signal, receiver))
        yield
    finally:
        # Restore exactly what this context removed, even if the body raised.
        for signal, receiver in detached:
            signal.connect(receiver, sender=Dataset)


def create_dataset(name='test-dataset', file_name='test-dataset.csv'):
    """Insert and return a Dataset row while the Dataset save signals are off.

    ``file_name`` is stored as the row's ``original_file``; the row is created
    inside ``dataset_signals_disconnected()``.
    """
    with dataset_signals_disconnected():
        # Returning from inside the block still runs the context's cleanup.
        return Dataset.objects.create(name=name, original_file=file_name)


def make_csv_file(tmp_dir, rows=None, file_name='dataset.csv'):
    """Write ``rows`` as a CSV named ``file_name`` inside ``tmp_dir``.

    When ``rows`` is ``None`` a default pair of documents with 'name' and
    'text' columns is written. Returns the path of the file that was created.
    """
    default_rows = [
        {'name': 'doc-a', 'text': 'Patient reports chest pain.'},
        {'name': 'doc-b', 'text': 'No fever or cough.'},
    ]
    # Only ``None`` selects the defaults; an explicit empty list is honoured.
    records = default_rows if rows is None else rows
    csv_path = os.path.join(tmp_dir, file_name)
    frame = pd.DataFrame(records)
    frame.to_csv(csv_path, index=False)
    return csv_path


def create_basic_project(name='test-project'):
    """Build and save a ProjectAnnotateEntities with its supporting rows.

    A ConceptDB, a Vocabulary and a Dataset are created first, each named
    after ``name`` with a ``-cdb`` / ``-vocab`` / ``-ds`` suffix, and are then
    attached to the new project. Returns the saved project.
    """
    # NOTE(review): skip_load=True is passed straight to the models' save();
    # presumably it avoids loading the referenced model file - confirm in models.
    concept_db = ConceptDB(name=f'{name}-cdb', cdb_file=f'{name}-cdb.dat')
    concept_db.save(skip_load=True)

    vocabulary = Vocabulary(name=f'{name}-vocab', vocab_file=f'{name}-vocab.dat')
    vocabulary.save(skip_load=True)

    dataset = create_dataset(name=f'{name}-ds', file_name=f'{name}-ds.csv')

    new_project = ProjectAnnotateEntities()
    new_project.name = name
    new_project.cuis = ''
    new_project.dataset = dataset
    new_project.concept_db = concept_db
    new_project.vocab = vocabulary
    new_project.save()
    return new_project


def create_document(project, name='doc1', text='hello world'):
    """Create and return a Document attached to ``project``'s dataset."""
    owning_dataset = project.dataset
    return Document.objects.create(name=name, text=text, dataset=owning_dataset)


def create_user(username='testuser', password='pw', **extra):
    """Create and return a User via ``User.objects.create_user``.

    Any ``extra`` keyword arguments are forwarded unchanged.
    """
    new_user = User.objects.create_user(
        username=username,
        password=password,
        **extra,
    )
    return new_user


def create_entity(label='C001'):
    """Create and return an Entity row with the given ``label``."""
    new_entity = Entity.objects.create(label=label)
    return new_entity
150 changes: 150 additions & 0 deletions medcat-trainer/webapp/api/api/tests/test_admin_actions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
"""Unit tests for api.admin.actions.

These tests focus on retrieve_project_data and the download_* helpers since
they back the JSON export feature that the upload tests already validate.
"""

import json

from django.test import TestCase, override_settings

from ..admin.actions import (
download_projects_with_text,
download_projects_without_text,
retrieve_project_data,
)
from ..models import (
AnnotatedEntity,
EntityRelation,
MetaAnnotation,
MetaTask,
MetaTaskValue,
ProjectAnnotateEntities,
Relation,
)
from ._helpers import (
create_basic_project,
create_document,
create_entity,
create_user,
)


@override_settings(MEDIA_ROOT='/tmp/mct-tests-admin')
class RetrieveProjectDataTests(TestCase):
    """Checks the structure returned by ``retrieve_project_data``.

    The fixture is one project with a single validated document carrying two
    annotations (labels C100 and C200); the C100 annotation also has one meta
    annotation.
    """

    def setUp(self):
        self.user = create_user(username='admin-actions-user')
        self.project = create_basic_project(name='admin-actions-proj')
        self.doc = create_document(self.project, name='doc-1', text='hello world')
        self.entity = create_entity(label='C100')
        self.entity_b = create_entity(label='C200')

        # Fields common to both annotations.
        shared = dict(
            user=self.user,
            project=self.project,
            document=self.doc,
            validated=True,
            correct=True,
        )
        self.ann_a = AnnotatedEntity.objects.create(
            entity=self.entity, value='hello', start_ind=0, end_ind=5, acc=0.9, **shared
        )
        self.ann_b = AnnotatedEntity.objects.create(
            entity=self.entity_b, value='world', start_ind=6, end_ind=11, acc=0.95, **shared
        )

        self.task = MetaTask.objects.create(name='Presence')
        self.value = MetaTaskValue.objects.create(name='True')
        MetaAnnotation.objects.create(
            annotated_entity=self.ann_a,
            meta_task=self.task,
            meta_task_value=self.value,
            validated=True,
        )

        self.project.validated_documents.add(self.doc)

    def _export(self):
        """Run retrieve_project_data on a queryset holding only the fixture project."""
        queryset = ProjectAnnotateEntities.objects.filter(id=self.project.id)
        return retrieve_project_data(queryset)

    def _first_document(self):
        """Return the first exported document of the first exported project."""
        return self._export()['projects'][0]['documents'][0]

    @staticmethod
    def _annotation_for(document, cui):
        """Return the first annotation in ``document`` whose 'cui' equals ``cui``."""
        return next(a for a in document['annotations'] if a['cui'] == cui)

    def test_returns_basic_project_metadata(self):
        exported = self._export()
        self.assertEqual(len(exported['projects']), 1)
        first = exported['projects'][0]
        self.assertEqual(first['name'], 'admin-actions-proj')
        self.assertEqual(first['cuis'], self.project.cuis)
        self.assertEqual(first['project_status'], 'A')
        self.assertEqual(len(first['documents']), 1)

    def test_includes_annotation_text_and_indices(self):
        document = self._first_document()
        labels = sorted(a['cui'] for a in document['annotations'])
        self.assertEqual(labels, ['C100', 'C200'])
        # The C100 annotation must carry the span and flags stored in setUp.
        hello = self._annotation_for(document, 'C100')
        self.assertEqual(hello['start'], 0)
        self.assertEqual(hello['end'], 5)
        self.assertEqual(hello['value'], 'hello')
        self.assertTrue(hello['validated'])
        self.assertTrue(hello['correct'])

    def test_includes_meta_annotations(self):
        document = self._first_document()
        hello = self._annotation_for(document, 'C100')
        self.assertIn('Presence', hello['meta_anns'])
        self.assertEqual(hello['meta_anns']['Presence']['value'], 'True')

    def test_relations_included(self):
        has_finding = Relation.objects.create(label='hasFinding')
        EntityRelation.objects.create(
            user=self.user,
            project=self.project,
            document=self.doc,
            relation=has_finding,
            start_entity=self.ann_a,
            end_entity=self.ann_b,
            validated=True,
        )

        relations = self._first_document()['relations']
        self.assertEqual(len(relations), 1)
        only = relations[0]
        self.assertEqual(only['relation'], 'hasFinding')
        self.assertEqual(only['start_entity_cui'], 'C100')
        self.assertEqual(only['end_entity_cui'], 'C200')


@override_settings(MEDIA_ROOT='/tmp/mct-tests-admin')
class DownloadProjectsTests(TestCase):
    """Checks the JSON responses of the ``download_projects_*`` helpers.

    The fixture is one project with a single validated document that carries
    one annotation.
    """

    def setUp(self):
        self.user = create_user(username='dl-action-user')
        self.project = create_basic_project(name='dl-action-proj')
        self.doc = create_document(self.project, name='doc-only', text='annotated text')
        ent = create_entity(label='C-DL')
        AnnotatedEntity.objects.create(
            user=self.user, project=self.project, document=self.doc, entity=ent,
            value='annotated', start_ind=0, end_ind=9, acc=1.0, validated=True, correct=True,
        )
        self.project.validated_documents.add(self.doc)

    def _projects(self):
        """Return a queryset containing only the fixture project."""
        return ProjectAnnotateEntities.objects.filter(id=self.project.id)

    @staticmethod
    def _first_document(resp):
        """Decode the response body and return its first exported document."""
        return json.loads(resp.content)['projects'][0]['documents'][0]

    def test_download_with_text_includes_document_text(self):
        resp = download_projects_with_text(self._projects())
        self.assertEqual(resp.status_code, 200)
        self.assertEqual(self._first_document(resp)['text'], 'annotated text')

    def test_download_without_text_omits_document_text(self):
        resp = download_projects_without_text(self._projects(), with_doc_name=False)
        self.assertEqual(resp.status_code, 200)
        self.assertNotIn('text', self._first_document(resp))

    def test_download_without_text_with_doc_name_includes_name(self):
        resp = download_projects_without_text(self._projects(), with_doc_name=True)
        # Check the status like the sibling tests do, so an error response
        # fails here rather than on a confusing JSON/key error below.
        self.assertEqual(resp.status_code, 200)
        doc = self._first_document(resp)
        self.assertEqual(doc['name'], 'doc-only')
        self.assertNotIn('text', doc)
Loading