Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions .github/workflows/hrm-transcript-scrubber-ecs-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ on:

env:
ECR_REPOSITORY: ${{ inputs.environment }}/hrm-transcript-scrubber
TRANSCRIPTION_ECR_REPOSITORY: ${{ inputs.environment }}/hrm-recording-transcriber
# if anything is set as a secret, it can't be used in outputs. So we need to set it as an env var
AWS_REGION: ${{ inputs.region }}
PRIMARY_AWS_REGION: us-east-1
Expand All @@ -73,6 +74,7 @@ jobs:
outputs:
regions: ${{ steps.generate-output.outputs.regions }}
docker_image: ${{ steps.generate-output.outputs.docker_image}}
transcription_docker_image: ${{ steps.generate-transcription-output.outputs.transcription_docker_image}}

steps:
- name: Checkout
Expand Down Expand Up @@ -112,9 +114,21 @@ jobs:
tags: ${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:live,${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:latest,${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:${{ inputs.environment }}-hrm-transcript-scrubber-${{ github.sha }}
provenance: false

# Builds the recording-transcriber image from the repo root context (the
# Dockerfile needs the whole workspace to resolve file: dependencies) and
# pushes it to the environment-scoped ECR repository.
# NOTE(review): unlike the scrubber image above, this one is tagged only
# `live` and the sha tag — no `latest`. Confirm that asymmetry is intended.
- name: Build and Push Transcription Docker Image
uses: docker/build-push-action@v6
with:
context: ./
file: ./hrm-domain/scheduled-tasks/transcript-scrubber/Dockerfile-transcription
push: true
tags: ${{ steps.login-ecr.outputs.registry }}/${{ env.TRANSCRIPTION_ECR_REPOSITORY }}:live,${{ steps.login-ecr.outputs.registry }}/${{ env.TRANSCRIPTION_ECR_REPOSITORY }}:${{ inputs.environment }}-hrm-recording-transcriber-${{ github.sha }}
# provenance attestations are disabled so ECR stores a plain single-manifest image
provenance: false

- id: generate-output
run: |
echo "docker_image=${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:${{ inputs.environment }}-hrm-transcript-scrubber-${{ github.sha }}" >> $GITHUB_OUTPUT
- id: generate-transcription-output
run: |
echo "transcription_docker_image=${{ steps.login-ecr.outputs.registry }}/${{ env.TRANSCRIPTION_ECR_REPOSITORY }}:${{ inputs.environment }}-hrm-recording-transcriber-${{ github.sha }}" >> $GITHUB_OUTPUT

- name: Upload artifacts
uses: actions/upload-artifact@v6
Expand All @@ -129,6 +143,7 @@ jobs:

env:
DOCKER_IMAGE: ${{ needs.deploy_docker.outputs.docker_image }}
TRANSCRIPTION_DOCKER_IMAGE: ${{ needs.deploy_docker.outputs.transcription_docker_image }}

steps:
- name: Download artifacts
Expand Down Expand Up @@ -165,6 +180,25 @@ jobs:
task-definition: ${{ steps.task-def-transcript-scrubber.outputs.task-definition }}
cluster: ${{ inputs.environment }}-ecs-cluster

# Transcriber rollout: mirrors the scrubber's download → render → deploy
# cycle above, but for the hrm-recording-transcriber ECS task.
- name: Transcriber - Download current ECS task definition
run: aws ecs describe-task-definition --task-definition ${{ inputs.environment }}-hrm-recording-transcriber --query taskDefinition > task-definition-recording-transcriber.json

# Swap the container image in the downloaded task definition for the one
# built earlier in this run (passed via TRANSCRIPTION_DOCKER_IMAGE).
# NOTE(review): the step ids say "transcript-transcriber" while every
# resource is named "recording-transcriber" — consider renaming for
# consistency (ids are only referenced within this job).
- name: Transcriber - Fill in the new image ID in the Amazon ECS task definition
id: task-def-transcript-transcriber
uses: aws-actions/amazon-ecs-render-task-definition@v1
with:
task-definition: task-definition-recording-transcriber.json
container-name: ${{ inputs.environment }}-hrm-recording-transcriber
image: ${{ env.TRANSCRIPTION_DOCKER_IMAGE }}

# Register the rendered task definition and deploy it to the shared cluster.
- name: Transcriber - Deploy Amazon ECS task definition
id: transcript-transcriber-task-definition
uses: aws-actions/amazon-ecs-deploy-task-definition@v2
with:
task-definition: ${{ steps.task-def-transcript-transcriber.outputs.task-definition }}
cluster: ${{ inputs.environment }}-ecs-cluster


- name: Update EventBridge target
run: |
chmod +x ./.github/workflows/scripts/update-event-bridge-target.sh
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright (C) 2021-2023 Technology Matters
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see https://www.gnu.org/licenses/.

# Build stage: compile the transcript-scrubber workspace with Node 22 and
# stage the runtime artifacts under /var/task.
FROM node:22 AS build
ARG task_dir=hrm-domain/scheduled-tasks/transcript-scrubber

# This is a bit complicated by the need for access to the packages.
# We must have the full repo context for our docker build so we can
# copy the root packages so that the file: based
# dependency in package.json can be resolved
COPY packages /tmp/build/packages
COPY ${task_dir} /tmp/build/${task_dir}
COPY ${task_dir}/../tsconfig.json /tmp/build/${task_dir}/../tsconfig.json
COPY ${task_dir}/../../tsconfig.json /tmp/build/${task_dir}/../../tsconfig.json

COPY package.json package-lock.json tsconfig.json tsconfig.base.json ${task_dir}/tsconfig.build.json /tmp/build/

# Install workspace deps, compile TypeScript, and collect the compiled JS,
# shared packages and node_modules into /var/task for the final image.
RUN cd /tmp/build \
    && npm ci -w ${task_dir} -w packages/* --verbose \
    && npx tsc -b tsconfig.build.json --verbose \
    && mkdir -p /var/task/${task_dir} \
    && cp ${task_dir}/dist/*.js /var/task/ \
    && cp -r packages /var/task/ \
    && cp -r node_modules /var/task/
COPY ${task_dir}/entrypoint.sh /var/task/entrypoint.sh
COPY ${task_dir}/licence /app/license

# Ignore errors if node_modules is not present
RUN rsync -a ${task_dir}/node_modules/ /var/task/node_modules || true

# The added layers from our build increase image size significantly. This flattens the image
# to reduce the size of the final image.
FROM crprivateaiprod.azurecr.io/deid:4.0.0-gpu AS final
ENV NODE_VERSION=22.20.0
# Published images normally strip the apt package lists, so `install` without
# a preceding `update` would fail. Use apt-get (apt's CLI is not stable for
# scripts) and clean the lists afterwards to keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*
RUN curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.0/install.sh | bash
ENV NVM_DIR=/root/.nvm
# Install the pinned Node version and make it the default in a single layer;
# each RUN starts a fresh shell, so nvm.sh must be sourced before use.
RUN . "$NVM_DIR/nvm.sh" \
    && nvm install ${NODE_VERSION} \
    && nvm use v${NODE_VERSION} \
    && nvm alias default v${NODE_VERSION}
ENV PATH="/root/.nvm/versions/node/v${NODE_VERSION}/bin/:${PATH}"
# Sanity check: fail the build early if Node is not actually on PATH.
RUN node --version && npm --version
COPY --from=build /var/task /var/task
COPY --from=build /app/license /app/license

WORKDIR /var/task

# NOTE(review): assumes /usr/bin/sh exists in the deid base image (many
# distros only ship /bin/sh) — confirm before relying on this entrypoint.
ENTRYPOINT ["/usr/bin/sh","/var/task/entrypoint.sh"]
78 changes: 67 additions & 11 deletions hrm-domain/scheduled-tasks/transcript-scrubber/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,15 @@ import { ContactJobAttemptResult } from '@tech-matters/types';
const PENDING_TRANSCRIPT_SQS_QUEUE_URL = process.env.PENDING_TRANSCRIPT_SQS_QUEUE_URL;
const COMPLETED_TRANSCRIPT_SQS_QUEUE_URL = process.env.COMPLETED_TRANSCRIPT_SQS_QUEUE_URL;
const LOCAL_PRIVATEAI_URI_ENDPOINT = new URL('http://localhost:8080/process/text');
const LOCAL_PRIVATEAI_TRANSCRIPTION_URI_ENDPOINT = new URL(
'http://localhost:8080/process/base64',
);
const LOCAL_PRIVATEAI_HEALTH_ENDPOINT = new URL('http://localhost:8080/healthz');
const MAX_PAI_STARTUP_TIME_MILLIS = 10 * 60 * 1000;
const MAX_PROCESSING_RUN_TIME_MILLIS = 15 * 60 * 1000;
const MODE: 'scrubbing' | 'transcription' = (process.env.MODE || 'scrubbing') as
| 'scrubbing'
| 'transcription';

const delay = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

Expand All @@ -54,6 +60,47 @@ const waitForPrivateAiToBeReady = async () => {
}
};

/**
 * Fetches a voice recording from S3, sends it to the locally running
 * Private AI container for transcription + de-identification, and writes the
 * scrubbed transcript back to the same bucket.
 *
 * @param bucket - S3 bucket holding the recording (and receiving the output)
 * @param key - object key under 'voice-recordings/...'; the result is written
 *   under 'scrubbed-transcripts/...' with the same tail
 * @returns the key of the scrubbed transcript object that was written
 * @throws Error when the Private AI endpoint responds with a non-2xx status
 */
const transcribeS3Recording = async (bucket: string, key: string) => {
  // NOTE(review): with responseContentType 'audio/wav' the s3-client returns
  // the body base64-encoded (non-text content types) — confirm the
  // /process/base64 endpoint expects base64 in `data` and accepts a
  // `contentType` key (the Private AI docs use snake_case field names).
  const recordingBase64 = await getS3Object({
    bucket,
    key,
    responseContentType: 'audio/wav',
  });

  const response = await fetch(LOCAL_PRIVATEAI_TRANSCRIPTION_URI_ENDPOINT, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      file: {
        data: recordingBase64,
        contentType: 'audio/wav',
      },
      entity_detection: {
        return_entity: true,
      },
    }),
  });
  console.debug('Response from PrivateAI:', response.status);
  // fetch() only rejects on network failure, not on HTTP error statuses;
  // without this check a failed request would silently write
  // {"messages":[null]} to S3 instead of surfacing the error.
  if (!response.ok) {
    throw new Error(
      `PrivateAI transcription request for ${key} failed with status ${response.status}`,
    );
  }
  const responsePayload = (await response.json()) as { processed_text: string };
  const scrubbedKey = key.replace('voice-recordings', 'scrubbed-transcripts');
  const scrubbedTranscriptJson = JSON.stringify(
    {
      transcript: { messages: [responsePayload.processed_text] },
    },
    null,
    2,
  );
  console.debug('Saving', scrubbedKey);
  await putS3Object({
    bucket,
    key: scrubbedKey,
    body: scrubbedTranscriptJson,
  });
  return scrubbedKey;
};

const scrubS3Transcript = async (bucket: string, key: string) => {
const transcriptS3ObjectText = await getS3Object({
bucket,
Expand Down Expand Up @@ -157,19 +204,28 @@ const pollQueue = async (): Promise<boolean> => {
export const executeTask = async () => {
const processingLatestFinishTime = Date.now() + MAX_PROCESSING_RUN_TIME_MILLIS;
await waitForPrivateAiToBeReady();
let processedMessages = 0;
while (await pollQueue()) {
processedMessages++;
if (Date.now() > processingLatestFinishTime) {
console.warn(
`Could not process all the pending messages in the configured window of ${Math.round(
MAX_PROCESSING_RUN_TIME_MILLIS / 1000,
)} seconds. If this occurs frequently you should look at options to increase the throughput of the scrubbing system.`,
);
break;
if (MODE === 'scrubbing') {
let processedMessages = 0;
while (await pollQueue()) {
processedMessages++;
if (Date.now() > processingLatestFinishTime) {
console.warn(
`Could not process all the pending messages in the configured window of ${Math.round(
MAX_PROCESSING_RUN_TIME_MILLIS / 1000,
)} seconds. If this occurs frequently you should look at options to increase the throughput of the scrubbing system.`,
);
break;
}
}
console.info(`Processed ${processedMessages} messages this run`);
} else {
console.debug(`Processed test recording.`);
const result = await transcribeS3Recording(
'tl-aselo-docs-as-development',
`voice-recordings/ACd8a2e89748318adf6ddff7df6948deaf/RE2c7fee8fc159364d60facdcaf1c2f88d`,
);
console.info(`Processed sample recording successfully.`, result);
}
console.info(`Processed ${processedMessages} messages this run`);
};

executeTask().catch(console.error);
9 changes: 8 additions & 1 deletion packages/s3-client/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,14 @@ export const getS3Object = async (params: GetS3ObjectParams) => {
responseBody.once('error', (err: any) => reject(err));
responseBody.on('data', (chunk: Buffer) => responseDataChunks.push(chunk));
responseBody.once('end', () =>
resolve(Buffer.concat(responseDataChunks).toString()),
resolve(
Buffer.concat(responseDataChunks).toString(
ResponseContentType.toLowerCase().startsWith('text/') ||
ResponseContentType.toLowerCase().endsWith('/json')
? 'utf8'
: 'base64',
),
),
);
} catch (err) {
// Handle the error or throw
Expand Down
Loading