Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions .github/workflows/hrm-transcript-scrubber-ecs-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ on:

env:
ECR_REPOSITORY: ${{ inputs.environment }}/hrm-transcript-scrubber
TRANSCRIPTION_ECR_REPOSITORY: ${{ inputs.environment }}/hrm-recording-transcriber
# if anything is set as a secret, it can't be used in outputs. So we need to set it as an env var
AWS_REGION: ${{ inputs.region }}
PRIMARY_AWS_REGION: us-east-1
Expand All @@ -73,6 +74,7 @@ jobs:
outputs:
regions: ${{ steps.generate-output.outputs.regions }}
docker_image: ${{ steps.generate-output.outputs.docker_image}}
transcription_docker_image: ${{ steps.generate-transcription-output.outputs.transcription_docker_image}}

steps:
- name: Checkout
Expand Down Expand Up @@ -112,9 +114,21 @@ jobs:
tags: ${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:live,${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:latest,${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:${{ inputs.environment }}-hrm-transcript-scrubber-${{ github.sha }}
provenance: false

# Builds the recording-transcriber image from the repo root context (the
# Dockerfile needs the whole workspace to resolve file: dependencies) and
# pushes it to the environment-scoped ECR repository.
# NOTE(review): unlike the scrubber image above, this one is tagged only
# `live` and the sha tag — no `latest`. Confirm that asymmetry is intended.
- name: Build and Push Transcription Docker Image
uses: docker/build-push-action@v6
with:
context: ./
file: ./hrm-domain/scheduled-tasks/transcript-scrubber/Dockerfile-transcription
push: true
tags: ${{ steps.login-ecr.outputs.registry }}/${{ env.TRANSCRIPTION_ECR_REPOSITORY }}:live,${{ steps.login-ecr.outputs.registry }}/${{ env.TRANSCRIPTION_ECR_REPOSITORY }}:${{ inputs.environment }}-hrm-recording-transcriber-${{ github.sha }}
# provenance attestations are disabled so ECR stores a plain single-manifest image
provenance: false

- id: generate-output
run: |
echo "docker_image=${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:${{ inputs.environment }}-hrm-transcript-scrubber-${{ github.sha }}" >> $GITHUB_OUTPUT
- id: generate-transcription-output
run: |
echo "transcription_docker_image=${{ steps.login-ecr.outputs.registry }}/${{ env.TRANSCRIPTION_ECR_REPOSITORY }}:${{ inputs.environment }}-hrm-recording-transcriber-${{ github.sha }}" >> $GITHUB_OUTPUT

- name: Upload artifacts
uses: actions/upload-artifact@v6
Expand All @@ -129,6 +143,7 @@ jobs:

env:
DOCKER_IMAGE: ${{ needs.deploy_docker.outputs.docker_image }}
TRANSCRIPTION_DOCKER_IMAGE: ${{ needs.deploy_docker.outputs.transcription_docker_image }}

steps:
- name: Download artifacts
Expand Down Expand Up @@ -165,6 +180,25 @@ jobs:
task-definition: ${{ steps.task-def-transcript-scrubber.outputs.task-definition }}
cluster: ${{ inputs.environment }}-ecs-cluster

# Transcriber rollout: mirrors the scrubber's download → render → deploy
# cycle above, but for the hrm-recording-transcriber ECS task.
- name: Transcriber - Download current ECS task definition
run: aws ecs describe-task-definition --task-definition ${{ inputs.environment }}-hrm-recording-transcriber --query taskDefinition > task-definition-recording-transcriber.json

# Swap the container image in the downloaded task definition for the one
# built earlier in this run (passed via TRANSCRIPTION_DOCKER_IMAGE).
# NOTE(review): the step ids say "transcript-transcriber" while every
# resource is named "recording-transcriber" — consider renaming for
# consistency (ids are only referenced within this job).
- name: Transcriber - Fill in the new image ID in the Amazon ECS task definition
id: task-def-transcript-transcriber
uses: aws-actions/amazon-ecs-render-task-definition@v1
with:
task-definition: task-definition-recording-transcriber.json
container-name: ${{ inputs.environment }}-hrm-recording-transcriber
image: ${{ env.TRANSCRIPTION_DOCKER_IMAGE }}

# Register the rendered task definition and deploy it to the shared cluster.
- name: Transcriber - Deploy Amazon ECS task definition
id: transcript-transcriber-task-definition
uses: aws-actions/amazon-ecs-deploy-task-definition@v2
with:
task-definition: ${{ steps.task-def-transcript-transcriber.outputs.task-definition }}
cluster: ${{ inputs.environment }}-ecs-cluster


- name: Update EventBridge target
run: |
chmod +x ./.github/workflows/scripts/update-event-bridge-target.sh
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright (C) 2021-2023 Technology Matters
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see https://www.gnu.org/licenses/.

# Build stage: compile the transcript-scrubber workspace with Node 22 and
# stage the runtime artifacts under /var/task.
FROM node:22 AS build
ARG task_dir=hrm-domain/scheduled-tasks/transcript-scrubber

# This is a bit complicated by the need for access to the packages.
# We must have the full repo context for our docker build so we can
# copy the root packages so that the file: based
# dependency in package.json can be resolved
COPY packages /tmp/build/packages
COPY ${task_dir} /tmp/build/${task_dir}
COPY ${task_dir}/../tsconfig.json /tmp/build/${task_dir}/../tsconfig.json
COPY ${task_dir}/../../tsconfig.json /tmp/build/${task_dir}/../../tsconfig.json

COPY package.json package-lock.json tsconfig.json tsconfig.base.json ${task_dir}/tsconfig.build.json /tmp/build/

# Install workspace deps, compile TypeScript, and collect the compiled JS,
# shared packages and node_modules into /var/task for the final image.
RUN cd /tmp/build \
    && npm ci -w ${task_dir} -w packages/* --verbose \
    && npx tsc -b tsconfig.build.json --verbose \
    && mkdir -p /var/task/${task_dir} \
    && cp ${task_dir}/dist/*.js /var/task/ \
    && cp -r packages /var/task/ \
    && cp -r node_modules /var/task/
COPY ${task_dir}/entrypoint.sh /var/task/entrypoint.sh
COPY ${task_dir}/licence /app/license

# Ignore errors if node_modules is not present
RUN rsync -a ${task_dir}/node_modules/ /var/task/node_modules || true

# The added layers from our build increase image size significantly. This flattens the image
# to reduce the size of the final image.
FROM crprivateaiprod.azurecr.io/deid:4.0.0-gpu AS final
ENV NODE_VERSION=22.20.0
# Published images normally strip the apt package lists, so `install` without
# a preceding `update` would fail. Use apt-get (apt's CLI is not stable for
# scripts) and clean the lists afterwards to keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*
RUN curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.0/install.sh | bash
ENV NVM_DIR=/root/.nvm
# Install the pinned Node version and make it the default in a single layer;
# each RUN starts a fresh shell, so nvm.sh must be sourced before use.
RUN . "$NVM_DIR/nvm.sh" \
    && nvm install ${NODE_VERSION} \
    && nvm use v${NODE_VERSION} \
    && nvm alias default v${NODE_VERSION}
ENV PATH="/root/.nvm/versions/node/v${NODE_VERSION}/bin/:${PATH}"
# Sanity check: fail the build early if Node is not actually on PATH.
RUN node --version && npm --version
COPY --from=build /var/task /var/task
COPY --from=build /app/license /app/license

WORKDIR /var/task

# NOTE(review): assumes /usr/bin/sh exists in the deid base image (many
# distros only ship /bin/sh) — confirm before relying on this entrypoint.
ENTRYPOINT ["/usr/bin/sh","/var/task/entrypoint.sh"]
78 changes: 67 additions & 11 deletions hrm-domain/scheduled-tasks/transcript-scrubber/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,15 @@ import { ContactJobAttemptResult } from '@tech-matters/types';
const PENDING_TRANSCRIPT_SQS_QUEUE_URL = process.env.PENDING_TRANSCRIPT_SQS_QUEUE_URL;
const COMPLETED_TRANSCRIPT_SQS_QUEUE_URL = process.env.COMPLETED_TRANSCRIPT_SQS_QUEUE_URL;
const LOCAL_PRIVATEAI_URI_ENDPOINT = new URL('http://localhost:8080/process/text');
const LOCAL_PRIVATEAI_TRANSCRIPTION_URI_ENDPOINT = new URL(
'http://localhost:8080/process/base64',
);
const LOCAL_PRIVATEAI_HEALTH_ENDPOINT = new URL('http://localhost:8080/healthz');
const MAX_PAI_STARTUP_TIME_MILLIS = 10 * 60 * 1000;
const MAX_PROCESSING_RUN_TIME_MILLIS = 15 * 60 * 1000;
const MODE: 'scrubbing' | 'transcription' = (process.env.MODE || 'scrubbing') as
| 'scrubbing'
| 'transcription';

const delay = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

Expand All @@ -54,6 +60,47 @@ const waitForPrivateAiToBeReady = async () => {
}
};

/**
 * Fetches a voice recording from S3, sends it to the locally running
 * Private AI container for transcription + de-identification, and writes the
 * scrubbed transcript back to the same bucket.
 *
 * @param bucket - S3 bucket holding the recording (and receiving the output)
 * @param key - object key under 'voice-recordings/...'; the result is written
 *   under 'scrubbed-transcripts/...' with the same tail
 * @returns the key of the scrubbed transcript object that was written
 * @throws Error when the Private AI endpoint responds with a non-2xx status
 */
const transcribeS3Recording = async (bucket: string, key: string) => {
  // NOTE(review): with responseContentType 'audio/wav' the s3-client returns
  // the body base64-encoded (non-text content types) — confirm the
  // /process/base64 endpoint expects base64 in `data` and accepts a
  // `contentType` key (the Private AI docs use snake_case field names).
  const recordingBase64 = await getS3Object({
    bucket,
    key,
    responseContentType: 'audio/wav',
  });

  const response = await fetch(LOCAL_PRIVATEAI_TRANSCRIPTION_URI_ENDPOINT, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      file: {
        data: recordingBase64,
        contentType: 'audio/wav',
      },
      entity_detection: {
        return_entity: true,
      },
    }),
  });
  console.debug('Response from PrivateAI:', response.status);
  // fetch() only rejects on network failure, not on HTTP error statuses;
  // without this check a failed request would silently write
  // {"messages":[null]} to S3 instead of surfacing the error.
  if (!response.ok) {
    throw new Error(
      `PrivateAI transcription request for ${key} failed with status ${response.status}`,
    );
  }
  const responsePayload = (await response.json()) as { processed_text: string };
  const scrubbedKey = key.replace('voice-recordings', 'scrubbed-transcripts');
  const scrubbedTranscriptJson = JSON.stringify(
    {
      transcript: { messages: [responsePayload.processed_text] },
    },
    null,
    2,
  );
  console.debug('Saving', scrubbedKey);
  await putS3Object({
    bucket,
    key: scrubbedKey,
    body: scrubbedTranscriptJson,
  });
  return scrubbedKey;
};

const scrubS3Transcript = async (bucket: string, key: string) => {
const transcriptS3ObjectText = await getS3Object({
bucket,
Expand Down Expand Up @@ -157,19 +204,28 @@ const pollQueue = async (): Promise<boolean> => {
export const executeTask = async () => {
const processingLatestFinishTime = Date.now() + MAX_PROCESSING_RUN_TIME_MILLIS;
await waitForPrivateAiToBeReady();
let processedMessages = 0;
while (await pollQueue()) {
processedMessages++;
if (Date.now() > processingLatestFinishTime) {
console.warn(
`Could not process all the pending messages in the configured window of ${Math.round(
MAX_PROCESSING_RUN_TIME_MILLIS / 1000,
)} seconds. If this occurs frequently you should look at options to increase the throughput of the scrubbing system.`,
);
break;
if (MODE === 'scrubbing') {
let processedMessages = 0;
while (await pollQueue()) {
processedMessages++;
if (Date.now() > processingLatestFinishTime) {
console.warn(
`Could not process all the pending messages in the configured window of ${Math.round(
MAX_PROCESSING_RUN_TIME_MILLIS / 1000,
)} seconds. If this occurs frequently you should look at options to increase the throughput of the scrubbing system.`,
);
break;
}
}
console.info(`Processed ${processedMessages} messages this run`);
} else {
console.debug(`Processed test recording.`);
const result = await transcribeS3Recording(
'tl-aselo-docs-as-development',
`voice-recordings/ACd8a2e89748318adf6ddff7df6948deaf/RE2c7fee8fc159364d60facdcaf1c2f88d`,
);
console.info(`Processed sample recording successfully.`, result);
}
console.info(`Processed ${processedMessages} messages this run`);
};

executeTask().catch(console.error);
9 changes: 8 additions & 1 deletion packages/s3-client/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,14 @@ export const getS3Object = async (params: GetS3ObjectParams) => {
responseBody.once('error', (err: any) => reject(err));
responseBody.on('data', (chunk: Buffer) => responseDataChunks.push(chunk));
responseBody.once('end', () =>
resolve(Buffer.concat(responseDataChunks).toString()),
resolve(
Buffer.concat(responseDataChunks).toString(
ResponseContentType.toLowerCase().startsWith('text/') ||
ResponseContentType.toLowerCase().endsWith('/json')
? 'utf8'
: 'base64',
),
),
);
} catch (err) {
// Handle the error or throw
Expand Down
Loading