diff --git a/.github/workflows/hrm-transcript-scrubber-ecs-deploy.yml b/.github/workflows/hrm-transcript-scrubber-ecs-deploy.yml index 21093e6fa..dc1aacf67 100644 --- a/.github/workflows/hrm-transcript-scrubber-ecs-deploy.yml +++ b/.github/workflows/hrm-transcript-scrubber-ecs-deploy.yml @@ -61,6 +61,7 @@ on: env: ECR_REPOSITORY: ${{ inputs.environment }}/hrm-transcript-scrubber + TRANSCRIPTION_ECR_REPOSITORY: ${{ inputs.environment }}/hrm-recording-transcriber # if anything is set as a secret, it can't be used in outputs. So we need to set it as an env var AWS_REGION: ${{ inputs.region }} PRIMARY_AWS_REGION: us-east-1 @@ -73,6 +74,7 @@ jobs: outputs: regions: ${{ steps.generate-output.outputs.regions }} docker_image: ${{ steps.generate-output.outputs.docker_image}} + transcription_docker_image: ${{ steps.generate-transcription-output.outputs.transcription_docker_image}} steps: - name: Checkout @@ -112,9 +114,21 @@ jobs: tags: ${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:live,${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:latest,${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:${{ inputs.environment }}-hrm-transcript-scrubber-${{ github.sha }} provenance: false + - name: Build and Push Transcription Docker Image + uses: docker/build-push-action@v6 + with: + context: ./ + file: ./hrm-domain/scheduled-tasks/transcript-scrubber/Dockerfile-transcription + push: true + tags: ${{ steps.login-ecr.outputs.registry }}/${{ env.TRANSCRIPTION_ECR_REPOSITORY }}:live,${{ steps.login-ecr.outputs.registry }}/${{ env.TRANSCRIPTION_ECR_REPOSITORY }}:${{ inputs.environment }}-hrm-recording-transcriber-${{ github.sha }} + provenance: false + - id: generate-output run: | echo "docker_image=${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:${{ inputs.environment }}-hrm-transcript-scrubber-${{ github.sha }}" >> $GITHUB_OUTPUT + - id: generate-transcription-output + run: | + echo "transcription_docker_image=${{ 
steps.login-ecr.outputs.registry }}/${{ env.TRANSCRIPTION_ECR_REPOSITORY }}:${{ inputs.environment }}-hrm-recording-transcriber-${{ github.sha }}" >> $GITHUB_OUTPUT - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -129,6 +143,7 @@ jobs: env: DOCKER_IMAGE: ${{ needs.deploy_docker.outputs.docker_image }} + TRANSCRIPTION_DOCKER_IMAGE: ${{ needs.deploy_docker.outputs.transcription_docker_image }} steps: - name: Download artifacts @@ -165,6 +180,25 @@ jobs: task-definition: ${{ steps.task-def-transcript-scrubber.outputs.task-definition }} cluster: ${{ inputs.environment }}-ecs-cluster + - name: Transcriber - Download current ECS task definition + run: aws ecs describe-task-definition --task-definition ${{ inputs.environment }}-hrm-recording-transcriber --query taskDefinition > task-definition-recording-transcriber.json + + - name: Transcriber - Fill in the new image ID in the Amazon ECS task definition + id: task-def-transcript-transcriber + uses: aws-actions/amazon-ecs-render-task-definition@v1 + with: + task-definition: task-definition-recording-transcriber.json + container-name: ${{ inputs.environment }}-hrm-recording-transcriber + image: ${{ env.TRANSCRIPTION_DOCKER_IMAGE }} + + - name: Transcriber - Deploy Amazon ECS task definition + id: transcript-transcriber-task-definition + uses: aws-actions/amazon-ecs-deploy-task-definition@v2 + with: + task-definition: ${{ steps.task-def-transcript-transcriber.outputs.task-definition }} + cluster: ${{ inputs.environment }}-ecs-cluster + + - name: Update EventBridge target run: | chmod +x ./.github/workflows/scripts/update-event-bridge-target.sh diff --git a/hrm-domain/scheduled-tasks/transcript-scrubber/Dockerfile-transcription b/hrm-domain/scheduled-tasks/transcript-scrubber/Dockerfile-transcription new file mode 100644 index 000000000..cfd8b5a6e --- /dev/null +++ b/hrm-domain/scheduled-tasks/transcript-scrubber/Dockerfile-transcription @@ -0,0 +1,63 @@ +# Copyright (C) 2021-2023 Technology Matters +# This 
program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see https://www.gnu.org/licenses/. + +FROM node:22 AS build +# FROM crprivateaiprod.azurecr.io/deid:cpu-text +ARG task_dir=hrm-domain/scheduled-tasks/transcript-scrubber + +# This is a bit complicated by the need for access to the packages. +# We must have the full repo context for our docker build so we can +# copy the root packages so that the file: based +# dependency in package.json can be resolved +COPY packages /tmp/build/packages +COPY ${task_dir} /tmp/build/${task_dir} +COPY ${task_dir}/../tsconfig.json /tmp/build/${task_dir}/../tsconfig.json +COPY ${task_dir}/../../tsconfig.json /tmp/build/${task_dir}/../../tsconfig.json + +COPY package.json package-lock.json tsconfig.json tsconfig.base.json ${task_dir}/tsconfig.build.json /tmp/build/ + + +RUN cd /tmp/build \ + && npm ci -w ${task_dir} -w packages/* --verbose \ + && npx tsc -b tsconfig.build.json --verbose \ + && mkdir -p /var/task/${task_dir} \ + && cp ${task_dir}/dist/*.js /var/task/ \ + && cp -r packages /var/task/ \ + && cp -r node_modules /var/task/ +COPY ${task_dir}/entrypoint.sh /var/task/entrypoint.sh +COPY ${task_dir}/licence /app/license + +# Ignore errors if node_modules is not present +RUN rsync -a ${task_dir}/node_modules/ /var/task/node_modules; exit 0 + +# The added layers from our build increase image size significantly. 
This flattens the image # to reduce the size of the final image. +FROM crprivateaiprod.azurecr.io/deid:4.0.0-gpu AS final +ENV NODE_VERSION=22.20.0 +RUN apt-get update && apt-get install -y curl +RUN curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.0/install.sh | bash +ENV NVM_DIR=/root/.nvm +RUN . "$NVM_DIR/nvm.sh" && nvm install ${NODE_VERSION} +RUN . "$NVM_DIR/nvm.sh" && nvm use v${NODE_VERSION} +RUN . "$NVM_DIR/nvm.sh" && nvm alias default v${NODE_VERSION} +ENV PATH="/root/.nvm/versions/node/v${NODE_VERSION}/bin/:${PATH}" +RUN node --version +RUN npm --version +COPY --from=build /var/task /var/task +COPY --from=build /app/license /app/license + + +WORKDIR /var/task + +ENTRYPOINT ["/usr/bin/sh","/var/task/entrypoint.sh"] \ No newline at end of file diff --git a/hrm-domain/scheduled-tasks/transcript-scrubber/index.ts b/hrm-domain/scheduled-tasks/transcript-scrubber/index.ts index ef663395c..b16c4b8d0 100644 --- a/hrm-domain/scheduled-tasks/transcript-scrubber/index.ts +++ b/hrm-domain/scheduled-tasks/transcript-scrubber/index.ts @@ -26,9 +26,15 @@ import { ContactJobAttemptResult } from '@tech-matters/types'; const PENDING_TRANSCRIPT_SQS_QUEUE_URL = process.env.PENDING_TRANSCRIPT_SQS_QUEUE_URL; const COMPLETED_TRANSCRIPT_SQS_QUEUE_URL = process.env.COMPLETED_TRANSCRIPT_SQS_QUEUE_URL; const LOCAL_PRIVATEAI_URI_ENDPOINT = new URL('http://localhost:8080/process/text'); +const LOCAL_PRIVATEAI_TRANSCRIPTION_URI_ENDPOINT = new URL( + 'http://localhost:8080/process/base64', +); const LOCAL_PRIVATEAI_HEALTH_ENDPOINT = new URL('http://localhost:8080/healthz'); const MAX_PAI_STARTUP_TIME_MILLIS = 10 * 60 * 1000; const MAX_PROCESSING_RUN_TIME_MILLIS = 15 * 60 * 1000; +const MODE: 'scrubbing' | 'transcription' = (process.env.MODE || 'scrubbing') as + | 'scrubbing' + | 'transcription'; const delay = (ms: number) => new Promise(resolve => setTimeout(resolve, ms)); @@ -54,6 +60,47 @@ const waitForPrivateAiToBeReady = async () => { } }; +const transcribeS3Recording = async (bucket:
string, key: string) => { + const recordingS3ObjectText = await getS3Object({ + bucket, + key, + responseContentType: 'audio/wav', + }); + + const response = await fetch(LOCAL_PRIVATEAI_TRANSCRIPTION_URI_ENDPOINT, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + file: { + data: recordingS3ObjectText, + contentType: 'audio/wav', + }, + entity_detection: { + return_entity: true, + }, + }), + }); + const responsePayload = (await response.json()) as { processed_text: string }; + console.debug('Response from PrivateAI:', response.status); + const scrubbedKey = key.replace('voice-recordings', 'scrubbed-transcripts'); + const scrubbedTranscriptJson = JSON.stringify( + { + transcript: { messages: [responsePayload.processed_text] }, + }, + null, + 2, + ); + console.debug('Saving', scrubbedKey); + await putS3Object({ + bucket, + key: scrubbedKey, + body: scrubbedTranscriptJson, + }); + return scrubbedKey; +}; + const scrubS3Transcript = async (bucket: string, key: string) => { const transcriptS3ObjectText = await getS3Object({ bucket, @@ -157,19 +204,28 @@ const pollQueue = async (): Promise => { export const executeTask = async () => { const processingLatestFinishTime = Date.now() + MAX_PROCESSING_RUN_TIME_MILLIS; await waitForPrivateAiToBeReady(); - let processedMessages = 0; - while (await pollQueue()) { - processedMessages++; - if (Date.now() > processingLatestFinishTime) { - console.warn( - `Could not process all the pending messages in the configured window of ${Math.round( - MAX_PROCESSING_RUN_TIME_MILLIS / 1000, - )} seconds. 
If this occurs frequently you should look at options to increase the throughput of the scrubbing system.`, - ); - break; + if (MODE === 'scrubbing') { + let processedMessages = 0; + while (await pollQueue()) { + processedMessages++; + if (Date.now() > processingLatestFinishTime) { + console.warn( + `Could not process all the pending messages in the configured window of ${Math.round( + MAX_PROCESSING_RUN_TIME_MILLIS / 1000, + )} seconds. If this occurs frequently you should look at options to increase the throughput of the scrubbing system.`, + ); + break; + } } + console.info(`Processed ${processedMessages} messages this run`); + } else { + console.debug(`Processing test recording...`); + const result = await transcribeS3Recording( + 'tl-aselo-docs-as-development', + `voice-recordings/ACd8a2e89748318adf6ddff7df6948deaf/RE2c7fee8fc159364d60facdcaf1c2f88d`, + ); + console.info(`Processed sample recording successfully.`, result); } - console.info(`Processed ${processedMessages} messages this run`); }; executeTask().catch(console.error); diff --git a/packages/s3-client/index.ts b/packages/s3-client/index.ts index e6a234147..5f5bd4796 100644 --- a/packages/s3-client/index.ts +++ b/packages/s3-client/index.ts @@ -160,7 +160,14 @@ export const getS3Object = async (params: GetS3ObjectParams) => { responseBody.once('error', (err: any) => reject(err)); responseBody.on('data', (chunk: Buffer) => responseDataChunks.push(chunk)); responseBody.once('end', () => - resolve(Buffer.concat(responseDataChunks).toString()), + resolve( + Buffer.concat(responseDataChunks).toString( + !ResponseContentType || + ResponseContentType.toLowerCase().startsWith('text/') || + ResponseContentType.toLowerCase().endsWith('/json') + ? 'utf8' + : 'base64', + ), + ), ); } catch (err) { // Handle the error or throw