From 397a92f2ee16c1b904c94d738db4b7a7ccc56510 Mon Sep 17 00:00:00 2001 From: zxhlyh Date: Wed, 12 Jul 2023 17:18:56 +0800 Subject: [PATCH] convert audio wav to mp3 (#552) --- api/services/audio_service.py | 9 +++-- api/services/errors/audio.py | 26 ++++--------- web/app/components/base/voice-input/index.tsx | 14 +++++-- web/app/components/base/voice-input/utils.ts | 38 +++++++++++++++++++ web/global.d.ts | 1 + web/package.json | 3 +- 6 files changed, 64 insertions(+), 27 deletions(-) create mode 100644 web/app/components/base/voice-input/utils.ts create mode 100644 web/global.d.ts diff --git a/api/services/audio_service.py b/api/services/audio_service.py index 4506f787db..9870702e46 100644 --- a/api/services/audio_service.py +++ b/api/services/audio_service.py @@ -6,7 +6,8 @@ from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServ from core.llm.whisper import Whisper from models.provider import ProviderName -FILE_SIZE_LIMIT = 1 * 1024 * 1024 +FILE_SIZE = 15 +FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024 ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm'] class AudioService: @@ -23,17 +24,17 @@ class AudioService: file_size = len(file_content) if file_size > FILE_SIZE_LIMIT: - message = f"({file_size} > {FILE_SIZE_LIMIT})" + message = f"Audio size larger than {FILE_SIZE} mb" raise AudioTooLargeServiceError(message) provider_name = LLMBuilder.get_default_provider(tenant_id) if provider_name != ProviderName.OPENAI.value: - raise ProviderNotSupportSpeechToTextServiceError('haha') + raise ProviderNotSupportSpeechToTextServiceError() provider_service = LLMProviderService(tenant_id, provider_name) buffer = io.BytesIO(file_content) - buffer.name = 'temp.wav' + buffer.name = 'temp.mp3' return Whisper(provider_service.provider).transcribe(buffer) diff --git a/api/services/errors/audio.py b/api/services/errors/audio.py index 0f67fa1b61..8c6508936d 100644 --- a/api/services/errors/audio.py +++ b/api/services/errors/audio.py @@ -1,23 +1,13 @@ -from services.errors.base import BaseServiceError - -class NoAudioUploadedServiceError(BaseServiceError): - error_code = 'no_audio_uploaded' - description = "Please upload your audio." - code = 400 +class NoAudioUploadedServiceError(Exception): + pass -class AudioTooLargeServiceError(BaseServiceError): - error_code = 'audio_too_large' - description = "Audio size exceeded. {message}" - code = 413 +class AudioTooLargeServiceError(Exception): + pass -class UnsupportedAudioTypeServiceError(BaseServiceError): - error_code = 'unsupported_audio_type' - description = "Audio type not allowed." - code = 415 +class UnsupportedAudioTypeServiceError(Exception): + pass -class ProviderNotSupportSpeechToTextServiceError(BaseServiceError): - error_code = 'provider_not_support_speech_to_text' - description = "Provider not support speech to text. {message}" - code = 400 \ No newline at end of file +class ProviderNotSupportSpeechToTextServiceError(Exception): + pass \ No newline at end of file diff --git a/web/app/components/base/voice-input/index.tsx b/web/app/components/base/voice-input/index.tsx index f1b020c916..5dd0a1dbc3 100644 --- a/web/app/components/base/voice-input/index.tsx +++ b/web/app/components/base/voice-input/index.tsx @@ -4,6 +4,7 @@ import { useParams, usePathname } from 'next/navigation' import cn from 'classnames' import Recorder from 'js-audio-recorder' import { useRafInterval } from 'ahooks' +import { convertToMp3 } from './utils' import s from './index.module.css' import { StopCircle } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices' import { Loading02, XClose } from '@/app/components/base/icons/src/vender/line/general' @@ -19,7 +20,12 @@ const VoiceInput = ({ onConverted, }: VoiceInputTypes) => { const { t } = useTranslation() - const recorder = useRef(new Recorder()) + const recorder = useRef(new Recorder({ + sampleBits: 16, + sampleRate: 16000, + numChannels: 1, + compiling: false, + })) const canvasRef = useRef(null) const ctxRef = useRef(null) const drawRecordId = useRef(null) @@ -75,10 +81,10 @@ const VoiceInput = ({ const canvas = canvasRef.current! const ctx = ctxRef.current! ctx.clearRect(0, 0, canvas.width, canvas.height) - const wavBlob = recorder.current.getWAVBlob() - const wavFile = new File([wavBlob], 'a.wav', { type: 'audio/wav' }) + const mp3Blob = convertToMp3(recorder.current) + const mp3File = new File([mp3Blob], 'temp.mp3', { type: 'audio/mp3' }) const formData = new FormData() - formData.append('file', wavFile) + formData.append('file', mp3File) let url = '' let isPublic = false diff --git a/web/app/components/base/voice-input/utils.ts b/web/app/components/base/voice-input/utils.ts new file mode 100644 index 0000000000..5f4c8d1140 --- /dev/null +++ b/web/app/components/base/voice-input/utils.ts @@ -0,0 +1,38 @@ +import lamejs from 'lamejs' + +export const convertToMp3 = (recorder: any) => { + const wav = lamejs.WavHeader.readHeader(recorder.getWAV()) + const { channels, sampleRate } = wav + const mp3enc = new lamejs.Mp3Encoder(channels, sampleRate, 128) + const result = recorder.getChannelData() + const buffer = [] + + const leftData = result.left && new Int16Array(result.left.buffer, 0, result.left.byteLength / 2) + const rightData = result.right && new Int16Array(result.right.buffer, 0, result.right.byteLength / 2) + const remaining = leftData.length + (rightData ? rightData.length : 0) + + const maxSamples = 1152 + for (let i = 0; i < remaining; i += maxSamples) { + const left = leftData.subarray(i, i + maxSamples) + let right = null + let mp3buf = null + + if (channels === 2) { + right = rightData.subarray(i, i + maxSamples) + mp3buf = mp3enc.encodeBuffer(left, right) + } + else { + mp3buf = mp3enc.encodeBuffer(left) + } + + if (mp3buf.length > 0) + buffer.push(mp3buf) + } + + const enc = mp3enc.flush() + + if (enc.length > 0) + buffer.push(enc) + + return new Blob(buffer, { type: 'audio/mp3' }) +} diff --git a/web/global.d.ts b/web/global.d.ts new file mode 100644 index 0000000000..0cdf0a0372 --- /dev/null +++ b/web/global.d.ts @@ -0,0 +1 @@ +declare module 'lamejs'; \ No newline at end of file diff --git a/web/package.json b/web/package.json index b1eb41c2c4..d5f967290e 100644 --- a/web/package.json +++ b/web/package.json @@ -81,7 +81,8 @@ "swr": "^2.1.0", "tailwindcss": "^3.2.7", "typescript": "4.9.5", - "use-context-selector": "^1.4.1" + "use-context-selector": "^1.4.1", + "lamejs": "1.2.0" }, "devDependencies": { "@antfu/eslint-config": "^0.36.0",