convert audio wav to mp3 (#552)

This commit is contained in:
zxhlyh 2023-07-12 17:18:56 +08:00 committed by GitHub
parent b91e226063
commit 397a92f2ee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 64 additions and 27 deletions

View File

@ -6,7 +6,8 @@ from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServ
from core.llm.whisper import Whisper
from models.provider import ProviderName
FILE_SIZE_LIMIT = 1 * 1024 * 1024
FILE_SIZE = 15
FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm']
class AudioService:
@ -23,17 +24,17 @@ class AudioService:
file_size = len(file_content)
if file_size > FILE_SIZE_LIMIT:
message = f"({file_size} > {FILE_SIZE_LIMIT})"
message = f"Audio size larger than {FILE_SIZE} mb"
raise AudioTooLargeServiceError(message)
provider_name = LLMBuilder.get_default_provider(tenant_id)
if provider_name != ProviderName.OPENAI.value:
raise ProviderNotSupportSpeechToTextServiceError('haha')
raise ProviderNotSupportSpeechToTextServiceError()
provider_service = LLMProviderService(tenant_id, provider_name)
buffer = io.BytesIO(file_content)
buffer.name = 'temp.wav'
buffer.name = 'temp.mp3'
return Whisper(provider_service.provider).transcribe(buffer)

View File

@ -1,23 +1,13 @@
from services.errors.base import BaseServiceError
class NoAudioUploadedServiceError(BaseServiceError):
error_code = 'no_audio_uploaded'
description = "Please upload your audio."
code = 400
class NoAudioUploadedServiceError(Exception):
pass
class AudioTooLargeServiceError(BaseServiceError):
error_code = 'audio_too_large'
description = "Audio size exceeded. {message}"
code = 413
class AudioTooLargeServiceError(Exception):
pass
class UnsupportedAudioTypeServiceError(BaseServiceError):
error_code = 'unsupported_audio_type'
description = "Audio type not allowed."
code = 415
class UnsupportedAudioTypeServiceError(Exception):
pass
class ProviderNotSupportSpeechToTextServiceError(BaseServiceError):
error_code = 'provider_not_support_speech_to_text'
description = "Provider not support speech to text. {message}"
code = 400
class ProviderNotSupportSpeechToTextServiceError(Exception):
pass

View File

@ -4,6 +4,7 @@ import { useParams, usePathname } from 'next/navigation'
import cn from 'classnames'
import Recorder from 'js-audio-recorder'
import { useRafInterval } from 'ahooks'
import { convertToMp3 } from './utils'
import s from './index.module.css'
import { StopCircle } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices'
import { Loading02, XClose } from '@/app/components/base/icons/src/vender/line/general'
@ -19,7 +20,12 @@ const VoiceInput = ({
onConverted,
}: VoiceInputTypes) => {
const { t } = useTranslation()
const recorder = useRef(new Recorder())
const recorder = useRef(new Recorder({
sampleBits: 16,
sampleRate: 16000,
numChannels: 1,
compiling: false,
}))
const canvasRef = useRef<HTMLCanvasElement | null>(null)
const ctxRef = useRef<CanvasRenderingContext2D | null>(null)
const drawRecordId = useRef<number | null>(null)
@ -75,10 +81,10 @@ const VoiceInput = ({
const canvas = canvasRef.current!
const ctx = ctxRef.current!
ctx.clearRect(0, 0, canvas.width, canvas.height)
const wavBlob = recorder.current.getWAVBlob()
const wavFile = new File([wavBlob], 'a.wav', { type: 'audio/wav' })
const mp3Blob = convertToMp3(recorder.current)
const mp3File = new File([mp3Blob], 'temp.mp3', { type: 'audio/mp3' })
const formData = new FormData()
formData.append('file', wavFile)
formData.append('file', mp3File)
let url = ''
let isPublic = false

View File

@ -0,0 +1,38 @@
import lamejs from 'lamejs'

// Convert a js-audio-recorder recording to an MP3 Blob using lamejs.
//
// `recorder` is expected to be a js-audio-recorder instance exposing
// `getWAV()` (WAV DataView, used here only to read channel count and
// sample rate from the header) and `getChannelData()` (raw per-channel
// PCM buffers) — TODO confirm against the js-audio-recorder API.
//
// Returns a Blob with MIME type 'audio/mp3' containing the encoded frames.
export const convertToMp3 = (recorder: any) => {
  // Parse the WAV header to discover channel count and sample rate.
  const wav = lamejs.WavHeader.readHeader(recorder.getWAV())
  const { channels, sampleRate } = wav
  // 128 kbps constant-bitrate MP3 encoder.
  const mp3enc = new lamejs.Mp3Encoder(channels, sampleRate, 128)
  const result = recorder.getChannelData()
  const buffer = []
  // Reinterpret the raw channel buffers as 16-bit PCM samples.
  // byteLength / 2 because each Int16 sample is two bytes.
  // NOTE(review): if `result.left` is absent, `leftData` is undefined and
  // the `.length` access below would throw — presumably the recorder always
  // produces a left channel; verify.
  const leftData = result.left && new Int16Array(result.left.buffer, 0, result.left.byteLength / 2)
  const rightData = result.right && new Int16Array(result.right.buffer, 0, result.right.byteLength / 2)
  const remaining = leftData.length + (rightData ? rightData.length : 0)
  // lamejs encodes in frames of 1152 samples (MPEG-1 Layer III frame size).
  const maxSamples = 1152
  // NOTE(review): for stereo input, iterating `i` over the combined
  // left+right length while subarray-ing each channel by the same `i`
  // walks past the per-channel length; the trailing subarrays are just
  // empty, so this appears harmless but is worth confirming.
  for (let i = 0; i < remaining; i += maxSamples) {
    const left = leftData.subarray(i, i + maxSamples)
    let right = null
    let mp3buf = null
    if (channels === 2) {
      right = rightData.subarray(i, i + maxSamples)
      mp3buf = mp3enc.encodeBuffer(left, right)
    }
    else {
      // Mono: encode the single channel.
      mp3buf = mp3enc.encodeBuffer(left)
    }
    // encodeBuffer may return an empty chunk while the encoder buffers
    // samples internally; only keep non-empty output.
    if (mp3buf.length > 0)
      buffer.push(mp3buf)
  }
  // Flush any samples still buffered inside the encoder (final partial frame).
  const enc = mp3enc.flush()
  if (enc.length > 0)
    buffer.push(enc)
  return new Blob(buffer, { type: 'audio/mp3' })
}

1
web/global.d.ts vendored Normal file
View File

@ -0,0 +1 @@
declare module 'lamejs';

View File

@ -81,7 +81,8 @@
"swr": "^2.1.0",
"tailwindcss": "^3.2.7",
"typescript": "4.9.5",
"use-context-selector": "^1.4.1"
"use-context-selector": "^1.4.1",
"lamejs": "1.2.0"
},
"devDependencies": {
"@antfu/eslint-config": "^0.36.0",