mirror of
https://github.com/langgenius/dify.git
synced 2024-11-16 11:42:29 +08:00
convert audio wav to mp3 (#552)
This commit is contained in:
parent
b91e226063
commit
397a92f2ee
|
@ -6,7 +6,8 @@ from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServ
|
|||
from core.llm.whisper import Whisper
|
||||
from models.provider import ProviderName
|
||||
|
||||
FILE_SIZE_LIMIT = 1 * 1024 * 1024
|
||||
FILE_SIZE = 15
|
||||
FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
|
||||
ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm']
|
||||
|
||||
class AudioService:
|
||||
|
@ -23,17 +24,17 @@ class AudioService:
|
|||
file_size = len(file_content)
|
||||
|
||||
if file_size > FILE_SIZE_LIMIT:
|
||||
message = f"({file_size} > {FILE_SIZE_LIMIT})"
|
||||
message = f"Audio size larger than {FILE_SIZE} mb"
|
||||
raise AudioTooLargeServiceError(message)
|
||||
|
||||
provider_name = LLMBuilder.get_default_provider(tenant_id)
|
||||
if provider_name != ProviderName.OPENAI.value:
|
||||
raise ProviderNotSupportSpeechToTextServiceError('haha')
|
||||
raise ProviderNotSupportSpeechToTextServiceError()
|
||||
|
||||
provider_service = LLMProviderService(tenant_id, provider_name)
|
||||
|
||||
buffer = io.BytesIO(file_content)
|
||||
buffer.name = 'temp.wav'
|
||||
buffer.name = 'temp.mp3'
|
||||
|
||||
return Whisper(provider_service.provider).transcribe(buffer)
|
||||
|
||||
|
|
|
@ -1,23 +1,13 @@
|
|||
from services.errors.base import BaseServiceError
|
||||
|
||||
class NoAudioUploadedServiceError(BaseServiceError):
|
||||
error_code = 'no_audio_uploaded'
|
||||
description = "Please upload your audio."
|
||||
code = 400
|
||||
class NoAudioUploadedServiceError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class AudioTooLargeServiceError(BaseServiceError):
|
||||
error_code = 'audio_too_large'
|
||||
description = "Audio size exceeded. {message}"
|
||||
code = 413
|
||||
class AudioTooLargeServiceError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class UnsupportedAudioTypeServiceError(BaseServiceError):
|
||||
error_code = 'unsupported_audio_type'
|
||||
description = "Audio type not allowed."
|
||||
code = 415
|
||||
class UnsupportedAudioTypeServiceError(Exception):
|
||||
pass
|
||||
|
||||
class ProviderNotSupportSpeechToTextServiceError(BaseServiceError):
|
||||
error_code = 'provider_not_support_speech_to_text'
|
||||
description = "Provider not support speech to text. {message}"
|
||||
code = 400
|
||||
class ProviderNotSupportSpeechToTextServiceError(Exception):
|
||||
pass
|
|
@ -4,6 +4,7 @@ import { useParams, usePathname } from 'next/navigation'
|
|||
import cn from 'classnames'
|
||||
import Recorder from 'js-audio-recorder'
|
||||
import { useRafInterval } from 'ahooks'
|
||||
import { convertToMp3 } from './utils'
|
||||
import s from './index.module.css'
|
||||
import { StopCircle } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices'
|
||||
import { Loading02, XClose } from '@/app/components/base/icons/src/vender/line/general'
|
||||
|
@ -19,7 +20,12 @@ const VoiceInput = ({
|
|||
onConverted,
|
||||
}: VoiceInputTypes) => {
|
||||
const { t } = useTranslation()
|
||||
const recorder = useRef(new Recorder())
|
||||
const recorder = useRef(new Recorder({
|
||||
sampleBits: 16,
|
||||
sampleRate: 16000,
|
||||
numChannels: 1,
|
||||
compiling: false,
|
||||
}))
|
||||
const canvasRef = useRef<HTMLCanvasElement | null>(null)
|
||||
const ctxRef = useRef<CanvasRenderingContext2D | null>(null)
|
||||
const drawRecordId = useRef<number | null>(null)
|
||||
|
@ -75,10 +81,10 @@ const VoiceInput = ({
|
|||
const canvas = canvasRef.current!
|
||||
const ctx = ctxRef.current!
|
||||
ctx.clearRect(0, 0, canvas.width, canvas.height)
|
||||
const wavBlob = recorder.current.getWAVBlob()
|
||||
const wavFile = new File([wavBlob], 'a.wav', { type: 'audio/wav' })
|
||||
const mp3Blob = convertToMp3(recorder.current)
|
||||
const mp3File = new File([mp3Blob], 'temp.mp3', { type: 'audio/mp3' })
|
||||
const formData = new FormData()
|
||||
formData.append('file', wavFile)
|
||||
formData.append('file', mp3File)
|
||||
|
||||
let url = ''
|
||||
let isPublic = false
|
||||
|
|
38
web/app/components/base/voice-input/utils.ts
Normal file
38
web/app/components/base/voice-input/utils.ts
Normal file
|
@ -0,0 +1,38 @@
|
|||
import lamejs from 'lamejs'
|
||||
|
||||
/**
 * Encode the audio held by a recorder instance as an MP3 blob.
 *
 * The recorder is expected to expose `getWAV()` (whose result is handed to
 * `lamejs.WavHeader.readHeader` to obtain the channel count and sample rate)
 * and `getChannelData()` (returning `{ left, right }` views over 16-bit PCM
 * samples; `right` may be absent for mono recordings).
 *
 * @param recorder - recorder instance providing `getWAV()` and `getChannelData()`
 * @returns a Blob of type `audio/mp3` containing the encoded frames
 * @throws Error when the recorder yields no left channel, or when the WAV
 *   header reports two channels but no right channel data is available
 */
export const convertToMp3 = (recorder: any): Blob => {
  const { channels, sampleRate } = lamejs.WavHeader.readHeader(recorder.getWAV())
  // Third argument is the bitrate handed to the encoder (128, as before).
  const encoder = new lamejs.Mp3Encoder(channels, sampleRate, 128)

  // Reinterpret a byte view as 16-bit samples, honouring the view's own
  // byteOffset instead of assuming it starts at the beginning of its buffer.
  const toInt16Samples = (view: any): Int16Array | null => {
    if (!view)
      return null

    const offset = view.byteOffset || 0
    const sampleCount = Math.floor(view.byteLength / 2)

    // Int16Array needs a 2-byte aligned offset; copy the bytes when it is not.
    if (offset % 2 !== 0)
      return new Int16Array(view.buffer.slice(offset, offset + sampleCount * 2))

    return new Int16Array(view.buffer, offset, sampleCount)
  }

  const channelData = recorder.getChannelData()
  const leftData = toInt16Samples(channelData.left)
  const rightData = toInt16Samples(channelData.right)

  if (!leftData)
    throw new Error('convertToMp3: recorder returned no left channel data')

  let stereoRight: Int16Array | null = null
  if (channels === 2) {
    if (!rightData)
      throw new Error('convertToMp3: stereo recording is missing right channel data')
    stereoRight = rightData
  }

  // Number of samples handed to the encoder per call.
  const maxSamples = 1152
  const chunks = []

  // Both channels are indexed in lock-step, so the per-channel length is the
  // loop bound. Summing left + right would double the iterations for stereo
  // and feed empty chunks to the encoder.
  for (let i = 0; i < leftData.length; i += maxSamples) {
    const left = leftData.subarray(i, i + maxSamples)
    const mp3buf = stereoRight
      ? encoder.encodeBuffer(left, stereoRight.subarray(i, i + maxSamples))
      : encoder.encodeBuffer(left)

    if (mp3buf.length > 0)
      chunks.push(mp3buf)
  }

  // Flush whatever the encoder still buffers internally.
  const tail = encoder.flush()
  if (tail.length > 0)
    chunks.push(tail)

  return new Blob(chunks, { type: 'audio/mp3' })
}
|
1
web/global.d.ts
vendored
Normal file
1
web/global.d.ts
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
declare module 'lamejs';
|
|
@ -81,7 +81,8 @@
|
|||
"swr": "^2.1.0",
|
||||
"tailwindcss": "^3.2.7",
|
||||
"typescript": "4.9.5",
|
||||
"use-context-selector": "^1.4.1"
|
||||
"use-context-selector": "^1.4.1",
|
||||
"lamejs": "1.2.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@antfu/eslint-config": "^0.36.0",
|
||||
|
|
Loading…
Reference in New Issue
Block a user