Reference video: 在讲 OpenAI Whisper 前先做了一个剪视频小工具【论文精读·44】 (building a small video-cutting tool before the OpenAI Whisper paper reading), by 跟李沐学AI:
https://www.bilibili.com/video/BV1Pe4y1t7de/?spm_id_from=333.999.0.0&vd_source=4aed82e35f26bb600bc5b46e65e25c22
More papers: https://github.com/mli/paper-reading
Code: mli/autocut (cut videos with a text editor): https://github.com/mli/autocut

1. Environment setup
Dependencies: openai-whisper, torchaudio
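A minimal install sketch (versions are not pinned here; autocut itself pulls in further dependencies, and ffmpeg must be on PATH for whisper.load_audio and for exporting video):

pip install openai-whisper torchaudio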
a. The inputs argument is a list of file names, e.g. ["input.mp4"].
b. In autocut-main\autocut\transcribe.py, the silero-vad model is loaded from a local copy:
self.vad_model, funcs = torch.hub.load(
    repo_or_dir=r"E:\common_tools\wav2lip_tools\autocut-main\autocut\silero-vad",
    model="silero_vad", trust_repo=True, source="local")
c. transcribe.py, around line 92: point download_root at a local directory for the Whisper weights:
self.whisper_model = whisper.load_model(
    self.args.whisper_model, self.args.device,
    download_root=r"E:\common_tools\wav2lip_tools\autocut-main")
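Both edits above only redirect downloads to local paths. For reference, a minimal sketch of the default online loading (assumes network access; "small" is just the model name used later):

import torch
import whisper

# silero-vad from the official hub repo; funcs[0] is get_speech_timestamps
vad_model, funcs = torch.hub.load("snakers4/silero-vad", model="silero_vad", trust_repo=True)
# Whisper weights go to ~/.cache/whisper unless download_root is given
whisper_model = whisper.load_model("small")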
2. Command-line arguments
parser.add_argument("--inputs", type=str, nargs="+",
default=[r"F:\wav2lip_tmp\601112\1672544575_1.mp4"],help="Inputs filenames/folders")
parser.add_argument(
"-t",
"--transcribe", # 转写
default=True,
help="Transcribe videos/audio into subtitles",
# action=argparse.BooleanOptionalAction,
)
parser.add_argument(
"-c",
"--cut", # 裁剪视频
help="Cut a video based on subtitles",
# action=argparse.BooleanOptionalAction,
)
parser.add_argument(
"-d",
"--daemon", # 监听文件夹生成字幕和剪切字幕的功能
help="Monitor a folder to transcribe and cut",
# action=argparse.BooleanOptionalAction,
)
parser.add_argument(
"-s",
help="Convert .srt to a compact format for easier editing",
# action=argparse.BooleanOptionalAction,
)
parser.add_argument(
"-m",
"--to-md",
help="Convert .srt to .md for easier editing",
# action=argparse.BooleanOptionalAction,
)
parser.add_argument(
"--lang",
type=str,
default="zh",
choices=["zh", "en"],
help="The output language of transcription",
)
parser.add_argument(
"--prompt", type=str, default="", help="initial prompt feed into whisper"
)
parser.add_argument(
"--whisper-model",
type=str,
default="small",
choices=["tiny", "base", "small", "medium", "large", "large-v2"],
help="The whisper model used to transcribe.",
)
parser.add_argument(
"--bitrate",
type=str,
default="10m", # 默认视频比特率10m
help="The bitrate to export the cutted video, such as 10m, 1m, or 500k",
)
parser.add_argument(
"--vad", help="If or not use VAD", choices=["1", "0", "auto"], default="auto"
)
parser.add_argument(
"--force",
help="Force write even if files exist",
# action=argparse.BooleanOptionalAction,
)
parser.add_argument(
"--encoding", type=str, default="utf-8", help="Document encoding format"
)
parser.add_argument(
"--device",
type=str,
default=None,
choices=["cpu", "cuda"],
help="Force to CPU or GPU for transcribing. In default automatically use GPU if available.",
)
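Taken together, the flags describe a two-step workflow: transcribe first, edit the generated subtitles, then cut. A sketch of driving that workflow from Python, assuming the autocut console entry point is installed (file names are illustrative):

import subprocess

# 1) write input.srt / input.md next to the video
subprocess.run(["autocut", "-t", "input.mp4"], check=True)
# 2) manually delete unwanted lines from input.srt or input.md, then
# 3) cut the video according to what is left (the output file name may differ)
subprocess.run(["autocut", "-c", "input.mp4", "input.srt"], check=True)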
3. Code
3.1 Weights
_MODELS = {
"tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
"tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
"base.en": "https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt",
"base": "https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt",
"small.en": "https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt",
"small": "https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt",
"medium.en": "https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt",
"medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
"large-v1": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt",
"large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
"large": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
}
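The hex string in each URL path is the SHA-256 of the checkpoint file; whisper re-checks it after downloading. A quick manual verification sketch (note: this downloads the full small checkpoint):

import hashlib, urllib.request

url = _MODELS["small"]
expected = url.split("/")[-2]                      # SHA-256 embedded in the URL
data = urllib.request.urlopen(url).read()
assert hashlib.sha256(data).hexdigest() == expected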
3.2 transcribe
transcribe().run()->  # a condensed, runnable sketch of this trace follows after it
audio=whisper.load_audio(input,16000)->ffmpeg.input()->
vad="1" or vad="auto" speech_timestamps=_detect_voice_activity(audio)->
vad_model is None or detect_speech is None->self.vad_model,funcs=torch.hub.load()->
detect_speech=funcs[0] = get_speech_timestamps->
- speeches=detect_speech(audio,vad_model,16000)->
- speeches=utils.remove_short_segments(speeches,1.0*16000)-> # drop segments that are too short (here < 1 s)
-- [s for s in segments if s["end"] - s["start"] > threshold]
- speeches=utils.expand_segments(speeches,0.2*16000,0.0*16000,audio.shape[0])-> expand each segment (0.2 s of padding before the start) so the clips are not cut too tightly
- speeches=utils.merge_adjacent_segments(speeches,0.5*16000)-> merge nearby segments (gap below 0.5 s)
transcribe_results=_transcribe(audio,speech_timestamps)->
- whisper_model is None: whisper_model=whisper.load_model("small",)->
-- checkpoint_file=_download(_MODELS['small'],download_root,in_memory)->
-- checkpoint=torch.load(checkpoint_file,map_location=device)->
-- dims=ModelDimensions(**checkpoint['dims'])->
-- model=Whisper(dims)->
--- encoder=AudioEncoder/decoder=TextDecoder->
- whisper_model.transcribe()
-- mel=log_mel_spectrogram(audio,padding=480000)-> SAMPLE_RATE is 16000, i.e. 16000 samples per second, so a 30-second window holds 30 * 16000 = 480000 samples; N_SAMPLES is therefore 480000.
-- tokenizer=get_tokenizer()-> a tokenizer usually has two main parts: a splitter that breaks the text into tokens (by whitespace, punctuation, or a vocabulary) and an encoder that maps those tokens to integer ids the model can consume.
-- initial_prompt_tokens=tokenizer.encode()->
- DecodingResult=decode_with_fallback(mel_segment)->
-- res:[{'text': '耐德是一个750度的烙温 同时呢 我们还有防部插防漏电的设计啊里面是有挡板的 是有挡板的', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 6.3, 'text': '耐德是一个750度的烙温 同时呢 我们还有防部插防漏电的设计啊', 'tokens': [50364, 4450, 238, 35898, 1541, 20182, 45396, 13127, 1546, 23661, 247, 9592, 102, 220, 13089, 15729, 6240, 8624, 9497, 35091, 35863, 13470, 11673, 240, 35863, 14065, 237, 42182, 1546, 7422, 122, 7422, 94, 4905, 50679], 'temperature': 0.0, 'avg_logprob': -0.418671938089224, 'compression_ratio': 1.0169491525423728, 'no_speech_prob': 0.039087388664484024}, {'id': 1, 'seek': 0, 'start': 6.3, 'end': 8.1, 'text': '里面是有挡板的 是有挡板的', 'tokens': [50679, 15759, 8833, 1541, 2412, 8501, 94, 43664, 1546, 11947, 2412, 8501, 94, 43664, 1546, 50769], 'temperature': 0.0, 'avg_logprob': -0.418671938089224, 'compression_ratio': 1.0169491525423728, 'no_speech_prob': 0.039087388664484024}], 'language': 'zh', 'origin_timestamp': {'start': 0, 'end': 131146}}]
_save_srt()
_save_md()
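A condensed, runnable sketch of the trace above (not autocut's exact code; the file name and options are illustrative): load the audio, detect speech spans with silero-vad, then run Whisper on each span.

import torch
import whisper

audio = whisper.load_audio("input.mp4")  # decoded via ffmpeg to 16 kHz mono float32
vad_model, funcs = torch.hub.load("snakers4/silero-vad", model="silero_vad", trust_repo=True)
get_speech_timestamps = funcs[0]
speeches = get_speech_timestamps(audio, vad_model, sampling_rate=16000)  # sample indices

model = whisper.load_model("small")
# inside transcribe(), each 30 s window is padded/trimmed to 480000 samples and
# converted to an 80 x 3000 log-mel spectrogram, e.g.:
#   mel = whisper.log_mel_spectrogram(whisper.pad_or_trim(audio))
for s in speeches:
    clip = audio[s["start"]:s["end"]]
    result = model.transcribe(clip, language="zh")
    print(s["start"] / 16000, result["text"])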
3.3 cut
[r"F:\wav2lip_tmp\601112\1672544575_1.mp4",r"F:\wav2lip_tmp\601112\1672544575_1.srt"]
cut trims the video according to the already-edited .srt file, keeping only the spans that remain in it; a rough sketch follows.
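A sketch of that step (autocut's own implementation differs in detail; it uses the srt and moviepy packages):

import srt
from moviepy.editor import VideoFileClip, concatenate_videoclips

# keep only the time spans that survived the subtitle edit
with open(r"F:\wav2lip_tmp\601112\1672544575_1.srt", encoding="utf-8") as f:
    subs = list(srt.parse(f.read()))

video = VideoFileClip(r"F:\wav2lip_tmp\601112\1672544575_1.mp4")
clips = [video.subclip(s.start.total_seconds(), s.end.total_seconds()) for s in subs]
concatenate_videoclips(clips).write_videofile("1672544575_1_cut.mp4", bitrate="10m")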
4. Whisper small model architecture
ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=768, n_audio_head=12, n_audio_layer=12, n_vocab=51865, n_text_ctx=448, n_text_state=768, n_text_head=12, n_text_layer=12)
Whisper(
(encoder): AudioEncoder(
(conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
(blocks): ModuleList(
(0-11): 12 x ResidualAttentionBlock(
(attn): MultiHeadAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(out): Linear(in_features=768, out_features=768, bias=True)
)
(attn_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU(approximate='none')
(2): Linear(in_features=3072, out_features=768, bias=True)
)
(mlp_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(decoder): TextDecoder(
(token_embedding): Embedding(51865, 768)
(blocks): ModuleList(
(0-11): 12 x ResidualAttentionBlock(
(attn): MultiHeadAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(out): Linear(in_features=768, out_features=768, bias=True)
)
(attn_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(cross_attn): MultiHeadAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(out): Linear(in_features=768, out_features=768, bias=True)
)
(cross_attn_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU(approximate='none')
(2): Linear(in_features=3072, out_features=768, bias=True)
)
(mlp_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
(ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
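The dimensions and module tree above can be reproduced directly:

import whisper

model = whisper.load_model("small")
print(model.dims)   # ModelDimensions(n_mels=80, ..., n_text_layer=12)
print(model)        # the Whisper encoder/decoder structure shown above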