Reference video: 在讲 OpenAI Whisper 前先做了一个剪视频小工具【论文精读·44】 (building a small video-cutting tool before the OpenAI Whisper paper reading), by 跟李沐学AI:
https://www.bilibili.com/video/BV1Pe4y1t7de/?spm_id_from=333.999.0.0&vd_source=4aed82e35f26bb600bc5b46e65e25c22
More papers: https://github.com/mli/paper-reading
Code: mli/autocut (cut videos with a text editor): https://github.com/mli/autocut

1. Environment setup
Dependencies: openai-whisper, torchaudio
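A minimal install sketch (versions are not pinned here; autocut itself pulls in further dependencies, and ffmpeg must be on PATH for whisper.load_audio and for exporting video):

pip install openai-whisper torchaudio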
a. The inputs argument is a list of file names, e.g. ["input.mp4"].
b. In autocut-main\autocut\transcribe.py, the silero-vad model is loaded from a local copy:
self.vad_model, funcs = torch.hub.load(
    repo_or_dir=r"E:\common_tools\wav2lip_tools\autocut-main\autocut\silero-vad",
    model="silero_vad", trust_repo=True, source="local")
c. transcribe.py, around line 92: point download_root at a local directory for the Whisper weights:
self.whisper_model = whisper.load_model(
    self.args.whisper_model, self.args.device,
    download_root=r"E:\common_tools\wav2lip_tools\autocut-main")
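Both edits above only redirect downloads to local paths. For reference, a minimal sketch of the default online loading (assumes network access; "small" is just the model name used later):

import torch
import whisper

# silero-vad from the official hub repo; funcs[0] is get_speech_timestamps
vad_model, funcs = torch.hub.load("snakers4/silero-vad", model="silero_vad", trust_repo=True)
# Whisper weights go to ~/.cache/whisper unless download_root is given
whisper_model = whisper.load_model("small")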
2. Command-line arguments
parser.add_argument("--inputs", type=str, nargs="+",
default=[r"F:\wav2lip_tmp\601112\1672544575_1.mp4"],help="Inputs filenames/folders")
parser.add_argument(
"-t",
"--transcribe", # 转写
default=True,
help="Transcribe videos/audio into subtitles",
# action=argparse.BooleanOptionalAction,
)
parser.add_argument(
"-c",
"--cut", # 裁剪视频
help="Cut a video based on subtitles",
# action=argparse.BooleanOptionalAction,
)
parser.add_argument(
"-d",
"--daemon", # 监听文件夹生成字幕和剪切字幕的功能
help="Monitor a folder to transcribe and cut",
# action=argparse.BooleanOptionalAction,
)
parser.add_argument(
"-s",
help="Convert .srt to a compact format for easier editing",
# action=argparse.BooleanOptionalAction,
)
parser.add_argument(
"-m",
"--to-md",
help="Convert .srt to .md for easier editing",
# action=argparse.BooleanOptionalAction,
)
parser.add_argument(
"--lang",
type=str,
default="zh",
choices=["zh", "en"],
help="The output language of transcription",
)
parser.add_argument(
"--prompt", type=str, default="", help="initial prompt feed into whisper"
)
parser.add_argument(
"--whisper-model",
type=str,
default="small",
choices=["tiny", "base", "small", "medium", "large", "large-v2"],
help="The whisper model used to transcribe.",
)
parser.add_argument(
"--bitrate",
type=str,
default="10m", # 默认视频比特率10m
help="The bitrate to export the cutted video, such as 10m, 1m, or 500k",
)
parser.add_argument(
"--vad", help="If or not use VAD", choices=["1", "0", "auto"], default="auto"
)
parser.add_argument(
"--force",
help="Force write even if files exist",
# action=argparse.BooleanOptionalAction,
)
parser.add_argument(
"--encoding", type=str, default="utf-8", help="Document encoding format"
)
parser.add_argument(
"--device",
type=str,
default=None,
choices=["cpu", "cuda"],
help="Force to CPU or GPU for transcribing. In default automatically use GPU if available.",
)
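Taken together, the flags describe a two-step workflow: transcribe first, edit the generated subtitles, then cut. A sketch of driving that workflow from Python, assuming the autocut console entry point is installed (file names are illustrative):

import subprocess

# 1) write input.srt / input.md next to the video
subprocess.run(["autocut", "-t", "input.mp4"], check=True)
# 2) manually delete unwanted lines from input.srt or input.md, then
# 3) cut the video according to what is left (the output file name may differ)
subprocess.run(["autocut", "-c", "input.mp4", "input.srt"], check=True)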
3. Code
3.1 Weights
_MODELS = {
"tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
"tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
"base.en": "https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt",
"base": "https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt",
"small.en": "https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt",
"small": "https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt",
"medium.en": "https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt",
"medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
"large-v1": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt",
"large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
"large": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
}
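The hex string in each URL path is the SHA-256 of the checkpoint file; whisper re-checks it after downloading. A quick manual verification sketch (note: this downloads the full small checkpoint):

import hashlib, urllib.request

url = _MODELS["small"]
expected = url.split("/")[-2]                      # SHA-256 embedded in the URL
data = urllib.request.urlopen(url).read()
assert hashlib.sha256(data).hexdigest() == expected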
3.2 transcribe
transcribe().run()->  # a condensed, runnable sketch of this trace follows after it
audio=whisper.load_audio(input,16000)->ffmpeg.input()->
vad="1" or vad="auto" speech_timestamps=_detect_voice_activity(audio)->
vad_model is None or detect_speech is None->self.vad_model,funcs=torch.hub.load()->
detect_speech=funcs[0] = get_speech_timestamps->
- speeches=detect_speech(audio,vad_model,16000)->
- speeches=utils.remove_short_segments(speeches,1.0*16000)-> # drop segments that are too short (here < 1 s)
-- [s for s in segments if s["end"] - s["start"] > threshold]
- speeches=utils.expand_segments(speeches,0.2*16000,0.0*16000,audio.shape[0])-> expand each segment (0.2 s of padding before the start) so the clips are not cut too tightly
- speeches=utils.merge_adjacent_segments(speeches,0.5*16000)-> merge nearby segments (gap below 0.5 s)
transcribe_results=_transcribe(audio,speech_timestamps)->
- whisper_model is None: whisper_model=whisper.load_model("small",)->
-- checkpoint_file=_download(_MODELS['small'],download_root,in_memory)->
-- checkpoint=torch.load(checkpoint_file,map_location=device)->
-- dims=ModelDimensions(**checkpoint['dims'])->
-- model=Whisper(dims)->
--- encoder=AudioEncoder/decoder=TextDecoder->
- whisper_model.transcribe()
-- mel=log_mel_spectrogram(audio,padding=480000)-> SAMPLE_RATE is 16000, i.e. 16000 samples per second, so a 30-second window holds 30 * 16000 = 480000 samples; N_SAMPLES is therefore 480000.
-- tokenizer=get_tokenizer()-> a tokenizer usually has two main parts: a splitter that breaks the text into tokens (by whitespace, punctuation, or a vocabulary) and an encoder that maps those tokens to integer ids the model can consume.
-- initial_prompt_tokens=tokenizer.encode()->
- DecodingResult=decode_with_fallback(mel_segment)->
-- res:[{'text': '耐德是一个750度的烙温 同时呢 我们还有防部插防漏电的设计啊里面是有挡板的 是有挡板的', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 6.3, 'text': '耐德是一个750度的烙温 同时呢 我们还有防部插防漏电的设计啊', 'tokens': [50364, 4450, 238, 35898, 1541, 20182, 45396, 13127, 1546, 23661, 247, 9592, 102, 220, 13089, 15729, 6240, 8624, 9497, 35091, 35863, 13470, 11673, 240, 35863, 14065, 237, 42182, 1546, 7422, 122, 7422, 94, 4905, 50679], 'temperature': 0.0, 'avg_logprob': -0.418671938089224, 'compression_ratio': 1.0169491525423728, 'no_speech_prob': 0.039087388664484024}, {'id': 1, 'seek': 0, 'start': 6.3, 'end': 8.1, 'text': '里面是有挡板的 是有挡板的', 'tokens': [50679, 15759, 8833, 1541, 2412, 8501, 94, 43664, 1546, 11947, 2412, 8501, 94, 43664, 1546, 50769], 'temperature': 0.0, 'avg_logprob': -0.418671938089224, 'compression_ratio': 1.0169491525423728, 'no_speech_prob': 0.039087388664484024}], 'language': 'zh', 'origin_timestamp': {'start': 0, 'end': 131146}}]
_save_srt()
_save_md()
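A condensed, runnable sketch of the trace above (not autocut's exact code; the file name and options are illustrative): load the audio, detect speech spans with silero-vad, then run Whisper on each span.

import torch
import whisper

audio = whisper.load_audio("input.mp4")  # decoded via ffmpeg to 16 kHz mono float32
vad_model, funcs = torch.hub.load("snakers4/silero-vad", model="silero_vad", trust_repo=True)
get_speech_timestamps = funcs[0]
speeches = get_speech_timestamps(audio, vad_model, sampling_rate=16000)  # sample indices

model = whisper.load_model("small")
# inside transcribe(), each 30 s window is padded/trimmed to 480000 samples and
# converted to an 80 x 3000 log-mel spectrogram, e.g.:
#   mel = whisper.log_mel_spectrogram(whisper.pad_or_trim(audio))
for s in speeches:
    clip = audio[s["start"]:s["end"]]
    result = model.transcribe(clip, language="zh")
    print(s["start"] / 16000, result["text"])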
3.3 cut
[r"F:\wav2lip_tmp\601112\1672544575_1.mp4",r"F:\wav2lip_tmp\601112\1672544575_1.srt"]
cut trims the video according to the already-edited .srt file, keeping only the spans that remain in it; a rough sketch follows.
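A sketch of that step (autocut's own implementation differs in detail; it uses the srt and moviepy packages):

import srt
from moviepy.editor import VideoFileClip, concatenate_videoclips

# keep only the time spans that survived the subtitle edit
with open(r"F:\wav2lip_tmp\601112\1672544575_1.srt", encoding="utf-8") as f:
    subs = list(srt.parse(f.read()))

video = VideoFileClip(r"F:\wav2lip_tmp\601112\1672544575_1.mp4")
clips = [video.subclip(s.start.total_seconds(), s.end.total_seconds()) for s in subs]
concatenate_videoclips(clips).write_videofile("1672544575_1_cut.mp4", bitrate="10m")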
4. Whisper small model architecture
ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=768, n_audio_head=12, n_audio_layer=12, n_vocab=51865, n_text_ctx=448, n_text_state=768, n_text_head=12, n_text_layer=12)
Whisper(
(encoder): AudioEncoder(
(conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
(blocks): ModuleList(
(0-11): 12 x ResidualAttentionBlock(
(attn): MultiHeadAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(out): Linear(in_features=768, out_features=768, bias=True)
)
(attn_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU(approximate='none')
(2): Linear(in_features=3072, out_features=768, bias=True)
)
(mlp_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(decoder): TextDecoder(
(token_embedding): Embedding(51865, 768)
(blocks): ModuleList(
(0-11): 12 x ResidualAttentionBlock(
(attn): MultiHeadAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(out): Linear(in_features=768, out_features=768, bias=True)
)
(attn_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(cross_attn): MultiHeadAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(out): Linear(in_features=768, out_features=768, bias=True)
)
(cross_attn_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(0): Linear(in_features=768, out_features=3072, bias=True)
(1): GELU(approximate='none')
(2): Linear(in_features=3072, out_features=768, bias=True)
)
(mlp_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
(ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
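The dimensions and module tree above can be reproduced directly:

import whisper

model = whisper.load_model("small")
print(model.dims)   # ModelDimensions(n_mels=80, ..., n_text_layer=12)
print(model)        # the Whisper encoder/decoder structure shown above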