最近在研究如何将图片和文本批量合成为带字幕的口播视频，
主要基于 Python 的 moviepy 库实现。入口脚本如下：
from generator import audio, pics, subs, video
def main():
    """Run the whole picture + text -> subtitled video pipeline once.

    Both the text input and the picture input use the name 'example'.
    The steps are executed strictly in the order written below.
    """
    text_name, pics_name = 'example', 'example'

    # Preprocess the picture resolution.
    pics.adjust(pics_name)

    # Convert the text to speech.
    audio.text_to_audio(text_name)

    # Turn the speech (together with the pictures) into a video.
    video.audio_to_video(text_name, pics_name)

    # Generate the subtitles.
    subs.download_subs(text_name)

    # Generate the subtitled video.
    video.attach_subs(text_name)


if __name__ == "__main__":
    main()
以下是核心的图片 + 文本转视频逻辑：
import json
from mutagen.mp3 import MP3
from moviepy import editor
from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
import generator.pics as pics
def audio_to_video(text_input, pics_input, output_fps=10):
    """Combine a narration MP3 and a directory of pictures into an MP4.

    The pictures are shown as a slideshow whose frame rate is chosen so that
    the whole sequence lasts exactly as long as the audio track
    (``number_of_images / audio_length``).

    Args:
        text_input: Name of the text input. Spaces are replaced with
            underscores to derive ``./output/audios/<name>.mp3`` (read) and
            ``./output/videos/<name>.mp4`` (written).
        pics_input: Name of the picture set. Spaces are replaced with
            underscores to derive the ``./output/pics/<name>`` directory.
        output_fps: Frame rate of the encoded output file. Defaults to 10,
            the value that was previously hard-coded.

    Raises:
        ValueError: If no images are available or the audio has no duration,
            since no meaningful slideshow frame rate can be computed then.
    """
    slug = text_input.replace(' ', '_')
    audio_filepath = './output/audios/' + slug + '.mp3'
    video_filepath = './output/videos/' + slug + '.mp4'
    pics_dir = './output/pics/' + pics_input.replace(' ', '_')

    list_of_images = pics.preprocess_pics(pics_dir)
    image_count = len(list_of_images)
    if image_count == 0:
        raise ValueError('no images found for video: ' + pics_dir)

    # mutagen reads the duration from the MP3 metadata without decoding it.
    audio_length = MP3(audio_filepath).info.length
    if audio_length <= 0:
        raise ValueError('audio has no duration: ' + audio_filepath)

    # Slideshow rate: spread the images evenly over the audio duration.
    fps = image_count / audio_length

    # Build the video.
    # NOTE(review): the clip is built from the directory, not from
    # list_of_images; this presumes the directory holds exactly the images
    # counted above -- confirm against pics.preprocess_pics.
    video_clip = editor.ImageSequenceClip(pics_dir, fps=fps)
    audio_clip = editor.AudioFileClip(audio_filepath)
    try:
        final_video = video_clip.set_audio(audio_clip)
        final_video.write_videofile(
            video_filepath, codec="libx264", fps=output_fps
        )
    finally:
        # Release the audio reader even if encoding fails.
        audio_clip.close()
        video_clip.close()
以下是我实现的示例项目，欢迎参考，也欢迎 star！