-
Notifications
You must be signed in to change notification settings - Fork 1
/
subtitles.py
205 lines (155 loc) · 8.01 KB
/
subtitles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import os
import whisper
from moviepy.editor import TextClip, CompositeVideoClip, VideoFileClip, ColorClip
class SubtitleGenerator:
    """Burn word-by-word highlighted subtitles into a video.

    Workflow:
        1. ``transcribe_audio()`` runs Whisper on the audio track and groups
           the word-level timestamps into subtitle lines.
        2. ``process_subtitles()`` renders each line as positioned TextClips
           (plus a highlight clip per word, shown only during that word's
           time span), composites them over the input video, and writes
           ``output.mp4``.
    """

    # Line-splitting limits used by split_text_into_lines().
    MAX_CHARS = 30      # flush once the joined line text exceeds this length
    MAX_DURATION = 2.5  # max seconds of spoken audio in one subtitle line
    MAX_GAP = 1.5       # a silence longer than this (s) forces a line break

    def __init__(self, audio_filename, video_filename, model_path="medium"):
        """
        :param audio_filename: path of the audio file fed to Whisper
        :param video_filename: path of the video the captions are burnt into
        :param model_path: Whisper model name or path (e.g. "tiny", "medium")
        """
        self.audio_filename = audio_filename
        self.video_filename = video_filename
        self.model_path = model_path
        # Raw string: the original literal relied on '\P', '\I', '\m' being
        # left alone, which is a deprecated invalid-escape behaviour.
        self.IMAGEMAGICK_BINARY = os.getenv(
            'IMAGEMAGICK_BINARY',
            r'C:\Program Files\ImageMagick-7.1.1-Q16-HDRI\magick.exe')
        self.frame_size = None            # (width, height); set by process_subtitles()
        self.linelevel_subtitles = None   # line dicts built by transcribe_audio()
        self.all_linelevel_splits = None  # per-line composited caption clips

    def load_audio_model(self):
        """Load and return the Whisper model named by ``self.model_path``."""
        return whisper.load_model(self.model_path)

    def transcribe_audio(self):
        """Transcribe the audio with word timestamps and build line-level subtitles.

        Populates ``self.linelevel_subtitles``.
        """
        model = self.load_audio_model()
        result = model.transcribe(self.audio_filename, word_timestamps=True)
        wordlevel_info = [
            {'word': word['word'].strip(),
             'start': word['start'],
             'end': word['end']}
            for segment in result['segments']
            for word in segment['words']
        ]
        self.linelevel_subtitles = self.split_text_into_lines(wordlevel_info)

    def create_caption(self, textJSON, font="Helvetica-Bold", color='white',
                       highlight_color='yellow', stroke_color='black',
                       stroke_width=1.5):
        """Build the TextClips for one subtitle line.

        Words are laid out left-to-right and wrapped to a new row when the
        next word (plus a trailing space) would overflow the usable width
        (frame width minus a 10% buffer on each side). Every word also gets
        a highlight-coloured clip that is only visible while it is spoken.

        :param textJSON: one line dict from split_text_into_lines()
        :returns: ``(word_clips, xy_textclips_positions)`` — the clips to
                  composite, and per-word layout/timing metadata
        """
        full_duration = textJSON['end'] - textJSON['start']
        word_clips = []
        xy_textclips_positions = []

        x_pos = 0
        y_pos = 0
        line_width = 0  # total width of words already placed on the current row

        frame_width, frame_height = self.frame_size
        x_buffer = frame_width * 1 / 10            # 10% margin on each side
        max_line_width = frame_width - 2 * x_buffer
        fontsize = int(frame_height * 0.075)       # 7.5 percent of video height

        for wordJSON in textJSON['textcontents']:
            duration = wordJSON['end'] - wordJSON['start']
            word_clip = (TextClip(wordJSON['word'], font=font, fontsize=fontsize,
                                  color=color, stroke_color=stroke_color,
                                  stroke_width=stroke_width)
                         .set_start(textJSON['start'])
                         .set_duration(full_duration))
            word_clip_space = (TextClip(" ", font=font, fontsize=fontsize,
                                        color=color)
                               .set_start(textJSON['start'])
                               .set_duration(full_duration))
            word_width, word_height = word_clip.size
            space_width, _space_height = word_clip_space.size

            if line_width + word_width + space_width > max_line_width:
                # The word would overflow this row: wrap before placing it.
                x_pos = 0
                y_pos = y_pos + word_height + 10
                line_width = 0

            # Record where this word lands so highlights and the backdrop
            # can be sized/placed later.
            xy_textclips_positions.append({
                "x_pos": x_pos,
                "y_pos": y_pos,
                "width": word_width,
                "height": word_height,
                "word": wordJSON['word'],
                "start": wordJSON['start'],
                "end": wordJSON['end'],
                "duration": duration
            })

            word_clip = word_clip.set_position((x_pos, y_pos))
            word_clip_space = word_clip_space.set_position(
                (x_pos + word_width, y_pos))
            x_pos += word_width + space_width
            line_width += word_width + space_width

            word_clips.append(word_clip)
            word_clips.append(word_clip_space)

        # One highlight clip per word, visible only for that word's duration.
        for highlight_word in xy_textclips_positions:
            word_clip_highlight = (TextClip(highlight_word['word'], font=font,
                                            fontsize=fontsize,
                                            color=highlight_color,
                                            stroke_color=stroke_color,
                                            stroke_width=stroke_width)
                                   .set_start(highlight_word['start'])
                                   .set_duration(highlight_word['duration'])
                                   .set_position((highlight_word['x_pos'],
                                                  highlight_word['y_pos'])))
            word_clips.append(word_clip_highlight)

        return word_clips, xy_textclips_positions

    def process_subtitles(self):
        """Composite every subtitle line over the video and write output.mp4.

        Each line is backed by a semi-transparent grey box sized ~10% larger
        than the line's bounding box and anchored at the bottom of the frame.

        :returns: the output filename, ``"output.mp4"``
        """
        input_video = VideoFileClip(self.video_filename)
        self.frame_size = input_video.size

        self.all_linelevel_splits = []
        for line in self.linelevel_subtitles:
            out_clips, positions = self.create_caption(line)

            # Bounding box of all placed words in this line.
            max_width = max((p['x_pos'] + p['width'] for p in positions),
                            default=0)
            max_height = max((p['y_pos'] + p['height'] for p in positions),
                             default=0)

            color_clip = ColorClip(
                size=(int(max_width * 1.1), int(max_height * 1.1)),
                color=(64, 64, 64))
            color_clip = color_clip.set_opacity(.6)
            # Lead/trail the speech by 0.1 s; clamp so the first line
            # cannot start at a negative time.
            color_clip = (color_clip
                          .set_start(max(0.0, line['start'] - 0.1))
                          .set_duration(line['end'] - line['start'] + 0.2))

            clip_to_overlay = CompositeVideoClip([color_clip] + out_clips)
            clip_to_overlay = clip_to_overlay.set_position("bottom")
            self.all_linelevel_splits.append(clip_to_overlay)

        final_video = CompositeVideoClip([input_video] + self.all_linelevel_splits)
        final_video = final_video.set_audio(input_video.audio)
        final_video.write_videofile("output.mp4", fps=24, codec="libx264",
                                    audio_codec="aac")
        return "output.mp4"

    def split_text_into_lines(self, data):
        """Group word-level timestamps into subtitle lines.

        A line is flushed — including the word that triggered the flush,
        matching the original behaviour — when its accumulated spoken
        duration exceeds MAX_DURATION, its joined text exceeds MAX_CHARS,
        or the silence before the current word exceeds MAX_GAP.

        :param data: list of ``{'word', 'start', 'end'}`` dicts
        :returns: list of ``{'word', 'start', 'end', 'textcontents'}`` dicts
        """
        subtitles = []
        line = []
        line_duration = 0

        def flush():
            # Emit the current line as one subtitle entry.
            subtitles.append({
                "word": " ".join(item["word"] for item in line),
                "start": line[0]["start"],
                "end": line[-1]["end"],
                "textcontents": line,
            })

        for idx, word_data in enumerate(data):
            line.append(word_data)
            line_duration += word_data["end"] - word_data["start"]

            text_so_far = " ".join(item["word"] for item in line)
            duration_exceeded = line_duration > self.MAX_DURATION
            chars_exceeded = len(text_so_far) > self.MAX_CHARS
            if idx > 0:
                gap = word_data['start'] - data[idx - 1]['end']
                maxgap_exceeded = gap > self.MAX_GAP
            else:
                maxgap_exceeded = False

            if duration_exceeded or chars_exceeded or maxgap_exceeded:
                flush()
                line = []
                line_duration = 0

        if line:
            flush()
        return subtitles