Skip to content

Commit 391fab7

Browse files
feat and fix: proper youtube loader implementation using yt_dlp instead of yt_dl, with timecode-aware parsing of vtt subtitles
Signed-off-by: thiswillbeyourgithub <[email protected]>
1 parent 7678491 commit 391fab7

File tree

1 file changed

+115
-11
lines changed

1 file changed

+115
-11
lines changed

wdoc/utils/loaders.py

Lines changed: 115 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
from langchain_community.document_loaders import PDFMinerLoader
3939
from langchain_community.document_loaders import PDFPlumberLoader
4040
from langchain_community.document_loaders import OnlinePDFLoader
41-
from langchain_community.document_loaders import YoutubeLoader
4241
from langchain_community.document_loaders import SeleniumURLLoader
4342
from langchain_community.document_loaders import PlaywrightURLLoader
4443
from langchain_community.document_loaders import WebBaseLoader
@@ -691,9 +690,7 @@ def load_youtube_video(
691690

692691
if youtube_audio_backend == "youtube":
693692
whi(f"Using youtube.com loader: '{path}'")
694-
fyu = YoutubeLoader.from_youtube_url
695693
docs = cached_yt_loader(
696-
loader=fyu,
697694
path=path,
698695
add_video_info=True,
699696
language=youtube_language if youtube_language else [
@@ -2220,20 +2217,127 @@ def load_youtube_playlist(playlist_url: str) -> Any:
22202217

22212218

22222219
def _yt_seconds_to_timecode(seconds: float) -> str:
    """Turn a non-negative number of seconds into an 'HH:MM:SS' timecode.

    Fractions of a second are truncated, so the result lines up with the
    second-resolution timecodes kept in the condensed subtitles.
    """
    total = int(float(seconds))
    minutes, second = divmod(total, 60)
    hour, minute = divmod(minutes, 60)
    return f"{hour:02d}:{minute:02d}:{second:02d}"


def _yt_timecode_to_second(inp: str) -> int:
    """Turn an 'HH:MM:SS' timecode into seconds.

    Raises ValueError if `inp` is not made of exactly three ':'-separated
    integers.
    """
    hour, minute, second = map(int, inp.split(':'))
    return hour * 3600 + minute * 60 + second


def _yt_is_timecode(inp: str) -> bool:
    """Tell whether `inp` can be parsed by `_yt_timecode_to_second`."""
    try:
        _yt_timecode_to_second(inp)
    except ValueError:
        return False
    return True


def _yt_condense_vtt(sub: str, min_gap: int = 15) -> str:
    """Reduce the number of tokens of a vtt subtitle file.

    - cue lines ('start --> end ...') are reduced to their 'HH:MM:SS' start
      time, and only kept if at least `min_gap` seconds passed since the
      previously kept timecode
    - inline timestamps and <c> tags are stripped from the text lines
    - consecutive text lines are merged on a single line, skipping a line
      whose text is already contained in the current merged line

    Raises ValueError if a line containing ' --> ' does not start with an
    'HH:MM:SS.mmm' timecode.
    """
    inline_markup = re.compile(r'(?:\d{2}:\d{2}:\d{2}\.\d{3})|(?:<\d{2}:\d{2}:\d{2}\.\d{3}>)|(?:</?c>)')
    cue_tail = re.compile(r"\.\d+ -->.*")

    # NOTE(review): because of the initial -1, cues starting before
    # (min_gap - 1) seconds never get a timecode -- confirm this is intended
    latest_tc = -1
    newlines: List[str] = []
    for li in sub.splitlines():
        if " --> " in li:
            # keep only the start time, at second resolution
            li = cue_tail.sub("", li).strip()

            # remove timecodes that are too close to the last kept one
            tc = _yt_timecode_to_second(li)
            if tc - latest_tc < min_gap:
                li = ""
            else:
                latest_tc = tc
        else:
            li = inline_markup.sub("", li).strip()

        if _yt_is_timecode(li):
            newlines.append(li + "\n")
        elif not newlines or _yt_is_timecode(newlines[-1]):
            # first line overall, or first text line after a timecode
            newlines.append(li)
        elif li not in newlines[-1]:
            # an empty `li` is always "in" the previous line, so it is skipped
            newlines[-1] = newlines[-1].strip() + " " + li.strip()

    return "\n".join(newlines)


@optional_typecheck
@doc_loaders_cache.cache
def cached_yt_loader(
        path: str,
        add_video_info: bool,
        language: List[str],
        translation: Optional[str]) -> List[Document]:
    """Load the subtitles of a youtube video as a single Document.

    The video information is fetched with `youtube_dl.YoutubeDL` (nothing is
    downloaded except the vtt subtitle file). Manually written subtitles are
    preferred over automatic captions; inside each of those, the languages
    are tried in the order of `language` then `translation`.

    Parameters
    ----------
    path: link of the youtube video
    add_video_info: not read by this function, kept so that existing callers
        keep working
    language: subtitle languages to look for, by order of preference
    translation: optional extra language, tried after those of `language`

    Returns
    -------
    A list containing one Document: its page_content is the condensed
    subtitles and its metadata holds the available 'yt_*' video information
    (including 'yt_chapters' with 'HH:MM:SS' start/end times).

    Raises
    ------
    Exception if the video has no subtitles at all, or none in vtt format
    for the requested languages. Errors from the subtitle download
    (`requests`) are propagated.
    """
    yel(f"Not using cache for youtube {path}")

    options = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': language,
        'skip_download': True,
        'subtitlesformat': 'vtt',
        'allsubtitles': True,
        'extract_flat': False,
    }
    # always a list, so that it can be concatenated to `language`
    translation_langs = [translation] if translation is not None else []
    wanted_langs = language + translation_langs

    with youtube_dl.YoutubeDL(options) as ydl:
        info = ydl.extract_info(path, download=False)

    title = info.get('fulltitle', None)

    # Check both manual and auto subs
    good_subs = info.get('subtitles') or {}
    auto_subs = info.get('automatic_captions') or {}

    if not good_subs and not auto_subs:
        raise Exception(f"No subtitles found for youtube video entitled '{title}' at link '{path}'")

    # manual subtitles have priority over the automatic ones
    sub_url = None
    for subs in (good_subs, auto_subs):
        for lang in wanted_langs:
            vtt_urls = [
                s["url"]
                for s in (subs.get(lang) or [])
                if s.get("ext") == "vtt"
            ]
            if vtt_urls:
                sub_url = vtt_urls[0]
                break
        if sub_url is not None:
            break

    if sub_url is None:
        available = list(set(list(good_subs.keys()) + list(auto_subs.keys())))
        raise Exception(f"Subtitles found but not for the languages '{language}' nor '{translation_langs}' for youtube video entitled '{title}' at link '{path}'\nAvailable languages were: '{available}'")

    # fail loudly instead of parsing an http error page as subtitles
    response = requests.get(sub_url, timeout=60)
    response.raise_for_status()
    sub = ftfy.fix_text(response.content.decode()).strip()

    # get metadata too
    meta = {}
    for k in ["description", "categories", "tags", "channel", "upload_date", "duration_string", "language"]:
        if k in info:
            meta["yt_" + k] = info[k]

    # the chapters, if present, are in seconds, while the vtt uses human
    # readable timecodes so converting the chapters. The key can be present
    # with a None value, and the dicts of `info` are left untouched.
    chapters = info.get("chapters")
    if chapters is not None:
        meta["yt_chapters"] = [
            {
                **ch,
                "start_time": _yt_seconds_to_timecode(ch["start_time"]),
                "end_time": _yt_seconds_to_timecode(ch["end_time"]),
            }
            for ch in chapters
        ]

    # reduce greatly the number of token in the subtitles by removing some
    # less important formatting
    content = _yt_condense_vtt(sub)

    docs = [
        Document(
            page_content=content,
            metadata=meta,
        )
    ]

    return docs
22382342

22392343

0 commit comments

Comments
 (0)