|
38 | 38 | from langchain_community.document_loaders import PDFMinerLoader |
39 | 39 | from langchain_community.document_loaders import PDFPlumberLoader |
40 | 40 | from langchain_community.document_loaders import OnlinePDFLoader |
41 | | -from langchain_community.document_loaders import YoutubeLoader |
42 | 41 | from langchain_community.document_loaders import SeleniumURLLoader |
43 | 42 | from langchain_community.document_loaders import PlaywrightURLLoader |
44 | 43 | from langchain_community.document_loaders import WebBaseLoader |
@@ -691,9 +690,7 @@ def load_youtube_video( |
691 | 690 |
|
692 | 691 | if youtube_audio_backend == "youtube": |
693 | 692 | whi(f"Using youtube.com loader: '{path}'") |
694 | | - fyu = YoutubeLoader.from_youtube_url |
695 | 693 | docs = cached_yt_loader( |
696 | | - loader=fyu, |
697 | 694 | path=path, |
698 | 695 | add_video_info=True, |
699 | 696 | language=youtube_language if youtube_language else [ |
@@ -2220,20 +2217,127 @@ def load_youtube_playlist(playlist_url: str) -> Any: |
2220 | 2217 |
|
2221 | 2218 |
|
# Matches what is stripped from subtitle text lines: bare timecodes,
# inline word-level timestamps (<00:00:01.000>) and <c>/</c> styling tags.
_VTT_INLINE_MARKUP = re.compile(
    r'(?:\d{2}:\d{2}:\d{2}\.\d{3})|(?:<\d{2}:\d{2}:\d{2}\.\d{3}>)|(?:</?c>)'
)
# On a cue timing line ("00:00:01.000 --> 00:00:03.000 align:start"), matches
# everything from the milliseconds of the start time to the end of the line.
_VTT_CUE_TAIL = re.compile(r"\.\d+ -->.*")


def _seconds_to_timecode(inp: float) -> str:
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` timecode."""
    minutes, seconds = divmod(int(float(inp)), 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def _timecode_to_second(inp: str) -> int:
    """Turn an ``HH:MM:SS`` timecode into seconds.

    Raises ValueError if *inp* is not made of exactly three integer fields.
    """
    hour, minute, second = map(int, inp.split(':'))
    return hour * 3600 + minute * 60 + second


def _is_timecode(inp: str) -> bool:
    """Return True if *inp* parses as an ``HH:MM:SS`` timecode."""
    try:
        _timecode_to_second(inp)
    except ValueError:
        return False
    return True


def _compact_vtt(sub: str, min_gap_seconds: int = 15) -> str:
    """Shrink a VTT transcript to reduce the number of tokens.

    Cue timing lines are reduced to their ``HH:MM:SS`` start time and kept at
    most once every *min_gap_seconds*; inline timestamps and ``<c>`` tags are
    removed from text lines; text lines are merged together, skipping a line
    that is already contained in the text accumulated so far.
    """
    latest_tc = None  # start time (seconds) of the last timecode we kept
    newlines: List[str] = []
    for raw in sub.splitlines():
        if " --> " in raw:
            line = _VTT_CUE_TAIL.sub("", raw).strip()
            try:
                tc = _timecode_to_second(line)
            except ValueError:
                # Not a parseable timing line: drop it rather than abort
                # the whole transcript.
                line = ""
            else:
                # The first timecode is always kept so that the beginning
                # of the transcript is anchored too.
                if latest_tc is not None and tc - latest_tc < min_gap_seconds:
                    line = ""
                else:
                    latest_tc = tc
        else:
            line = _VTT_INLINE_MARKUP.sub("", raw).strip()

        if _is_timecode(line):
            newlines.append(line + "\n")
        elif not newlines:
            newlines.append(line)
        elif _is_timecode(newlines[-1]):
            # int() tolerates the trailing "\n" added to stored timecodes
            newlines.append(line)
        elif line not in newlines[-1]:
            # also skips empty lines, as "" is contained in any string
            newlines[-1] = newlines[-1].strip() + " " + line.strip()

    return "\n".join(newlines)


@optional_typecheck
@doc_loaders_cache.cache
def cached_yt_loader(
        path: str,
        add_video_info: bool,
        language: List[str],
        translation: Optional[str]) -> List[Document]:
    """Load the subtitles of a youtube video as a single Document.

    The video information is fetched with youtube_dl (nothing is downloaded
    besides the subtitle file). Manually written subtitles are preferred over
    automatic captions; within each, the languages are tried in the order of
    `language` followed by `translation`.

    Parameters
    ----------
    path: url of the video.
    add_video_info: not used by this implementation. Kept so that existing
        callers keep working (NOTE(review): it presumably still takes part
        in the cache key of `doc_loaders_cache` -- confirm).
    language: subtitle language codes to look for, by order of preference.
    translation: optional extra language code tried after `language`.

    Returns
    -------
    A list containing one Document whose page_content is the compacted
    transcript and whose metadata holds the available `yt_*` fields
    (description, tags, chapters with HH:MM:SS bounds, ...).

    Raises
    ------
    Exception: if the video has no subtitles at all, or none in VTT format
        for the requested languages.
    requests.HTTPError: if the subtitle file could not be downloaded.
    """
    yel(f"Not using cache for youtube {path}")

    options = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': language,
        'skip_download': True,
        'subtitlesformat': 'vtt',
        'allsubtitles': True,
        'extract_flat': False,
    }
    # Keep the parameter untouched and build the list under a new name.
    translation_langs = [] if translation is None else [translation]
    wanted_langs = language + translation_langs

    with youtube_dl.YoutubeDL(options) as ydl:
        info = ydl.extract_info(path, download=False)

    title = info.get('fulltitle', None)

    # Both manual and automatic subs are candidates; the keys may be
    # missing or set to None.
    good_subs = info.get('subtitles') or {}
    auto_subs = info.get('automatic_captions') or {}

    if not good_subs and not auto_subs:
        raise Exception(f"No subtitles found for youtube video entitled '{title}' at link '{path}'")

    sub = None
    for subs in (good_subs, auto_subs):
        for lang in wanted_langs:
            vtt_url = next(
                (t["url"] for t in subs.get(lang) or [] if t.get("ext") == "vtt"),
                None,
            )
            if vtt_url is None:
                # language absent, or present without a vtt track
                continue
            response = requests.get(vtt_url)
            # Otherwise an error page would be parsed (and cached) as subtitles
            response.raise_for_status()
            sub = ftfy.fix_text(response.content.decode()).strip()
            break
        if sub is not None:
            break
    if sub is None:
        available = list(set(list(good_subs.keys()) + list(auto_subs.keys())))
        raise Exception(f"Subtitles found but not for the languages '{language}' nor '{translation_langs}' for youtube video entitled '{title}' at link '{path}'\nAvailable languages were: '{available}'")

    # get metadata too
    meta = {}
    for k in ["description", "categories", "tags", "channel", "upload_date", "duration_string", "language"]:
        if k in info:
            meta["yt_" + k] = info[k]

    # The chapters, if present, are in seconds, while the vtt uses human
    # readable timecodes, so the chapters are converted. Copies are made so
    # that `info` is left untouched.
    chapters = info.get("chapters")
    if chapters is not None:
        meta["yt_chapters"] = [
            {
                **ch,
                "start_time": _seconds_to_timecode(ch["start_time"]),
                "end_time": _seconds_to_timecode(ch["end_time"]),
            }
            for ch in chapters
        ]

    docs = [
        Document(
            page_content=_compact_vtt(sub),
            metadata=meta,
        )
    ]

    return docs
2238 | 2342 |
|
2239 | 2343 |
|
|
0 commit comments