Skip to content

Commit 039ac79

Browse files
fix: Continue summarizer on error and add more logging (#2220)
The fix will allow the summarizer to continue even if some content files fail to process. It will also add more logging output for successful and failed content files. In case of failed content files, the error log will also be printed.
1 parent 69274bf commit 039ac79

File tree

6 files changed

+145
-18
lines changed

6 files changed

+145
-18
lines changed

docker-compose.services.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ services:
101101
environment:
102102
- KEYCLOAK_ADMIN=${KEYCLOAK_SVC_ADMIN:-admin}
103103
- KEYCLOAK_ADMIN_PASSWORD=${KEYCLOAK_SVC_ADMIN_PASSWORD:-admin}
104+
- "_JAVA_OPTIONS=${JAVA_OPTIONS:-}" # Load _JAVA_OPTIONS from env, fallback to empty string
104105
networks:
105106
default:
106107
aliases:

learning_resources/content_summarizer.py

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
from langchain_community.chat_models import ChatLiteLLM
88
from typing_extensions import TypedDict
99

10+
from learning_resources.exceptions import (
11+
FlashcardsGenerationError,
12+
SummaryGenerationError,
13+
)
1014
from learning_resources.models import (
1115
ContentFile,
1216
ContentSummarizerConfiguration,
@@ -124,31 +128,36 @@ def get_unprocessed_content_file_ids(
124128

125129
def summarize_content_files_by_ids(
126130
self, content_file_ids: list[int], overwrite
127-
) -> None:
131+
) -> list[str]:
128132
"""Process multiple content files by id.
129133
130134
Args:
131135
- ids (list[int]): List of content file ids to process
132136
- overwrite (bool): Whether to overwrite existing summary and flashcards
133137
134138
Returns:
135-
- None
139+
- list[str]: List of status messages for each content file
136140
"""
141+
status_messages = []
137142
for content_file_id in content_file_ids:
138-
self.summarize_single_content_file(content_file_id, overwrite=overwrite)
143+
status_msg = self.summarize_single_content_file(
144+
content_file_id, overwrite=overwrite
145+
)
146+
status_messages.append(status_msg)
147+
return status_messages
139148

140149
def summarize_single_content_file(
141150
self,
142151
content_file_id: int,
143152
overwrite,
144-
) -> None:
153+
) -> str:
145154
"""Process a single content file
146155
Args:
147156
- content_file_id (int): Id of the content file to process
148157
- overwrite (bool): Whether to overwrite existing summary and flashcards
149158
150159
Returns:
151-
- None
160+
- str: A string message indicating the status of the summarization
152161
"""
153162
try:
154163
with transaction.atomic():
@@ -175,10 +184,24 @@ def summarize_single_content_file(
175184

176185
if updated:
177186
content_file.save()
178-
179-
except Exception:
187+
return f"Summarization succeeded for CONTENT_FILE_ID: {content_file_id}" # noqa: E501
188+
return f"Summarization skipped for CONTENT_FILE_ID: {content_file_id}"
189+
except SummaryGenerationError as exc:
190+
# Log and return a specific readable error message when summary
191+
# generation fails.
192+
logger.exception("Error processing content: %d", content_file.id)
193+
return f"Summary generation failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n" # noqa: E501
194+
except FlashcardsGenerationError as exc:
195+
# Log and return a specific readable error message when flashcards
196+
# generation fails.
197+
return f"Flashcards generation failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n" # noqa: E501
198+
except Exception as exc:
199+
# Log and return a specific readable error message when an unknown
200+
# error occurs.
180201
logger.exception("Error processing content: %d", content_file.id)
181-
raise
202+
return (
203+
f"Summarization failed for CONTENT_FILE_ID: {content_file_id}\nError: {exc.args[0]}\n\n", # noqa: E501
204+
)
182205

183206
def _get_llm(self, model=None, temperature=0.0, max_tokens=1000) -> ChatLiteLLM:
184207
"""Get the ChatLiteLLM instance"""
@@ -216,11 +239,15 @@ def _generate_summary(self, content: str, llm_model: str) -> str:
216239
generated_summary = response.content
217240
logger.info("Generated summary: %s", generated_summary)
218241

219-
except Exception:
242+
except Exception as exc:
243+
# We do not want to raise the exception as is, we will log the exception and
244+
# raise SummaryGenerationError that will be used to make further decisions
245+
# in the code.
220246
logger.exception(
221247
"An error occurred while generating summary using model: %s", llm_model
222248
)
223-
raise
249+
raise SummaryGenerationError(exc) from exc
250+
224251
else:
225252
return generated_summary
226253

@@ -243,12 +270,14 @@ def _generate_flashcards(
243270
)
244271
generated_flashcards = response.get("flashcards")
245272
logger.info("Generated flashcards: %s", generated_flashcards)
246-
247-
except Exception:
273+
except Exception as exc:
274+
# We do not want to raise the exception as is, we will log the exception and
275+
# raise FlashcardsGenerationError that will be used to make further
276+
# decisions in the code.
248277
logger.exception(
249278
"An error occurred while generating flashcards using model: %s",
250279
llm_model,
251280
)
252-
raise
281+
raise FlashcardsGenerationError(exc) from exc
253282
else:
254283
return generated_flashcards

learning_resources/content_summarizer_test.py

Lines changed: 82 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66
PlatformType,
77
)
88
from learning_resources.content_summarizer import ContentSummarizer
9+
from learning_resources.exceptions import (
10+
FlashcardsGenerationError,
11+
SummaryGenerationError,
12+
)
913
from learning_resources.factories import (
1014
ContentFileFactory,
1115
ContentSummarizerConfigurationFactory,
@@ -242,9 +246,9 @@ def test_get_unprocessed_content_files_with_platform_and_config(
242246
def test_summarize_content_files_by_ids(
243247
processable_content_files, mock_summarize_single_content_file
244248
):
245-
"""The summarizer should process content files that are processable"""
249+
"""The summarizer should process content files that are processable and return the status results"""
246250
summarizer = ContentSummarizer()
247-
summarizer.summarize_content_files_by_ids(
251+
results = summarizer.summarize_content_files_by_ids(
248252
overwrite=False,
249253
content_file_ids=[
250254
content_file.id for content_file in processable_content_files
@@ -253,6 +257,8 @@ def test_summarize_content_files_by_ids(
253257
assert mock_summarize_single_content_file.call_count == len(
254258
processable_content_files
255259
)
260+
assert isinstance(results, list)
261+
assert len(results) == len(processable_content_files)
256262

257263

258264
def test_summarize_single_content_file(mocker, processable_content_files):
@@ -332,3 +338,77 @@ def test_process_single_file_calls_llm_summary(
332338
assert mock_instance.with_structured_output.call_count == 1
333339
elif has_flashcards:
334340
assert mock_instance.invoke.call_count == 1
341+
342+
343+
@pytest.mark.parametrize(
344+
("process_type", "expected_exception"),
345+
[("summary", SummaryGenerationError), ("flashcards", FlashcardsGenerationError)],
346+
)
347+
def test_generate_summary_flashcards_exception(
348+
mocker, processable_content_files, settings, process_type, expected_exception
349+
):
350+
"""Test the exception handling in the generate_summary and generate_flashcards methods"""
351+
settings.OPENAI_API_KEY = "test"
352+
summarizer = ContentSummarizer()
353+
content_file = processable_content_files[0]
354+
content_file.save()
355+
356+
# Mock the ChatLiteLLM class and its methods
357+
mock_chat_llm = mocker.patch(
358+
"learning_resources.content_summarizer.ChatLiteLLM", autospec=True
359+
)
360+
mock_instance = mock_chat_llm.return_value
361+
362+
# Mock the response for _generate_summary to raise an exception
363+
mock_instance.invoke.side_effect = Exception("Test exception")
364+
# Mock the response for _generate_flashcards to raise an exception
365+
mock_instance.with_structured_output.return_value.invoke.side_effect = Exception(
366+
"INVALID_FORMAT"
367+
)
368+
369+
if process_type == "summary":
370+
with pytest.raises(expected_exception):
371+
summarizer._generate_summary( # noqa: SLF001
372+
llm_model="llm_model", content=content_file.content
373+
)
374+
else:
375+
with pytest.raises(expected_exception):
376+
summarizer._generate_flashcards( # noqa: SLF001
377+
llm_model="llm_model", content=content_file.content
378+
)
379+
380+
381+
def test_summarize_single_content_file_with_exception(
382+
mocker, processable_content_files, settings
383+
):
384+
"""Test the exception handling in the summarize_single_content_file method"""
385+
settings.OPENAI_API_KEY = "test"
386+
summarizer = ContentSummarizer()
387+
content_file = processable_content_files[0]
388+
389+
# Mock the ChatLiteLLM class and its methods
390+
mock_chat_llm = mocker.patch(
391+
"learning_resources.content_summarizer.ChatLiteLLM", autospec=True
392+
)
393+
mock_instance = mock_chat_llm.return_value
394+
395+
# Mock the response for _generate_summary to raise an exception
396+
mock_instance.invoke.side_effect = Exception("Test exception")
397+
# Mock the response for _generate_flashcards to raise an exception
398+
mock_instance.with_structured_output.return_value.invoke.side_effect = Exception(
399+
"INVALID_FORMAT"
400+
)
401+
402+
error = summarizer.summarize_single_content_file(content_file.id, overwrite=False)
403+
assert (
404+
error
405+
== f"Summary generation failed for CONTENT_FILE_ID: {content_file.id}\nError: Test exception\n\n"
406+
)
407+
content_file.summary = "Test summary"
408+
content_file.save()
409+
content_file.refresh_from_db()
410+
error = summarizer.summarize_single_content_file(content_file.id, overwrite=False)
411+
assert (
412+
error
413+
== f"Flashcards generation failed for CONTENT_FILE_ID: {content_file.id}\nError: INVALID_FORMAT\n\n"
414+
)

learning_resources/exceptions.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,11 @@ class PostHogAuthenticationError(Exception):
1313

1414
class PostHogQueryError(Exception):
1515
"""Raised if the PostHog query API returns a non-authentication error."""
16+
17+
18+
class SummaryGenerationError(Exception):
19+
"""Raised if the summary generation fails for a content file."""
20+
21+
22+
class FlashcardsGenerationError(Exception):
23+
"""Raised if the flashcards generation fails for a content file."""

learning_resources/management/commands/generate_summary_flashcards.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Management command to run the content summarizer"""
22

3+
import itertools
4+
35
from django.conf import settings
46
from django.core.management import BaseCommand
57

@@ -91,9 +93,16 @@ def handle(self, *args, **options): # noqa: ARG002
9193
self.stdout.write("Waiting on task...")
9294

9395
start = now_in_utc()
94-
summarizer_task.get()
96+
results = summarizer_task.get()
97+
98+
# Log the summarization stats
99+
flat_results = list(itertools.chain(*results))
100+
for result in flat_results:
101+
self.stdout.write(f"{result}")
95102

96103
total_seconds = (now_in_utc() - start).total_seconds()
97104
self.stdout.write(
98-
f"Content file summarizer finished, took {total_seconds} seconds"
105+
self.style.SUCCESS(
106+
f"Content file summarizer finished, took {total_seconds} seconds"
107+
)
99108
)

learning_resources/tasks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@ def summarize_content_files_task(
424424
- None
425425
"""
426426
summarizer = ContentSummarizer()
427-
summarizer.summarize_content_files_by_ids(content_file_ids, overwrite)
427+
return summarizer.summarize_content_files_by_ids(content_file_ids, overwrite)
428428

429429

430430
@app.task(bind=True, acks_late=True)

0 commit comments

Comments
 (0)