Skip to content

Commit ebfe8b1

Browse files
authored
Merge pull request #72 from amihos/feat/improve-essence-extraction
feat(essence): improve sentence scoring for better title/header retention
2 parents 912b30d + 744152f commit ebfe8b1

File tree

1 file changed

+45
-14
lines changed

1 file changed

+45
-14
lines changed

backend/src/memory/hsg.ts

Lines changed: 45 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -336,8 +336,18 @@ export function extract_essence(
336336
.map((s) => s.trim())
337337
.filter((s) => s.length > 10);
338338
if (sents.length === 0) return raw.slice(0, max_len);
339-
const score_sent = (s: string): number => {
339+
const score_sent = (s: string, idx: number): number => {
340340
let sc = 0;
341+
// First sentence bonus - titles/headers are essential for retrieval
342+
if (idx === 0) sc += 10;
343+
// Second sentence often contains key context
344+
if (idx === 1) sc += 5;
345+
// Header/section markers (markdown or label-style)
346+
if (/^#+\s/.test(s) || /^[A-Z][A-Z\s]+:/.test(s)) sc += 8;
347+
// Colon-prefixed labels like "PROBLEM:", "SOLUTION:", "CONTEXT:"
348+
if (/^[A-Z][a-z]+:/i.test(s)) sc += 6;
349+
// Date patterns (ISO format)
350+
if (/\d{4}-\d{2}-\d{2}/.test(s)) sc += 7;
341351
if (
342352
/\b(january|february|march|april|may|june|july|august|september|october|november|december)\s+\d+/i.test(
343353
s,
@@ -347,7 +357,7 @@ export function extract_essence(
347357
if (/\$\d+|\d+\s*(miles|dollars|years|months|km)/.test(s)) sc += 4;
348358
if (/\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+/.test(s)) sc += 3;
349359
if (
350-
/\b(bought|purchased|serviced|visited|went|got|received|paid|earned|learned|discovered|found|saw|met|completed|finished)\b/i.test(
360+
/\b(bought|purchased|serviced|visited|went|got|received|paid|earned|learned|discovered|found|saw|met|completed|finished|fixed|implemented|created|updated|added|removed|resolved)\b/i.test(
351361
s,
352362
)
353363
)
@@ -357,21 +367,42 @@ export function extract_essence(
357367
if (/\b(I|my|me)\b/.test(s)) sc += 1;
358368
return sc;
359369
};
360-
const scored = sents.map((s) => ({ text: s, score: score_sent(s) }));
370+
const scored = sents.map((s, idx) => ({ text: s, score: score_sent(s, idx), idx }));
361371
scored.sort((a, b) => b.score - a.score);
372+
// Build result; the first sentence is always included when it is no longer than half of max_len
362373
let comp = "";
363-
for (const item of scored) {
364-
const cand = comp ? `${comp}. ${item.text}` : item.text;
365-
if (cand.length <= max_len) {
366-
comp = cand;
367-
} else if (comp.length < max_len * 0.7) {
368-
const rem = max_len - comp.length - 2;
369-
if (rem > 20) {
370-
comp += ". " + item.text.slice(0, rem);
374+
const firstSent = sents[0];
375+
if (firstSent && firstSent.length <= max_len * 0.5) {
376+
comp = firstSent;
377+
const remaining = scored.filter((item) => item.idx !== 0);
378+
for (const item of remaining) {
379+
const cand = comp ? `${comp}. ${item.text}` : item.text;
380+
if (cand.length <= max_len) {
381+
comp = cand;
382+
} else if (comp.length < max_len * 0.7) {
383+
const rem = max_len - comp.length - 2;
384+
if (rem > 20) {
385+
comp += ". " + item.text.slice(0, rem);
386+
}
387+
break;
388+
} else {
389+
break;
390+
}
391+
}
392+
} else {
393+
for (const item of scored) {
394+
const cand = comp ? `${comp}. ${item.text}` : item.text;
395+
if (cand.length <= max_len) {
396+
comp = cand;
397+
} else if (comp.length < max_len * 0.7) {
398+
const rem = max_len - comp.length - 2;
399+
if (rem > 20) {
400+
comp += ". " + item.text.slice(0, rem);
401+
}
402+
break;
403+
} else {
404+
break;
371405
}
372-
break;
373-
} else {
374-
break;
375406
}
376407
}
377408
return comp || raw.slice(0, max_len);

0 commit comments

Comments
 (0)