|
| 1 | +//@ts-check |
| 2 | +// Helpers to work with different data types |
| 3 | +// by Humans for All |
| 4 | +// |
| 5 | + |
| 6 | +/** |
| 7 | + * Given the limited context size of local LLMs and , many a times when context gets filled |
| 8 | + * between the prompt and the response, it can lead to repeating text garbage generation. |
| 9 | + * And many a times setting penalty wrt repeatation leads to over-intelligent garbage |
| 10 | + * repeatation with slight variations. These garbage inturn can lead to overloading of the |
| 11 | + * available model context, leading to less valuable response for subsequent prompts/queries, |
| 12 | + * if chat history is sent to ai model. |
| 13 | + * |
| 14 | + * So two simple minded garbage trimming logics are experimented below. |
| 15 | + * * one based on progressively-larger-substring-based-repeat-matching-with-partial-skip and |
| 16 | + * * another based on char-histogram-driven garbage trimming. |
| 17 | + * * in future characteristic of histogram over varying lengths could be used to allow for |
| 18 | + * a more aggressive and adaptive trimming logic. |
| 19 | + */ |
| 20 | + |
| 21 | + |
| 22 | +/** |
| 23 | + * Simple minded logic to help remove repeating garbage at end of the string. |
| 24 | + * The repeatation needs to be perfectly matching. |
| 25 | + * |
| 26 | + * The logic progressively goes on probing for longer and longer substring based |
| 27 | + * repeatation, till there is no longer repeatation. Inturn picks the one with |
| 28 | + * the longest chain. |
| 29 | + * |
| 30 | + * @param {string} sIn |
| 31 | + * @param {number} maxSubL |
| 32 | + * @param {number} maxMatchLenThreshold |
| 33 | + */ |
| 34 | +export function trim_repeat_garbage_at_end(sIn, maxSubL=10, maxMatchLenThreshold=40) { |
| 35 | + let rCnt = [0]; |
| 36 | + let maxMatchLen = maxSubL; |
| 37 | + let iMML = -1; |
| 38 | + for(let subL=1; subL < maxSubL; subL++) { |
| 39 | + rCnt.push(0); |
| 40 | + let i; |
| 41 | + let refS = sIn.substring(sIn.length-subL, sIn.length); |
| 42 | + for(i=sIn.length; i > 0; i -= subL) { |
| 43 | + let curS = sIn.substring(i-subL, i); |
| 44 | + if (refS != curS) { |
| 45 | + let curMatchLen = rCnt[subL]*subL; |
| 46 | + if (maxMatchLen < curMatchLen) { |
| 47 | + maxMatchLen = curMatchLen; |
| 48 | + iMML = subL; |
| 49 | + } |
| 50 | + break; |
| 51 | + } |
| 52 | + rCnt[subL] += 1; |
| 53 | + } |
| 54 | + } |
| 55 | + console.debug("DBUG:DU:TrimRepeatGarbage:", rCnt); |
| 56 | + if ((iMML == -1) || (maxMatchLen < maxMatchLenThreshold)) { |
| 57 | + return {trimmed: false, data: sIn}; |
| 58 | + } |
| 59 | + console.debug("DBUG:TrimRepeatGarbage:TrimmedCharLen:", maxMatchLen); |
| 60 | + let iEnd = sIn.length - maxMatchLen; |
| 61 | + return { trimmed: true, data: sIn.substring(0, iEnd) }; |
| 62 | +} |
| 63 | + |
| 64 | + |
| 65 | +/** |
| 66 | + * Simple minded logic to help remove repeating garbage at end of the string, till it cant. |
| 67 | + * If its not able to trim, then it will try to skip a char at end and then trim, a few times. |
| 68 | + * This ensures that even if there are multiple runs of garbage with different patterns, the |
| 69 | + * logic still tries to munch through them. |
| 70 | + * |
| 71 | + * @param {string} sIn |
| 72 | + * @param {number} maxSubL |
| 73 | + * @param {number | undefined} [maxMatchLenThreshold] |
| 74 | + */ |
| 75 | +export function trim_repeat_garbage_at_end_loop(sIn, maxSubL, maxMatchLenThreshold, skipMax=16) { |
| 76 | + let sCur = sIn; |
| 77 | + let sSaved = ""; |
| 78 | + let iTry = 0; |
| 79 | + while(true) { |
| 80 | + let got = trim_repeat_garbage_at_end(sCur, maxSubL, maxMatchLenThreshold); |
| 81 | + if (got.trimmed != true) { |
| 82 | + if (iTry == 0) { |
| 83 | + sSaved = got.data; |
| 84 | + } |
| 85 | + iTry += 1; |
| 86 | + if (iTry >= skipMax) { |
| 87 | + return sSaved; |
| 88 | + } |
| 89 | + got.data = got.data.substring(0,got.data.length-1); |
| 90 | + } else { |
| 91 | + iTry = 0; |
| 92 | + } |
| 93 | + sCur = got.data; |
| 94 | + } |
| 95 | +} |
| 96 | + |
| 97 | + |
| 98 | +/** |
| 99 | + * A simple minded try trim garbage at end using histogram driven characteristics. |
| 100 | + * There can be variation in the repeatations, as long as no new char props up. |
| 101 | + * |
| 102 | + * This tracks the chars and their frequency in a specified length of substring at the end |
| 103 | + * and inturn checks if moving further into the generated text from the end remains within |
| 104 | + * the same char subset or goes beyond it and based on that either trims the string at the |
| 105 | + * end or not. This allows to filter garbage at the end, including even if there are certain |
| 106 | + * kind of small variations in the repeated text wrt position of seen chars. |
| 107 | + * |
| 108 | + * Allow the garbage to contain upto maxUniq chars, but at the same time ensure that |
| 109 | + * a given type of char ie numerals or alphabets or other types dont cross the specified |
| 110 | + * maxType limit. This allows intermixed text garbage to be identified and trimmed. |
| 111 | + * |
| 112 | + * ALERT: This is not perfect and only provides a rough garbage identification logic. |
| 113 | + * Also it currently only differentiates between character classes wrt english. |
| 114 | + * |
| 115 | + * @param {string} sIn |
| 116 | + * @param {number} maxType |
| 117 | + * @param {number} maxUniq |
| 118 | + * @param {number} maxMatchLenThreshold |
| 119 | + */ |
| 120 | +export function trim_hist_garbage_at_end(sIn, maxType, maxUniq, maxMatchLenThreshold) { |
| 121 | + if (sIn.length < maxMatchLenThreshold) { |
| 122 | + return { trimmed: false, data: sIn }; |
| 123 | + } |
| 124 | + let iAlp = 0; |
| 125 | + let iNum = 0; |
| 126 | + let iOth = 0; |
| 127 | + // Learn |
| 128 | + let hist = {}; |
| 129 | + let iUniq = 0; |
| 130 | + for(let i=0; i<maxMatchLenThreshold; i++) { |
| 131 | + let c = sIn[sIn.length-1-i]; |
| 132 | + if (c in hist) { |
| 133 | + hist[c] += 1; |
| 134 | + } else { |
| 135 | + if(c.match(/[0-9]/) != null) { |
| 136 | + iNum += 1; |
| 137 | + } else if(c.match(/[A-Za-z]/) != null) { |
| 138 | + iAlp += 1; |
| 139 | + } else { |
| 140 | + iOth += 1; |
| 141 | + } |
| 142 | + iUniq += 1; |
| 143 | + if (iUniq >= maxUniq) { |
| 144 | + break; |
| 145 | + } |
| 146 | + hist[c] = 1; |
| 147 | + } |
| 148 | + } |
| 149 | + console.debug("DBUG:TrimHistGarbage:", hist); |
| 150 | + if ((iAlp > maxType) || (iNum > maxType) || (iOth > maxType)) { |
| 151 | + return { trimmed: false, data: sIn }; |
| 152 | + } |
| 153 | + // Catch and Trim |
| 154 | + for(let i=0; i < sIn.length; i++) { |
| 155 | + let c = sIn[sIn.length-1-i]; |
| 156 | + if (!(c in hist)) { |
| 157 | + if (i < maxMatchLenThreshold) { |
| 158 | + return { trimmed: false, data: sIn }; |
| 159 | + } |
| 160 | + console.debug("DBUG:TrimHistGarbage:TrimmedCharLen:", i); |
| 161 | + return { trimmed: true, data: sIn.substring(0, sIn.length-i+1) }; |
| 162 | + } |
| 163 | + } |
| 164 | + console.debug("DBUG:TrimHistGarbage:Trimmed fully"); |
| 165 | + return { trimmed: true, data: "" }; |
| 166 | +} |
| 167 | + |
| 168 | +/** |
| 169 | + * Keep trimming repeatedly using hist_garbage logic, till you no longer can. |
| 170 | + * This ensures that even if there are multiple runs of garbage with different patterns, |
| 171 | + * the logic still tries to munch through them. |
| 172 | + * |
| 173 | + * @param {any} sIn |
| 174 | + * @param {number} maxType |
| 175 | + * @param {number} maxUniq |
| 176 | + * @param {number} maxMatchLenThreshold |
| 177 | + */ |
| 178 | +export function trim_hist_garbage_at_end_loop(sIn, maxType, maxUniq, maxMatchLenThreshold) { |
| 179 | + let sCur = sIn; |
| 180 | + while (true) { |
| 181 | + let got = trim_hist_garbage_at_end(sCur, maxType, maxUniq, maxMatchLenThreshold); |
| 182 | + if (!got.trimmed) { |
| 183 | + return got.data; |
| 184 | + } |
| 185 | + sCur = got.data; |
| 186 | + } |
| 187 | +} |
| 188 | + |
| 189 | +/** |
| 190 | + * Try trim garbage at the end by using both the hist-driven-garbage-trimming as well as |
| 191 | + * skip-a-bit-if-reqd-then-repeat-pattern-based-garbage-trimming, with blind retrying. |
| 192 | + * @param {string} sIn |
| 193 | + */ |
| 194 | +export function trim_garbage_at_end(sIn) { |
| 195 | + let sCur = sIn; |
| 196 | + for(let i=0; i<2; i++) { |
| 197 | + sCur = trim_hist_garbage_at_end_loop(sCur, 8, 24, 72); |
| 198 | + sCur = trim_repeat_garbage_at_end_loop(sCur, 32, 72, 12); |
| 199 | + } |
| 200 | + return sCur; |
| 201 | +} |
| 202 | + |
| 203 | + |
| 204 | +/** |
| 205 | + * NewLines array helper. |
| 206 | + * Allow for maintaining a list of lines. |
| 207 | + * Allow for a line to be builtup/appended part by part. |
| 208 | + */ |
| 209 | +export class NewLines { |
| 210 | + |
| 211 | + constructor() { |
| 212 | + /** @type {string[]} */ |
| 213 | + this.lines = []; |
| 214 | + } |
| 215 | + |
| 216 | + /** |
| 217 | + * Extracts lines from the passed string and inturn either |
| 218 | + * append to a previous partial line or add a new line. |
| 219 | + * @param {string} sLines |
| 220 | + */ |
| 221 | + add_append(sLines) { |
| 222 | + let aLines = sLines.split("\n"); |
| 223 | + let lCnt = 0; |
| 224 | + for(let line of aLines) { |
| 225 | + lCnt += 1; |
| 226 | + // Add back newline removed if any during split |
| 227 | + if (lCnt < aLines.length) { |
| 228 | + line += "\n"; |
| 229 | + } else { |
| 230 | + if (sLines.endsWith("\n")) { |
| 231 | + line += "\n"; |
| 232 | + } |
| 233 | + } |
| 234 | + // Append if required |
| 235 | + if (lCnt == 1) { |
| 236 | + let lastLine = this.lines[this.lines.length-1]; |
| 237 | + if (lastLine != undefined) { |
| 238 | + if (!lastLine.endsWith("\n")) { |
| 239 | + this.lines[this.lines.length-1] += line; |
| 240 | + continue; |
| 241 | + } |
| 242 | + } |
| 243 | + } |
| 244 | + // Add new line |
| 245 | + this.lines.push(line); |
| 246 | + } |
| 247 | + } |
| 248 | + |
| 249 | + /** |
| 250 | + * Shift the oldest/earliest/0th line in the array. [Old-New|Earliest-Latest] |
| 251 | + * Optionally control whether only full lines (ie those with newline at end) will be returned |
| 252 | + * or will a partial line without a newline at end (can only be the last line) be returned. |
| 253 | + * @param {boolean} bFullWithNewLineOnly |
| 254 | + */ |
| 255 | + shift(bFullWithNewLineOnly=true) { |
| 256 | + let line = this.lines[0]; |
| 257 | + if (line == undefined) { |
| 258 | + return undefined; |
| 259 | + } |
| 260 | + if ((line[line.length-1] != "\n") && bFullWithNewLineOnly){ |
| 261 | + return undefined; |
| 262 | + } |
| 263 | + return this.lines.shift(); |
| 264 | + } |
| 265 | + |
| 266 | +} |
0 commit comments