1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/**
 * Walks `inputText` in consecutive windows and awaits `processFn` on each one, in order.
 *
 * Each window is `chunkLength` characters long (the last one may be shorter) and the
 * next window starts `overlapLength` characters before the previous window's end.
 *
 * @param {string} inputText - Text to split.
 * @param {(mainChunk: string, overlapChunk: string, info: {
 *   startIndex: number,
 *   endIndex: number,
 *   isLastChunk: boolean,
 *   chunkIndex: number,
 *   numChunks: number
 * }) => any} [processFn] - Called once per chunk and awaited before the next chunk starts.
 *   `overlapChunk` is the text (up to `overlapLength` characters) immediately preceding
 *   `startIndex`; it is `''` for the first chunk. Defaults to logging the chunk.
 * @param {{chunkLength?: number, overlapLength?: number}} [config] - Chunking options
 *   (defaults: `chunkLength` 1000, `overlapLength` 0).
 * @returns {Promise<Array<any>|string>} The non-`undefined` values returned by `processFn`,
 *   or `inputText` itself when no such values were collected.
 * @throws {Error} If `chunkLength` is not greater than `overlapLength` (surfaces as a
 *   rejected promise because the function is async).
 */
export async function processTextInChunks(inputText, processFn = async (text) => console.log(text), config = {}) {
  const {
    chunkLength = 1000,
    overlapLength = 0
  } = config;
  // A non-positive stride would never advance through the text.
  if (chunkLength <= overlapLength) {
    throw new Error("chunkLength must be greater than overlapLength");
  }

  const textLength = inputText.length;
  const stride = chunkLength - overlapLength;
  // The raw formula goes to zero or negative when the text is no longer than the
  // overlap, so clamp to one chunk: any non-empty text must be processed at least once.
  const numChunks = textLength === 0
    ? 0
    : Math.max(1, Math.ceil((textLength - overlapLength) / stride));

  const results = [];
  let startIndex = 0;
  for (let chunkIndex = 0; chunkIndex < numChunks; chunkIndex++) {
    const endIndex = Math.min(startIndex + chunkLength, textLength);
    const isLastChunk = endIndex === textLength;
    const mainChunk = inputText.slice(startIndex, endIndex);
    const overlapChunk = startIndex > 0
      ? inputText.slice(Math.max(0, startIndex - overlapLength), startIndex)
      : '';

    const result = await processFn(mainChunk, overlapChunk, {
      startIndex,
      endIndex,
      isLastChunk,
      chunkIndex,
      numChunks
    });
    if (result !== undefined) {
      results.push(result);
    }

    startIndex = endIndex - overlapLength;
    if (isLastChunk || startIndex >= textLength) break;
  }
  return results.length > 0 ? results : inputText;
}
/**
 * Describes how `inputText` would be split into chunks, without processing anything.
 *
 * @param {string} inputText - Text to split.
 * @param {{chunkLength?: number, overlapLength?: number}} [config] - Chunking options
 *   (defaults: `chunkLength` 1000, `overlapLength` 0).
 * @returns {{numChunks: number, chunks: Array<{
 *   mainChunk: string,
 *   overlapChunk: string,
 *   startIndex: number,
 *   endIndex: number,
 *   isLastChunk: boolean
 * }>}} The chunk descriptors in order, plus their count.
 * @throws {Error} If `chunkLength` is not greater than `overlapLength`.
 */
export function preprocessTextChunks(inputText, config = {}) {
  const { chunkLength = 1000, overlapLength = 0 } = config;
  if (chunkLength <= overlapLength) {
    throw new Error("chunkLength must be greater than overlapLength");
  }

  const totalLength = inputText.length;
  const chunks = [];
  for (let from = 0; from < totalLength;) {
    const to = Math.min(from + chunkLength, totalLength);
    const reachedEnd = to === totalLength;
    chunks.push({
      mainChunk: inputText.slice(from, to),
      // Text immediately before this chunk; the first chunk has none.
      overlapChunk: from > 0
        ? inputText.slice(Math.max(0, from - overlapLength), from)
        : '',
      startIndex: from,
      endIndex: to,
      isLastChunk: reachedEnd
    });
    if (reachedEnd) break;
    // Step back by the overlap so consecutive chunks share that many characters.
    from = to - overlapLength;
  }

  return { numChunks: chunks.length, chunks };
}
//
// Example usage:
//
// import { blobby } from "https://esm.town/v/yawnxyz/blobby";