Val Town is a social website to write and deploy JavaScript.
Build APIs and schedule functions from your browser.
Readme

Text Chunk Processor

This JavaScript function processes long text inputs by breaking them into smaller chunks, applying a custom processing function to each chunk, and optionally handling overlaps between chunks.

Function Signature

processTextInChunks(inputText, processFn, config)

Parameters

  • inputText (string): The long input text to be processed.
  • processFn (function, optional): A function to process each chunk. Defaults to logging the chunk.
  • config (object, optional): Configuration options.

Config Options

  • chunkLength (number): Length of each chunk. Default: 1000.
  • overlapLength (number): Length of overlap between chunks. Default: 0.

Usage

const longText = "Your very long text here...";

// processTextInChunks is async, so await the result.
const result = await processTextInChunks(
  longText,
  (mainChunk, overlapChunk, info) => {
    // Process the chunk here
    console.log(`Processing chunk from ${info.startIndex} to ${info.endIndex}`);
    console.log(`Main chunk: ${mainChunk}`);
    console.log(`Overlap: ${overlapChunk}`);
    return mainChunk.length; // Example return value
  },
  { chunkLength: 500, overlapLength: 50 }
);

console.log(result);

Notes

  • If processFn is not provided, the function will log each chunk and return the original text.
  • If processFn returns a value, these values will be collected in an array and returned.
  • If processFn doesn't return anything, the original input text is returned.
  • The processFn receives three arguments:
    1. mainChunk: The current chunk of text being processed.
    2. overlapChunk: The overlapping text from the previous chunk (empty for the first chunk).
    3. An info object containing:
      • startIndex: Start index of the current chunk in the original text.
      • endIndex: End index of the current chunk in the original text.
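      • isLastChunk: Boolean indicating if this is the last chunk.
      • chunkIndex: Zero-based index of the current chunk.
      • numChunks: Total number of chunks the text is split into.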

Use Cases

  • Processing large texts in smaller, manageable pieces.
  • Applying transformations or analysis to text while maintaining context through overlaps.
  • Tokenization or parsing of large documents.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/**
 * Splits `inputText` into fixed-size chunks and awaits `processFn` on each one in order.
 *
 * Every chunk after the first starts `overlapLength` characters before the end of the
 * previous chunk. `overlapChunk` is the (up to) `overlapLength` characters that
 * immediately precede the chunk's `startIndex`; it is empty for the first chunk.
 *
 * @param {string} inputText - The text to process.
 * @param {Function} [processFn] - Called as `processFn(mainChunk, overlapChunk, info)` where
 *   `info` is `{ startIndex, endIndex, isLastChunk, chunkIndex, numChunks }`. May be async.
 *   Defaults to logging the chunk.
 * @param {Object} [config] - Options.
 * @param {number} [config.chunkLength=1000] - Maximum length of each chunk.
 * @param {number} [config.overlapLength=0] - Number of characters shared between consecutive chunks.
 * @returns {Promise<Array|string>} The non-`undefined` values returned by `processFn`,
 *   or the original `inputText` when no values were collected.
 * @throws {Error} If `chunkLength` is not greater than `overlapLength` (surfaces as a
 *   rejected promise because the function is async).
 */
export async function processTextInChunks(inputText, processFn = async (text) => console.log(text), config = {}) {
  const {
    chunkLength = 1000,
    overlapLength = 0
  } = config;

  // Without this guard the stride is zero or negative: the loop would either never
  // advance or the text would be silently skipped. Same check as preprocessTextChunks.
  if (chunkLength <= overlapLength) {
    throw new Error("chunkLength must be greater than overlapLength");
  }

  const textLength = inputText.length;
  const stride = chunkLength - overlapLength;
  // A non-empty text always yields at least one chunk, even when it is shorter than
  // overlapLength (where the raw formula would produce zero or a negative count).
  const numChunks = textLength === 0
    ? 0
    : Math.max(1, Math.ceil((textLength - overlapLength) / stride));

  const results = [];
  let startIndex = 0;
  let chunkIndex = 0;

  // Iterate until the end of the text is reached rather than trusting a precomputed count.
  while (startIndex < textLength) {
    const endIndex = Math.min(startIndex + chunkLength, textLength);
    const mainChunk = inputText.slice(startIndex, endIndex);
    const isLastChunk = endIndex === textLength;

    let overlapChunk = '';
    if (startIndex > 0) {
      const overlapStartIndex = Math.max(0, startIndex - overlapLength);
      overlapChunk = inputText.slice(overlapStartIndex, startIndex);
    }

    const result = await processFn(mainChunk, overlapChunk, {
      startIndex,
      endIndex,
      isLastChunk,
      chunkIndex,
      numChunks
    });
    if (result !== undefined) {
      results.push(result);
    }

    if (isLastChunk) break;
    // Step back by the overlap so the next chunk re-reads the tail of this one.
    startIndex = endIndex - overlapLength;
    chunkIndex += 1;
  }

  return results.length > 0 ? results : inputText;
}
/**
 * Describes how `inputText` would be chunked without running any processing on it.
 *
 * @param {string} inputText - The text to split.
 * @param {Object} [config] - Options.
 * @param {number} [config.chunkLength=1000] - Maximum length of each chunk.
 * @param {number} [config.overlapLength=0] - Number of characters shared between consecutive chunks.
 * @returns {{numChunks: number, chunks: Array<Object>}} The chunk count and, per chunk,
 *   `{ mainChunk, overlapChunk, startIndex, endIndex, isLastChunk }`.
 * @throws {Error} If `chunkLength` is not greater than `overlapLength`.
 */
export function preprocessTextChunks(inputText, config = {}) {
  const { chunkLength = 1000, overlapLength = 0 } = config;

  if (overlapLength >= chunkLength) {
    throw new Error("chunkLength must be greater than overlapLength");
  }

  const total = inputText.length;
  const chunks = [];

  // The cursor is advanced inside the body because the step depends on where the chunk ended.
  for (let cursor = 0; cursor < total;) {
    const stop = Math.min(cursor + chunkLength, total);
    const reachedEnd = stop === total;

    // The first chunk has nothing before it, so its overlap is empty.
    const overlapChunk = cursor > 0
      ? inputText.slice(Math.max(0, cursor - overlapLength), cursor)
      : '';

    chunks.push({
      mainChunk: inputText.slice(cursor, stop),
      overlapChunk,
      startIndex: cursor,
      endIndex: stop,
      isLastChunk: reachedEnd
    });

    if (reachedEnd) break;
    cursor = stop - overlapLength;
  }

  return { numChunks: chunks.length, chunks };
}
//
// Example usage:
//
// import { blobby } from "https://esm.town/v/yawnxyz/blobby";
June 21, 2024