chunkx/chunker.go at main · gomantics/chunkx · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
// Package chunkx provides AST-based code chunking using the CAST algorithm.
//
// ChunkX implements the CAST (Chunking via Abstract Syntax Trees) method for
// semantically-aware code chunking. Unlike line-based chunking, CAST respects
// code structure by parsing source into an AST and creating chunks that align
// with syntactic boundaries (functions, classes, methods).
//
// Basic usage:
//
//	chunker := chunkx.NewChunker()
//	chunks, err := chunker.Chunk(code, chunkx.WithLanguage(languages.Go))
//
// Supports 30+ languages including Bash, C, C++, C#, CSS, Cue, Dockerfile, Elixir,
// Elm, Go, Groovy, HCL, HTML, Java, JavaScript, Kotlin, Lua, Markdown, OCaml, PHP,
// Protobuf, Python, Ruby, Rust, Scala, SQL, Svelte, Swift, TOML, TypeScript, and YAML.
//
// For unsupported file types, the chunker automatically falls back to a generic
// line-based chunking algorithm.
package chunkx

import (
	"fmt"
	"os"
	"sort"
	"strings"

	"github.com/gomantics/chunkx/languages"
	sitter "github.com/smacker/go-tree-sitter"
)

// Default configuration values.
const (
	// DefaultMaxSize is the default maximum chunk size in tokens, as measured
	// by the configured TokenCounter.
	DefaultMaxSize = 1500

	// DefaultOverlap is the default overlap percentage between chunks.
	// Overlap is only applied when the configured value is greater than zero,
	// so this default leaves overlap disabled.
	DefaultOverlap = 0

	// MaxOverlap is the maximum allowed overlap percentage. WithOverlap clamps
	// larger values down to this limit.
	MaxOverlap = 50
)

// Chunker provides AST-based code chunking capabilities.
type Chunker interface {
	// Chunk splits the given source text into chunks. Behavior is tuned via
	// opts; the castChunker implementation requires a language to be set
	// (see WithLanguage) and returns ErrLanguageNotSpecified otherwise.
	Chunk(code string, opts ...Option) ([]Chunk, error)

	// ChunkFile reads the file at path and splits its content into chunks.
	// The castChunker implementation detects the language from the path when
	// none is supplied through opts.
	ChunkFile(path string, opts ...Option) ([]Chunk, error)
}

// castChunker implements the CAST algorithm for code chunking.
type castChunker struct {
	// parser turns source text into the syntax tree that is walked to build
	// chunks (used through its Parse and ParseFile methods).
	parser *Parser
}

// NewChunker returns a Chunker backed by the CAST implementation, wired to a
// freshly created Parser.
func NewChunker() Chunker {
	chunker := new(castChunker)
	chunker.parser = NewParser()
	return chunker
}

// config holds the configuration for chunking operations. It is created by
// newDefaultConfig and then mutated by the Option values passed to Chunk or
// ChunkFile.
type config struct {
	// maxSize is the largest size, in tokens, a chunk is allowed to reach
	// before its content is split further.
	maxSize int
	// overlap is the overlap between neighboring chunks, as a percentage of
	// the chunk's content length; values <= 0 disable overlap.
	overlap float64
	// language selects the parser; the empty value means "not specified".
	language languages.LanguageName
	// tokenCounter measures the size of nodes and lines in tokens.
	tokenCounter TokenCounter
}

// Option configures the chunker. Options are applied, in the order given, to
// a config pre-populated with the package defaults.
type Option func(*config)

// WithMaxSize returns an Option that sets the maximum chunk size, measured in
// tokens. The value is stored as given, without validation.
func WithMaxSize(tokens int) Option {
	return func(cfg *config) { cfg.maxSize = tokens }
}

// WithOverlap returns an Option that sets the overlap percentage between
// consecutive chunks. Values below 0 are raised to 0 and values above
// MaxOverlap are lowered to MaxOverlap.
func WithOverlap(percent float64) Option {
	return func(cfg *config) {
		switch {
		case percent < 0:
			cfg.overlap = 0
		case percent > MaxOverlap:
			cfg.overlap = MaxOverlap
		default:
			cfg.overlap = percent
		}
	}
}

// WithLanguage returns an Option that selects the language used for parsing.
// Use the exported constants: languages.Go, languages.Python, etc.
func WithLanguage(lang languages.LanguageName) Option {
	return func(cfg *config) { cfg.language = lang }
}

// WithTokenCounter returns an Option that replaces the default token counter
// with the supplied one. The counter is stored as given.
func WithTokenCounter(counter TokenCounter) Option {
	return func(cfg *config) { cfg.tokenCounter = counter }
}

// newDefaultConfig returns a config populated with the package defaults:
// DefaultMaxSize, DefaultOverlap, and a SimpleTokenCounter. The language is
// left unset.
func newDefaultConfig() *config {
	cfg := new(config)
	cfg.maxSize = DefaultMaxSize
	cfg.overlap = DefaultOverlap
	cfg.tokenCounter = &SimpleTokenCounter{}
	return cfg
}

// Chunk splits code into semantically coherent chunks.
//
// A language must be supplied through WithLanguage; otherwise
// ErrLanguageNotSpecified is returned. The generic language, and any input
// the parser rejects, is handled by the line-based fallback. Overlap is added
// afterwards when the configured percentage is greater than zero.
func (c *castChunker) Chunk(code string, opts ...Option) ([]Chunk, error) {
	cfg := newDefaultConfig()
	for _, apply := range opts {
		apply(cfg)
	}

	switch {
	case cfg.language == "":
		return nil, ErrLanguageNotSpecified
	case cfg.language == languages.Generic:
		// The generic language has no AST; chunk by lines.
		return c.chunkGeneric(code, cfg)
	}

	parsed, parseErr := c.parser.Parse(code, cfg.language)
	if parseErr != nil {
		// A parse failure is not fatal: degrade to line-based chunking.
		return c.chunkGeneric(code, cfg)
	}

	chunks, err := c.chunkCode(parsed.Tree.RootNode(), parsed.Source, cfg)
	if err != nil {
		return nil, err
	}

	if cfg.overlap <= 0 {
		return chunks, nil
	}
	return c.applyOverlap(chunks, cfg.overlap), nil
}

// ChunkFile reads the file at path and chunks its content.
//
// When a language is supplied through opts the work is delegated to Chunk.
// Otherwise the language is detected from the path: languages without a
// parser, and files that fail to parse, are chunked with the line-based
// fallback; everything else goes through the AST-based algorithm, followed by
// overlap when configured. A read failure is returned wrapped.
func (c *castChunker) ChunkFile(path string, opts ...Option) ([]Chunk, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("failed to read file: %w", err)
	}
	code := string(raw)

	cfg := newDefaultConfig()
	for _, apply := range opts {
		apply(cfg)
	}

	// An explicitly configured language takes the regular Chunk path.
	if cfg.language != "" {
		return c.Chunk(code, opts...)
	}

	// NOTE(review): the second result of DetectLanguage is discarded here;
	// confirm that the first result is always usable when detection fails.
	detected, _ := languages.DetectLanguage(path)
	cfg.language = detected.Name

	// No parser for this language: chunk by lines.
	if detected.GetParser == nil {
		return c.chunkGeneric(code, cfg)
	}

	parsed, err := c.parser.ParseFile(path, code)
	if err != nil {
		// A parse failure is not fatal: degrade to line-based chunking.
		return c.chunkGeneric(code, cfg)
	}

	chunks, err := c.chunkCode(parsed.Tree.RootNode(), parsed.Source, cfg)
	if err != nil {
		return nil, err
	}

	if cfg.overlap <= 0 {
		return chunks, nil
	}
	return c.applyOverlap(chunks, cfg.overlap), nil
}

// chunkCode is the entry point of the CAST algorithm for a single node.
//
// A node that fits within cfg.maxSize, or that has no children to split into,
// becomes one chunk. Any other node is broken up by handing its children to
// chunkNodes. Errors from measuring the node are wrapped with ErrNodeSize.
func (c *castChunker) chunkCode(node *sitter.Node, source []byte, cfg *config) ([]Chunk, error) {
	nodeSize, err := GetNodeSize(node, source, cfg.tokenCounter)
	if err != nil {
		return nil, fmt.Errorf("%w: %w", ErrNodeSize, err)
	}

	total := int(node.ChildCount())

	// Either the node fits, or it is an oversized leaf that cannot be split
	// any further; in both cases it is emitted as-is.
	if nodeSize <= cfg.maxSize || total == 0 {
		return []Chunk{c.nodeToChunk(node, source, cfg.language)}, nil
	}

	kids := make([]*sitter.Node, 0, total)
	for idx := 0; idx < total; idx++ {
		kid := node.Child(idx)
		if kid == nil {
			continue
		}
		kids = append(kids, kid)
	}

	return c.chunkNodes(kids, source, cfg)
}

// chunkNodes groups sibling nodes into chunks.
//
// Consecutive nodes are accumulated while their combined size stays within
// cfg.maxSize; once the next node would push the group over the limit, the
// group is emitted as one merged chunk. A node that is too large on its own is
// split recursively through chunkCode. Errors from measuring a node are
// wrapped with ErrNodeSize.
func (c *castChunker) chunkNodes(nodes []*sitter.Node, source []byte, cfg *config) ([]Chunk, error) {
	var (
		result      []Chunk
		pending     []*sitter.Node
		pendingSize int
	)

	// flush emits the accumulated group, if any, and starts a new one.
	flush := func() {
		if len(pending) == 0 {
			return
		}
		result = append(result, c.mergeNodesToChunk(pending, source, cfg.language))
		pending = nil
		pendingSize = 0
	}

	for _, current := range nodes {
		size, err := GetNodeSize(current, source, cfg.tokenCounter)
		if err != nil {
			return nil, fmt.Errorf("%w: %w", ErrNodeSize, err)
		}

		// The group cannot absorb this node without exceeding the limit.
		if pendingSize+size > cfg.maxSize {
			flush()
		}

		if size <= cfg.maxSize {
			pending = append(pending, current)
			pendingSize += size
			continue
		}

		// The node alone is too large: close any open group and split the
		// node itself.
		flush()
		split, err := c.chunkCode(current, source, cfg)
		if err != nil {
			return nil, err
		}
		result = append(result, split...)
	}

	// Emit whatever is left in the final group.
	flush()

	return result, nil
}

// nodeToChunk builds a Chunk from one node: its text, line and byte range,
// and the sorted, de-duplicated node types found in the node's subtree.
func (c *castChunker) nodeToChunk(node *sitter.Node, source []byte, language languages.LanguageName) Chunk {
	seen := make(map[string]bool)
	collectNodeTypes(node, seen)

	// Map iteration order is random; sort for stable output.
	kinds := make([]string, 0, len(seen))
	for kind := range seen {
		kinds = append(kinds, kind)
	}
	sort.Strings(kinds)

	result := Chunk{
		Content:   GetNodeText(node, source),
		StartByte: int(node.StartByte()),
		EndByte:   int(node.EndByte()),
		NodeTypes: kinds,
		Language:  language,
	}
	result.StartLine, result.EndLine = GetLineNumbers(node)

	return result
}

// mergeNodesToChunk builds one Chunk spanning from the start of the first
// node to the end of the last node, including any source text between them.
// NodeTypes holds the sorted, de-duplicated node types of all given subtrees.
// With no nodes, a Chunk carrying only the language is returned.
func (c *castChunker) mergeNodesToChunk(nodes []*sitter.Node, source []byte, language languages.LanguageName) Chunk {
	if len(nodes) == 0 {
		return Chunk{Language: language}
	}

	head, tail := nodes[0], nodes[len(nodes)-1]

	seen := make(map[string]bool)
	for _, n := range nodes {
		collectNodeTypes(n, seen)
	}

	// Map iteration order is random; sort for stable output.
	kinds := make([]string, 0, len(seen))
	for kind := range seen {
		kinds = append(kinds, kind)
	}
	sort.Strings(kinds)

	from, to := head.StartByte(), tail.EndByte()
	firstLine, _ := GetLineNumbers(head)
	_, lastLine := GetLineNumbers(tail)

	merged := Chunk{Language: language, NodeTypes: kinds}
	merged.Content = string(source[from:to])
	merged.StartLine, merged.EndLine = firstLine, lastLine
	merged.StartByte, merged.EndByte = int(from), int(to)

	return merged
}

// applyOverlap adds overlap between consecutive chunks.
//
// For every chunk, the overlap size is overlapPercent of that chunk's own
// content length in bytes. Up to that many bytes are borrowed from the end of
// the previous chunk and from the start of the next chunk, and joined to the
// chunk's content with a newline. The chunk's byte and line ranges are widened
// to cover the borrowed text. The input slice is not modified; a new slice is
// returned, except when there is nothing to do (fewer than two chunks or a
// non-positive percentage), in which case chunks is returned as-is.
//
// Borrowed text is always cut on a UTF-8 character boundary, so the overlap
// may be a few bytes shorter than requested but never splits a multi-byte
// character.
func (c *castChunker) applyOverlap(chunks []Chunk, overlapPercent float64) []Chunk {
	if len(chunks) <= 1 || overlapPercent <= 0 {
		return chunks
	}

	// isRuneStart reports whether b can begin a UTF-8 sequence, i.e. it is
	// not a continuation byte of the form 10xxxxxx.
	isRuneStart := func(b byte) bool { return b&0xC0 != 0x80 }

	overlappedChunks := make([]Chunk, 0, len(chunks))

	for i := range chunks {
		chunk := chunks[i]

		// Computed once, from the chunk's original content, so that the
		// leading and trailing overlap use the same budget.
		overlapSize := int(float64(len(chunk.Content)) * (overlapPercent / 100.0))

		// Add content from previous chunk if available
		if i > 0 && overlapSize > 0 {
			prevChunk := chunks[i-1]
			prevContent := prevChunk.Content

			// Take the last overlapSize bytes, then move forward to the next
			// character boundary so no multi-byte character is cut in half.
			startIdx := max(len(prevContent)-overlapSize, 0)
			for startIdx < len(prevContent) && !isRuneStart(prevContent[startIdx]) {
				startIdx++
			}

			chunk.Content = prevContent[startIdx:] + "\n" + chunk.Content
			// Adjust start position to reflect the overlap
			chunk.StartByte -= len(prevContent) - startIdx

			// The borrowed text starts on the line that contains startIdx.
			// countLines yields newlines+1 for a non-empty prefix, hence the
			// -1; an empty prefix means the overlap starts on the previous
			// chunk's first line.
			chunk.StartLine = prevChunk.StartLine
			if startIdx > 0 {
				chunk.StartLine += countLines(prevContent[:startIdx]) - 1
			}
		}

		// Add content from next chunk if available
		if i < len(chunks)-1 && overlapSize > 0 {
			nextChunk := chunks[i+1]
			nextContent := nextChunk.Content

			// Take the first overlapSize bytes, then move back to the
			// previous character boundary for the same reason as above.
			endIdx := min(overlapSize, len(nextContent))
			for endIdx > 0 && endIdx < len(nextContent) && !isRuneStart(nextContent[endIdx]) {
				endIdx--
			}

			chunk.Content = chunk.Content + "\n" + nextContent[:endIdx]
			// Adjust end position to reflect the overlap
			chunk.EndByte += endIdx
			chunk.EndLine = nextChunk.StartLine + countLines(nextContent[:endIdx]) - 1
		}

		overlappedChunks = append(overlappedChunks, chunk)
	}

	return overlappedChunks
}

// countLines returns the number of lines in s: zero for the empty string,
// otherwise one more than the number of newline characters it contains.
func countLines(s string) int {
	if len(s) == 0 {
		return 0
	}
	return strings.Count(s, "\n") + 1
}

// collectNodeTypes walks node and all of its descendants and records every
// node type accepted by shouldIncludeNodeType as a key in nodeTypeSet.
// A nil node is ignored.
func collectNodeTypes(node *sitter.Node, nodeTypeSet map[string]bool) {
	if node == nil {
		return
	}

	// Explicit stack instead of recursion; the result is a set, so the visit
	// order does not matter.
	stack := []*sitter.Node{node}
	for len(stack) > 0 {
		top := len(stack) - 1
		current := stack[top]
		stack = stack[:top]

		if kind := current.Type(); shouldIncludeNodeType(kind) {
			nodeTypeSet[kind] = true
		}

		for idx, total := 0, int(current.ChildCount()); idx < total; idx++ {
			if kid := current.Child(idx); kid != nil {
				stack = append(stack, kid)
			}
		}
	}
}

// shouldIncludeNodeType reports whether nodeType belongs in a chunk's
// NodeTypes list. A single newline, tab or space is rejected, as is any type
// made up solely of punctuation/operator characters (including the empty
// string); everything else is accepted.
func shouldIncludeNodeType(nodeType string) bool {
	switch nodeType {
	case "\n", "\t", " ":
		// Pure whitespace carries no structural information.
		return false
	}

	// Accept the type as soon as one character is something other than
	// punctuation or an operator.
	return strings.ContainsFunc(nodeType, func(r rune) bool {
		return !isPunctuationOrOperator(r)
	})
}

// isPunctuationOrOperator reports whether r is one of the ASCII punctuation
// or operator characters listed in the set below.
func isPunctuationOrOperator(r rune) bool {
	const symbols = "{}()[]<>;:,.=*&|!~^%+-/?#@$\\\"'`"
	return strings.IndexRune(symbols, r) >= 0
}

// chunkGeneric implements a simple line-based chunking algorithm for
// unsupported languages. It is used as a fallback when tree-sitter parsing is
// not available or fails.
//
// The code is split on "\n" and consecutive lines are accumulated while their
// combined token count stays within cfg.maxSize. A single line that exceeds
// the limit still forms its own chunk. Every chunk carries the node type
// "generic", 1-based inclusive line numbers, and the byte range of its
// content within code (StartByte inclusive, EndByte exclusive). Overlap is
// applied afterwards when cfg.overlap is greater than zero. Errors from the
// token counter are wrapped with ErrNodeSize.
func (c *castChunker) chunkGeneric(code string, cfg *config) ([]Chunk, error) {
	lines := strings.Split(code, "\n")

	var (
		chunks       []Chunk
		currentLines []string
		currentSize  int
	)
	currentStartLine := 1
	currentStartByte := 0

	// flush emits the accumulated lines as one chunk and positions the
	// bookkeeping at the line that follows it.
	flush := func() {
		content := strings.Join(currentLines, "\n")
		// Lines are rejoined with the separator they were split on, so the
		// content is a contiguous slice of code starting at currentStartByte.
		endByte := currentStartByte + len(content)

		chunks = append(chunks, Chunk{
			Content:   content,
			StartLine: currentStartLine,
			EndLine:   currentStartLine + len(currentLines) - 1,
			StartByte: currentStartByte,
			EndByte:   endByte,
			NodeTypes: []string{"generic"},
			Language:  cfg.language,
		})

		currentStartLine += len(currentLines)
		// Skip the newline that separates this chunk from the next line.
		currentStartByte = endByte + 1
		currentLines = nil
		currentSize = 0
	}

	for _, line := range lines {
		lineSize, err := cfg.tokenCounter.CountTokens(line)
		if err != nil {
			return nil, fmt.Errorf("%w: %w", ErrNodeSize, err)
		}

		// If adding this line would exceed max size and we have content
		if len(currentLines) > 0 && currentSize+lineSize > cfg.maxSize {
			flush()
		}

		currentLines = append(currentLines, line)
		currentSize += lineSize
	}

	// Don't forget the last chunk
	if len(currentLines) > 0 {
		flush()
	}

	// Apply overlap if configured
	if cfg.overlap > 0 {
		chunks = c.applyOverlap(chunks, cfg.overlap)
	}

	return chunks, nil
}