-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathMarkdownChunkingService.cs
More file actions
195 lines (173 loc) · 7.66 KB
/
MarkdownChunkingService.cs
File metadata and controls
195 lines (173 loc) · 7.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using Microsoft.SemanticKernel.Text;
namespace EssentialCSharp.Chat.Common.Services;
/// <summary>
/// Markdown chunking service using Semantic Kernel's TextChunker.
/// </summary>
/// <param name="logger">Logger used to report input validation failures.</param>
/// <param name="maxTokensPerChunk">Forwarded to <c>TextChunker.SplitMarkdownParagraphs</c> as <c>maxTokensPerParagraph</c>.</param>
/// <param name="overlapTokens">Forwarded to <c>TextChunker.SplitMarkdownParagraphs</c> as <c>overlapTokens</c>.</param>
public partial class MarkdownChunkingService(
    ILogger<MarkdownChunkingService> logger,
    int maxTokensPerChunk = 256,
    int overlapTokens = 25)
{
    private static readonly string[] _NewLineSeparators = ["\r\n", "\n", "\r"];
    private readonly int _MaxTokensPerChunk = maxTokensPerChunk;
    private readonly int _OverlapTokens = overlapTokens;

    /// <summary>
    /// Reads every file in <paramref name="directory"/> (top level only) that matches
    /// <paramref name="filePattern"/> and chunks each one with <see cref="ProcessSingleMarkdownFile"/>.
    /// </summary>
    /// <param name="directory">Directory to search; must exist.</param>
    /// <param name="filePattern">Search pattern passed to <see cref="DirectoryInfo.GetFiles(string, SearchOption)"/>.</param>
    /// <param name="cancellationToken">Token observed while reading the files.</param>
    /// <returns>One result per matching file, ordered by full path (ordinal).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="directory"/> or <paramref name="filePattern"/> is null.</exception>
    /// <exception cref="InvalidOperationException">The directory does not exist or no file matches the pattern.</exception>
    public async Task<List<FileChunkingResult>> ProcessMarkdownFilesAsync(
        DirectoryInfo directory,
        string filePattern,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(directory);
        ArgumentNullException.ThrowIfNull(filePattern);

        if (!directory.Exists)
        {
            logger.LogError("Error: Directory {DirectoryName} does not exist.", directory.FullName);
            throw new InvalidOperationException($"Error: Directory '{directory.FullName}' does not exist.");
        }

        FileInfo[] markdownFiles = directory.GetFiles(filePattern, SearchOption.TopDirectoryOnly);
        if (markdownFiles.Length == 0)
        {
            logger.LogError(
                "Error: No files matching pattern {FilePattern} found in {DirectoryName}.",
                filePattern,
                directory.FullName);
            throw new InvalidOperationException($"No files matching pattern '{filePattern}' found in '{directory.FullName}'");
        }

        // GetFiles does not guarantee any ordering; sort so the result order is
        // the same on every run and on every file system.
        Array.Sort(markdownFiles, static (left, right) => string.CompareOrdinal(left.FullName, right.FullName));

        Console.WriteLine($"Processing {markdownFiles.Length} markdown files...");

        int totalChunks = 0;
        var results = new List<FileChunkingResult>(markdownFiles.Length);
        foreach (FileInfo file in markdownFiles)
        {
            string[] fileContent = await File.ReadAllLinesAsync(file.FullName, cancellationToken);
            FileChunkingResult result = ProcessSingleMarkdownFile(fileContent, file.Name, file.FullName);
            results.Add(result);
            totalChunks += result.ChunkCount;
        }

        Console.WriteLine($"Processed {markdownFiles.Length} markdown files with a total of {totalChunks} chunks.");
        return results;
    }

    /// <summary>
    /// Chunks the lines of a single markdown file: normalizes whitespace, splits the text into
    /// heading-scoped sections and runs Semantic Kernel's SplitMarkdownParagraphs on each section.
    /// </summary>
    /// <param name="fileContent">The file's lines, in order.</param>
    /// <param name="fileName">File name recorded on the result.</param>
    /// <param name="filePath">File path recorded on the result.</param>
    /// <returns>
    /// The chunks of every section together with summary counts. <c>OriginalCharCount</c> is the
    /// length of the normalized text, not of the raw file.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="fileContent"/> is null.</exception>
    public FileChunkingResult ProcessSingleMarkdownFile(
        string[] fileContent, string fileName, string filePath)
    {
        ArgumentNullException.ThrowIfNull(fileContent);

        // Trim every line and collapse runs of blank lines to a single blank line. Single blank
        // lines must be preserved because TextChunker.SplitMarkdownParagraphs uses them as
        // paragraph separators — stripping all blanks defeats paragraph-aware chunking.
        // NOTE(review): Trim() also removes leading indentation inside code listings — confirm
        // that this is intended.
        var normalizedLines = new List<string>(fileContent.Length);
        bool previousWasBlank = false;
        foreach (string raw in fileContent)
        {
            string line = raw.Trim();
            bool isBlank = string.IsNullOrWhiteSpace(line);
            if (!isBlank || !previousWasBlank)
            {
                normalizedLines.Add(line);
            }

            previousWasBlank = isBlank;
        }

        string content = string.Join(Environment.NewLine, normalizedLines);
        var sections = MarkdownContentToHeadersAndSection(content);

        var allChunks = new List<MarkdownChunk>();
        int totalChunkCharacters = 0;
        int chunkCount = 0;
        foreach (var (Header, Content) in sections)
        {
#pragma warning disable SKEXP0050
            var chunks = TextChunker.SplitMarkdownParagraphs(
                lines: Content,
                maxTokensPerParagraph: _MaxTokensPerChunk,
                overlapTokens: _OverlapTokens,
                chunkHeader: Header + " - "
            );
#pragma warning restore SKEXP0050
            allChunks.AddRange(chunks.Select(c => new MarkdownChunk(Header, c)));
            chunkCount += chunks.Count;
            totalChunkCharacters += chunks.Sum(c => c.Length);
        }

        return new FileChunkingResult
        {
            FileName = fileName,
            FilePath = filePath,
            OriginalCharCount = content.Length,
            ChunkCount = chunkCount,
            Chunks = allChunks,
            TotalChunkCharacters = totalChunkCharacters
        };
    }

    /// <summary>
    /// Convert markdown content into a list of headers and their associated content sections.
    /// </summary>
    /// <remarks>
    /// <para>
    /// Each section header is the chain of enclosing heading texts joined with ": ". Text that
    /// precedes the first heading is discarded, and a heading without any non-blank content
    /// produces no section.
    /// </para>
    /// <para>
    /// A heading of the form "Listing N.M" or "Listing N.M: title" does not start a section of its
    /// own when at least one section already exists: its non-blank lines are appended to the most
    /// recently added section instead.
    /// </para>
    /// <para>
    /// Lines inside a fenced code block (``` or ~~~) are never treated as headings, so a code
    /// comment such as "# note" stays part of the surrounding section.
    /// </para>
    /// </remarks>
    /// <param name="content">The markdown text; lines may be separated by CRLF, LF or CR.</param>
    /// <returns>The sections in document order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="content"/> is null.</exception>
    public static List<(string Header, List<string> Content)> MarkdownContentToHeadersAndSection(string content)
    {
        ArgumentNullException.ThrowIfNull(content);

        string[] lines = content.Split(_NewLineSeparators, StringSplitOptions.None);
        bool[] isHeading = FindHeadingLines(lines);

        var sections = new List<(string Header, List<string> Content)>();
        var headerStack = new List<(int Level, string Text)>();
        Regex headingRegex = HeadingRegex();
        Regex listingRegex = ListingRegex();

        int i = 0;
        while (i < lines.Length)
        {
            // Advance to the next heading; anything before it belongs to no section.
            while (i < lines.Length && !isHeading[i])
            {
                i++;
            }

            if (i >= lines.Length)
            {
                break;
            }

            Match match = headingRegex.Match(lines[i]);
            int level = match.Groups[1].Value.Length;
            string headerText = match.Groups[2].Value.Trim();
            i++; // step past the heading line itself

            bool isListing = headerText.StartsWith("Listing", StringComparison.OrdinalIgnoreCase)
                && listingRegex.IsMatch(headerText);

            // A listing heading contributes its non-blank lines to the previous section.
            // NOTE(review): "previous section" is the last section that was added, which is not
            // the nearest heading when that heading had no content of its own.
            if (isListing && sections.Count > 0)
            {
                List<string> previousContent = sections[^1].Content;
                while (i < lines.Length && !isHeading[i])
                {
                    if (!string.IsNullOrWhiteSpace(lines[i]))
                    {
                        previousContent.Add(lines[i]);
                    }

                    i++;
                }

                continue;
            }

            // Drop headings at the same or a deeper level, then push the new heading so the
            // stack always holds the chain of enclosing headings.
            while (headerStack.Count > 0 && headerStack[^1].Level >= level)
            {
                headerStack.RemoveAt(headerStack.Count - 1);
            }

            headerStack.Add((level, headerText));

            // Collect content until the next heading, preserving internal blank lines as
            // paragraph separators for TextChunker.SplitMarkdownParagraphs.
            int start = i;
            while (i < lines.Length && !isHeading[i])
            {
                i++;
            }

            int end = i;

            // Strip leading and trailing blank lines only.
            while (start < end && string.IsNullOrWhiteSpace(lines[start]))
            {
                start++;
            }

            while (end > start && string.IsNullOrWhiteSpace(lines[end - 1]))
            {
                end--;
            }

            if (start < end)
            {
                string fullHeader = string.Join(": ", headerStack.Select(h => h.Text));
                sections.Add((fullHeader, new List<string>(lines[start..end])));
            }
        }

        return sections;
    }

    /// <summary>
    /// Flags the lines that are headings, ignoring heading-like lines inside fenced code blocks.
    /// </summary>
    private static bool[] FindHeadingLines(string[] lines)
    {
        var isHeading = new bool[lines.Length];
        Regex headingRegex = HeadingRegex();
        char openFenceChar = '\0';
        int openFenceLength = 0; // 0 means "not inside a fenced code block"

        for (int index = 0; index < lines.Length; index++)
        {
            if (TryParseFence(lines[index], out char fenceChar, out int fenceLength, out bool hasInfoString))
            {
                if (openFenceLength == 0)
                {
                    openFenceChar = fenceChar;
                    openFenceLength = fenceLength;
                }
                else if (fenceChar == openFenceChar && fenceLength >= openFenceLength && !hasInfoString)
                {
                    // Only a bare fence of the same kind, at least as long as the opener, closes the block.
                    openFenceLength = 0;
                }

                continue; // a fence line starts with ` or ~ and so can never be a heading
            }

            isHeading[index] = openFenceLength == 0 && headingRegex.IsMatch(lines[index]);
        }

        return isHeading;
    }

    /// <summary>
    /// Recognizes a code-fence line: optional leading whitespace followed by three or more
    /// backticks or tildes.
    /// </summary>
    private static bool TryParseFence(string line, out char fenceChar, out int fenceLength, out bool hasInfoString)
    {
        fenceChar = '\0';
        fenceLength = 0;
        hasInfoString = false;

        string text = line.TrimStart();
        if (text.Length < 3 || (text[0] != '`' && text[0] != '~'))
        {
            return false;
        }

        int run = 1;
        while (run < text.Length && text[run] == text[0])
        {
            run++;
        }

        string remainder = text[run..];

        // A backtick run followed by more backticks on the same line is inline code, not a fence.
        if (run < 3 || (text[0] == '`' && remainder.Contains('`')))
        {
            return false;
        }

        fenceChar = text[0];
        fenceLength = run;
        hasInfoString = !string.IsNullOrWhiteSpace(remainder);
        return true;
    }

    [GeneratedRegex(@"^Listing \d+\.\d+(:.*)?$")]
    private static partial Regex ListingRegex();

    [GeneratedRegex(@"^(#{1,6}) +(.+)$")]
    private static partial Regex HeadingRegex();
}