-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathChunkingResultExtensions.cs
More file actions
62 lines (53 loc) · 2.19 KB
/
ChunkingResultExtensions.cs
File metadata and controls
62 lines (53 loc) · 2.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
using System.Security.Cryptography;
using System.Text;
using System.Linq;
using EssentialCSharp.Chat.Common.Models;
namespace EssentialCSharp.Chat.Common.Services;
/// <summary>
/// Extension methods that convert a <see cref="FileChunkingResult"/> into
/// <see cref="BookContentChunk"/> records.
/// </summary>
public static partial class ChunkingResultExtensions
{
    /// <summary>
    /// Creates one <see cref="BookContentChunk"/> for every chunk of text in <paramref name="result"/>.
    /// </summary>
    /// <param name="result">The chunking result whose file name and chunks are converted.</param>
    /// <returns>
    /// A list with one entry per chunk, in the original order. Each entry gets a freshly generated
    /// GUID as its id, the source file name, the heading taken from the chunk's first line, the
    /// chapter number parsed from the file name, and the SHA-256 hash of the chunk text.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="result"/> is <see langword="null"/>.</exception>
    /// <exception cref="InvalidOperationException">
    /// The file name does not contain <c>Chapter</c> followed by a two-digit chapter number.
    /// </exception>
    public static List<BookContentChunk> ToBookContentChunks(this FileChunkingResult result)
    {
        ArgumentNullException.ThrowIfNull(result);

        // The chapter number depends only on the file name, so resolve it once for all chunks.
        int? chapterNumber = ExtractChapterNumber(result.FileName);

        return result.Chunks
            .Select(chunkText => new BookContentChunk
            {
                Id = Guid.NewGuid().ToString(),
                FileName = result.FileName,
                Heading = ExtractHeading(chunkText),
                ChunkText = chunkText,
                ChapterNumber = chapterNumber,
                ContentHash = ComputeSha256Hash(chunkText)
            })
            .ToList();
    }

    /// <summary>
    /// Returns the heading of a chunk: the text of its first line up to (not including) the first
    /// <c>" - "</c> separator, with surrounding white space trimmed.
    /// </summary>
    /// <param name="chunkText">The chunk text to read the heading from.</param>
    /// <returns>The trimmed heading; an empty string when the first line is empty or white space.</returns>
    private static string ExtractHeading(string chunkText)
    {
        // The first line ends at the first carriage return or line feed, whichever comes first.
        int lineEnd = chunkText.IndexOfAny(['\r', '\n']);
        string firstLine = lineEnd < 0 ? chunkText : chunkText[..lineEnd];

        // Ordinal search matches the comparison string.Split performs for string separators.
        int separatorIndex = firstLine.IndexOf(" - ", StringComparison.Ordinal);
        string heading = separatorIndex < 0 ? firstLine : firstLine[..separatorIndex];

        return heading.Trim();
    }

    /// <summary>
    /// Parses the chapter number out of a file name, e.g. <c>"Chapter01.md"</c> yields <c>1</c>.
    /// </summary>
    /// <param name="fileName">The file name to inspect.</param>
    /// <returns>The chapter number found in <paramref name="fileName"/>.</returns>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="fileName"/> has no <c>Chapter</c> marker followed by two digits that parse as an integer.
    /// </exception>
    private static int ExtractChapterNumber(string fileName)
    {
        // Pattern: Chapter(?<ChapterNumber>\d{2})
        // Only the first match is considered. The captured text is parsed as plain digits with the
        // invariant culture; a capture that does not parse falls through to the exception below.
        var match = ChapterNumberRegex().Match(fileName);
        if (match.Success
            && int.TryParse(
                match.Groups["ChapterNumber"].Value,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out int chapterNumber))
        {
            return chapterNumber;
        }

        throw new InvalidOperationException($"File name '{fileName}' does not contain a valid chapter number in the expected format.");
    }

    /// <summary>
    /// Computes the SHA-256 hash of the UTF-8 encoding of <paramref name="text"/>.
    /// </summary>
    /// <param name="text">The text to hash.</param>
    /// <returns>The hash as a lowercase hexadecimal string.</returns>
    private static string ComputeSha256Hash(string text)
    {
        byte[] hash = SHA256.HashData(Encoding.UTF8.GetBytes(text));
        return Convert.ToHexStringLower(hash);
    }

    /// <summary>
    /// Source-generated regex that matches <c>Chapter</c> followed by two digits and captures the
    /// digits in the <c>ChapterNumber</c> group.
    /// </summary>
    [System.Text.RegularExpressions.GeneratedRegex(@"Chapter(?<ChapterNumber>\d{2})")]
    private static partial System.Text.RegularExpressions.Regex ChapterNumberRegex();
}