Skip to content

Commit 3525612

Browse files
reakaleek and claude authored
Sitemap: Add protocol limit validation (#3123)
* Sitemap: Add protocol limit validation (50k entries, 50 MB file size)

  Throw InvalidOperationException when sitemap.xml exceeds the sitemap protocol limits of 50,000 URLs or 50 MB file size. Emit warnings at 40,000 entries and 40 MB to provide early notice before hitting the wall.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Sitemap: Buffer XML in memory before writing to avoid persisting oversized files

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 73eae13 commit 3525612

4 files changed

Lines changed: 115 additions & 6 deletions

File tree

src/services/Elastic.Documentation.Assembler/Building/AssemblerBuildService.cs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,20 @@ Cancel ctx
138138
.Distinct();
139139
var now = DateTimeOffset.UtcNow;
140140
var entries = urls.ToDictionary(u => u, _ => now);
141-
SitemapBuilder.Generate(entries, assembleContext.WriteFileSystem, assembleContext.OutputWithPathPrefixDirectory);
141+
142+
if (entries.Count >= SitemapBuilder.WarningEntryThreshold)
143+
collector.EmitGlobalWarning(
144+
$"Sitemap has {entries.Count:N0} entries, approaching the {SitemapBuilder.MaxEntries:N0} URL protocol limit. " +
145+
"Consider implementing sitemap index files."
146+
);
147+
148+
var sitemapResult = SitemapBuilder.Generate(entries, assembleContext.WriteFileSystem, assembleContext.OutputWithPathPrefixDirectory);
149+
150+
if (sitemapResult.FileSizeBytes >= SitemapBuilder.WarningFileSizeBytes)
151+
collector.EmitGlobalWarning(
152+
$"Sitemap file size is {sitemapResult.FileSizeBytes / (1024.0 * 1024.0):F1} MB, approaching the 50 MB protocol limit. " +
153+
"Consider implementing sitemap index files."
154+
);
142155
}
143156

144157
if (exporters.Contains(Exporter.LLMText))

src/services/Elastic.Documentation.Assembler/Building/AssemblerSitemapService.cs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,19 @@ public async Task<bool> GenerateSitemapAsync(
9696
return false;
9797
}
9898

99-
SitemapBuilder.Generate(entries, assembleContext.WriteFileSystem, assembleContext.OutputWithPathPrefixDirectory);
99+
if (entries.Count >= SitemapBuilder.WarningEntryThreshold)
100+
collector.EmitGlobalWarning(
101+
$"Sitemap has {entries.Count:N0} entries, approaching the {SitemapBuilder.MaxEntries:N0} URL protocol limit. " +
102+
"Consider implementing sitemap index files."
103+
);
104+
105+
var result = SitemapBuilder.Generate(entries, assembleContext.WriteFileSystem, assembleContext.OutputWithPathPrefixDirectory);
106+
107+
if (result.FileSizeBytes >= SitemapBuilder.WarningFileSizeBytes)
108+
collector.EmitGlobalWarning(
109+
$"Sitemap file size is {result.FileSizeBytes / (1024.0 * 1024.0):F1} MB, approaching the 50 MB protocol limit. " +
110+
"Consider implementing sitemap index files."
111+
);
100112

101113
_logger.LogInformation("Sitemap written to {Path}", assembleContext.OutputWithPathPrefixDirectory.FullName);
102114
return true;

src/services/Elastic.Documentation.Assembler/Building/SitemapBuilder.cs

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,34 @@
1212

1313
namespace Elastic.Documentation.Assembler.Building;
1414

15+
/// <summary>Outcome of a <c>SitemapBuilder.Generate</c> call.</summary>
/// <param name="EntryCount">Number of URLs counted after the <c>/docs/api/</c> exclusion filter has been applied.</param>
/// <param name="FileSizeBytes">Length in bytes of the serialized sitemap XML, measured on the in-memory buffer before it is copied to <c>sitemap.xml</c>.</param>
public record SitemapResult(int EntryCount, long FileSizeBytes);
16+
1517
public static class SitemapBuilder
1618
{
19+
public const int MaxEntries = 50_000;
20+
public const int WarningEntryThreshold = 40_000;
21+
public const long MaxFileSizeBytes = 50L * 1024 * 1024;
22+
public const long WarningFileSizeBytes = 40L * 1024 * 1024;
23+
1724
private static readonly Uri BaseUri = new("https://www.elastic.co");
1825

1926
/// <summary>Generates sitemap.xml with per-URL last_updated dates.</summary>
20-
public static void Generate(
27+
public static SitemapResult Generate(
2128
IReadOnlyDictionary<string, DateTimeOffset> entries,
2229
IFileSystem fileSystem,
2330
IDirectoryInfo outputFolder
2431
)
2532
{
2633
// TODO: Remove this exclusion when API docs are ready for sitemap inclusion
2734
var filtered = entries
28-
.Where(e => !e.Key.StartsWith("/docs/api/", StringComparison.Ordinal));
35+
.Where(e => !e.Key.StartsWith("/docs/api/", StringComparison.Ordinal))
36+
.ToList();
37+
38+
if (filtered.Count > MaxEntries)
39+
throw new InvalidOperationException(
40+
$"Sitemap contains {filtered.Count:N0} URLs, which exceeds the sitemap protocol limit of {MaxEntries:N0}. " +
41+
"Consider implementing sitemap index files to split entries across multiple sitemaps."
42+
);
2943

3044
var doc = new XDocument
3145
{
@@ -47,11 +61,25 @@ IDirectoryInfo outputFolder
4761

4862
doc.Add(root);
4963

64+
using var buffer = new MemoryStream();
65+
doc.Save(buffer);
66+
67+
var fileSize = buffer.Length;
68+
if (fileSize > MaxFileSizeBytes)
69+
throw new InvalidOperationException(
70+
$"Sitemap file size is {fileSize / (1024.0 * 1024.0):F1} MB, which exceeds the sitemap protocol limit of 50 MB. " +
71+
"Consider implementing sitemap index files to split entries across multiple sitemaps."
72+
);
73+
5074
if (!outputFolder.Exists)
5175
_ = fileSystem.Directory.CreateDirectory(outputFolder.FullName);
5276

53-
using var fileStream = fileSystem.File.Create(fileSystem.Path.Join(outputFolder.FullName, "sitemap.xml"));
54-
doc.Save(fileStream);
77+
var sitemapPath = fileSystem.Path.Join(outputFolder.FullName, "sitemap.xml");
78+
using var fileStream = fileSystem.File.Create(sitemapPath);
79+
buffer.Position = 0;
80+
buffer.CopyTo(fileStream);
81+
82+
return new SitemapResult(filtered.Count, fileSize);
5583
}
5684
}
5785

tests/Elastic.Documentation.Build.Tests/SitemapTests.cs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,62 @@ public void Generate_OrdersUrlsAlphabetically()
9494
locs.Should().BeInAscendingOrder();
9595
}
9696

97+
// Checks that Generate returns a result carrying the number of entries and a non-zero file size.
[Fact]
98+
public void Generate_ReturnsEntryCountAndFileSize()
99+
{
100+
// Arrange: two entries, neither under /docs/api/, so none are filtered out by Generate
101+
var fs = new MockFileSystem();
102+
var outputDir = fs.DirectoryInfo.New("/output");
103+
var now = DateTimeOffset.UtcNow;
104+
var entries = new Dictionary<string, DateTimeOffset>
105+
{
106+
["/docs/page-1"] = now,
107+
["/docs/page-2"] = now,
108+
};
109+
110+
// Act
111+
var result = SitemapBuilder.Generate(entries, fs, outputDir);
112+
113+
// Assert: both entries are counted and some bytes were serialized
114+
result.EntryCount.Should().Be(2);
115+
result.FileSizeBytes.Should().BeGreaterThan(0);
116+
}
117+
118+
// Checks that Generate throws InvalidOperationException when given one entry more than MaxEntries.
[Fact]
119+
public void Generate_ThrowsWhenEntryCountExceedsLimit()
120+
{
121+
// Arrange: MaxEntries + 1 distinct URLs, i.e. exactly one over the limit
122+
var fs = new MockFileSystem();
123+
var outputDir = fs.DirectoryInfo.New("/output");
124+
var now = DateTimeOffset.UtcNow;
125+
var entries = Enumerable.Range(0, SitemapBuilder.MaxEntries + 1)
126+
.ToDictionary(i => $"/docs/page-{i}", _ => now);
127+
128+
// Act: wrap the call in a delegate so the exception can be asserted on
129+
var act = () => SitemapBuilder.Generate(entries, fs, outputDir);
130+
131+
// Assert: the wildcard pattern matches the wording used in Generate's exception message
132+
act.Should().Throw<InvalidOperationException>()
133+
.WithMessage("*exceeds the sitemap protocol limit*");
134+
}
135+
136+
// Boundary check: exactly MaxEntries entries must be accepted, since Generate only rejects counts greater than MaxEntries.
[Fact]
137+
public void Generate_DoesNotThrowAtExactLimit()
138+
{
139+
// Arrange: exactly MaxEntries distinct URLs
140+
// NOTE(review): presumably the XML for these short URLs stays under MaxFileSizeBytes, so only the count limit is exercised; confirm
141+
var fs = new MockFileSystem();
142+
var outputDir = fs.DirectoryInfo.New("/output");
143+
var now = DateTimeOffset.UtcNow;
144+
var entries = Enumerable.Range(0, SitemapBuilder.MaxEntries)
145+
.ToDictionary(i => $"/docs/page-{i}", _ => now);
146+
147+
// Act
148+
var act = () => SitemapBuilder.Generate(entries, fs, outputDir);
149+
150+
// Assert
151+
act.Should().NotThrow();
152+
}
152+
97153
[Fact]
98154
public void Generate_ExcludesApiDocsFromSitemap()
99155
{

0 commit comments

Comments
 (0)