diff --git a/README.rst b/README.rst index 34c46e0..cd9c113 100644 --- a/README.rst +++ b/README.rst @@ -64,6 +64,7 @@ Netconan can anonymize *many types of sensitive information*: * IPv4 and IPv6 addresses (``--anonymize-ips``, ``-a``). * User-specified sensitive words (``--sensitive-words``, ``-w``). *Note that any occurrence of a specified sensitive word will be replaced regardless of context, even if it is part of a larger string.* * User-specified AS numbers (``--as-numbers``, ``-n``). *Note that any number matching a specified AS number will be anonymized.* +* Description fields (``--anonymize-descriptions``). *Description text is replaced with a deterministic hash, preserving surrounding quotes and semicolons.* Netconan attempts to *preserve useful structure*. For example, diff --git a/netconan/anonymize_files.py b/netconan/anonymize_files.py index 4a0334a..8d78ce4 100644 --- a/netconan/anonymize_files.py +++ b/netconan/anonymize_files.py @@ -18,12 +18,17 @@ import logging import os import random +import re import string import sys from collections.abc import Sequence from typing import IO from .default_reserved_words import default_reserved_words +from .description_anonymization import ( + generate_description_regexes, + replace_descriptions, +) from .ip_anonymization import IpAnonymizer, IpV6Anonymizer, anonymize_ip_addr from .sensitive_item_removal import ( AsNumberAnonymizer, @@ -54,6 +59,7 @@ def __init__( preserve_networks: Sequence[str] | None = None, preserve_suffix_v4: int | None = None, preserve_suffix_v6: int | None = None, + anon_descriptions: bool = False, ) -> None: """Creates anonymizer classes.""" self.undo_ip_anon = undo_ip_anon @@ -64,6 +70,8 @@ def __init__( self.anonymizer_sensitive_word: SensitiveWordAnonymizer | None = None self.compiled_regexes: list[list[CompiledRegexRule]] | None = None self.pwd_lookup: dict[str, str] | None = None + self.description_regexes: list[re.Pattern[str]] | None = None + self.description_lookup: dict[str, 
str] | None = None # The salt is only used for IP and sensitive word anonymization if salt is None: @@ -74,6 +82,10 @@ def __init__( self.salt: str = salt logging.debug('Using salt: "%s"', self.salt) + if anon_descriptions: + self.description_regexes = generate_description_regexes() + self.description_lookup = {} + if anon_pwd: self.compiled_regexes = generate_default_sensitive_item_regexes() self.pwd_lookup = {} @@ -122,6 +134,17 @@ def anonymize_io(self, in_io: IO[str], out_io: IO[str]) -> None: if self.anonymizer_sensitive_word is not None: output_line = self.anonymizer_sensitive_word.anonymize(output_line) + if ( + self.description_regexes is not None + and self.description_lookup is not None + ): + output_line = replace_descriptions( + self.description_regexes, + output_line, + self.description_lookup, + self.salt, + ) + if self.anonymizer_as_num is not None: output_line = anonymize_as_numbers(self.anonymizer_as_num, output_line) @@ -146,6 +169,7 @@ def anonymize_files( preserve_networks: Sequence[str] | None = None, preserve_suffix_v4: int | None = None, preserve_suffix_v6: int | None = None, + anon_descriptions: bool = False, ) -> None: """Anonymize each file in input and save to output.""" use_stdin = input_path == "-" @@ -196,6 +220,7 @@ def anonymize_files( salt=salt, sensitive_words=sensitive_words, undo_ip_anon=undo_ip_anon, + anon_descriptions=anon_descriptions, ) for in_path, out_path in file_list: diff --git a/netconan/description_anonymization.py b/netconan/description_anonymization.py new file mode 100644 index 0000000..feafa1d --- /dev/null +++ b/netconan/description_anonymization.py @@ -0,0 +1,65 @@ +"""Anonymize description fields in network configuration files.""" + +import hashlib +import re
+
+# Matches: description "some text here"
+_DESCRIPTION_QUOTED_REGEX = re.compile(
+    r'(?P<pre>description\s+")(?P<desc>[^"]+)(?P<post>")'
+)
+
+# Matches: description some text here; `post` keeps any spaces, ';' and newline
+_DESCRIPTION_UNQUOTED_REGEX = re.compile(
+    r"(?P<pre>description\s+)(?P<desc>[^\";\s].*?)(?P<post>\s*;?\s*)$"
+)
+
+
+def anonymize_description(value: str, lookup: dict[str, str], salt: str) -> str:
+    """Map a description value to its stable ``descr_``-prefixed token.
+
+    The token is ``descr_`` plus the lowercased base32 text of the first five
+    bytes of SHA-256(salt + value); each result is memoized in ``lookup``.
+    """
+    try:
+        return lookup[value]
+    except KeyError:
+        pass  # first sighting of this value: derive and memoize its token
+    # Five digest bytes encode to exactly eight base32 characters.
+    digest_head = hashlib.sha256((salt + value).encode("utf-8")).digest()[:5]
+    lookup[value] = token = "descr_" + _base32_encode(digest_head).lower()
+    return token
+
+
+def _base32_encode(data: bytes) -> str:
+    """Encode ``data`` as base32 text with the trailing ``=`` padding removed."""
+    from base64 import b32encode
+
+    return str(b32encode(data), "ascii").rstrip("=")
+
+
+def generate_description_regexes() -> list[re.Pattern[str]]:
+    """Provide the description patterns in match order: quoted, then unquoted."""
+    return list((_DESCRIPTION_QUOTED_REGEX, _DESCRIPTION_UNQUOTED_REGEX))
+
+
+def replace_descriptions(
+    regexes: list[re.Pattern[str]], line: str, lookup: dict[str, str], salt: str
+) -> str:
+    """Anonymize the description on ``line`` via the first regex that matches.
+
+    Only the ``desc`` group is swapped for its token; the text before the
+    match, the ``pre`` and ``post`` groups, and the text after the match are
+    kept. A line that no regex matches is returned unchanged.
+    """
+    hit = next((m for m in (rx.search(line) for rx in regexes) if m), None)
+    if hit is None:
+        return line
+    token = anonymize_description(hit.group("desc"), lookup, salt)
+    pieces = (
+        line[: hit.start()],
+        hit.group("pre"),
+        token,
+        hit.group("post"),
+        line[hit.end() :],
+    )
+    return "".join(pieces)
diff --git a/netconan/netconan.py b/netconan/netconan.py
index 0ec9bb8..3862c82 100644
--- a/netconan/netconan.py
+++ b/netconan/netconan.py
@@ -153,6 +153,12 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
         default=8,
         help="Preserve the trailing bits of IP addresses, aka the host bits of a network. Set this value large enough to represent the largest interface network (e.g., 8 for a /24 or 12 for a /20) or NAT pool.",
     )
+    parser.add_argument(
+        "--anonymize-descriptions",
+        action="store_true",
+        default=False,
+        help="Anonymize description fields with deterministic hashed replacements",
+    )
     result: argparse.Namespace = parser.parse_args(argv)
     return result
 
@@ -220,6 +226,7 @@ def main(argv: list[str] = sys.argv[1:]) -> None:
             args.anonymize_passwords,
             args.anonymize_ips,
             args.undo,
+            args.anonymize_descriptions,
         ]
     ):
         logging.warning(
@@ -239,6 +246,7 @@ def main(argv: list[str] = sys.argv[1:]) -> None:
             reserved_words,
             preserve_prefixes,
             preserve_addresses,
+            anon_descriptions=args.anonymize_descriptions,
             preserve_suffix_v4=args.preserve_host_bits,
             preserve_suffix_v6=args.preserve_host_bits,
         )
diff --git a/tests/end_to_end/test_e2e_descriptions.py b/tests/end_to_end/test_e2e_descriptions.py
new file mode 100644
index 0000000..4d255be
--- /dev/null
+++ b/tests/end_to_end/test_e2e_descriptions.py
@@ -0,0 +1,70 @@
+"""End-to-end tests for description anonymization."""
+
+from netconan.netconan import main
+
+INPUT_CONTENTS = """\
+interface GigabitEthernet0/0
+ description "uplink to core-router1 (port 14)"
+ ip address 10.0.0.1 255.255.255.0
+!
+interface GigabitEthernet0/1
+ description link-to-provider;
+ ip address 10.0.0.2 255.255.255.0
+!
+"""  # fixture: one quoted description and one unquoted with a trailing ';'
+
+
+def test_e2e_descriptions(tmpdir):
+    """Run the CLI end to end with --anonymize-descriptions enabled.
+
+    Both original description texts must vanish from the written file, a
+    ``descr_`` token must appear, and the sampled interface/address lines
+    must still be present.
+    """
+    cfg_name = "test.cfg"
+    src_dir = tmpdir.mkdir("input")
+    src_dir.join(cfg_name).write(INPUT_CONTENTS)
+
+    dst_dir = tmpdir.mkdir("output")
+    cli_args = ["-i", str(src_dir), "-o", str(dst_dir), "-s", "E2ESALT"]
+    main(cli_args + ["--anonymize-descriptions"])
+
+    with open(str(dst_dir.join(cfg_name))) as anonymized_file:
+        anonymized = anonymized_file.read()
+
+    # Neither original description may survive; a token must appear
+    for leaked in ("uplink to core-router1 (port 14)", "link-to-provider"):
+        assert leaked not in anonymized
+    assert "descr_" in anonymized
+
+    # Lines that are not descriptions must still be present
+    for kept in (
+        "interface GigabitEthernet0/0",
+        "ip address 10.0.0.1 255.255.255.0",
+        "interface GigabitEthernet0/1",
+    ):
+        assert kept in anonymized
+
+
+def test_e2e_descriptions_deterministic(tmpdir):
+    """Two CLI runs over the same input with one salt write identical files."""
+    cfg_name = "test.cfg"
+    shared_flags = ["-s", "DETSALT", "--anonymize-descriptions"]
+
+    # Prepare both input/output directory pairs before running anything
+    run_dirs = []
+    for suffix in ("1", "2"):
+        src_dir = tmpdir.mkdir("input" + suffix)
+        src_dir.join(cfg_name).write(INPUT_CONTENTS)
+        run_dirs.append((src_dir, tmpdir.mkdir("output" + suffix)))
+
+    for src_dir, dst_dir in run_dirs:
+        main(shared_flags + ["-i", str(src_dir), "-o", str(dst_dir)])
+
+    contents = []
+    for _, dst_dir in run_dirs:
+        with open(str(dst_dir.join(cfg_name))) as written:
+            contents.append(written.read())
+
+    first_run, second_run = contents
+    assert first_run == second_run
diff --git a/tests/unit/test_description_anonymization.py b/tests/unit/test_description_anonymization.py
new file mode 100644
index 0000000..2f1b9fd
--- /dev/null
+++ b/tests/unit/test_description_anonymization.py
@@ -0,0 +1,204 @@
+"""Tests for description anonymization module."""
+
+import pytest
+
+from netconan.description_anonymization import (
+    anonymize_description,
+    generate_description_regexes,
+    replace_descriptions,
+)
+
+
+class TestAnonymizeDescription:
+    """Unit checks for ``anonymize_description`` token generation."""
+
+    def test_deterministic(self):
+        """Two fresh lookups with one salt yield the same token."""
+        first_cache = {}
+        second_cache = {}
+        first = anonymize_description("server.example.net", first_cache, "salt1")
+        second = anonymize_description("server.example.net", second_cache, "salt1")
+        assert first == second
+
+    def test_format_prefix(self):
+        """The token begins with the literal 'descr_' marker."""
+        marker = "descr_"
+        token = anonymize_description("test value", {}, "salt1")
+        assert token.startswith(marker)
+
+    def test_format_length(self):
+        """Exactly eight characters follow the prefix."""
+        marker = "descr_"
+        token = anonymize_description("test value", {}, "salt1")
+        tail = token[len(marker) :]
+        assert len(tail) == 8
+
+    def test_format_lowercase_alphanumeric(self):
+        """The part after the prefix is lowercase and alphanumeric (base32)."""
+        marker = "descr_"
+        token = anonymize_description("test value", {}, "salt1")
+        tail = token[len(marker) :]
+        assert tail.lower() == tail
+        assert tail.isalnum()
+
+    def test_caching_in_lookup(self):
+        """A computed token is stored in the caller's lookup under its value."""
+        cache = {}
+        token = anonymize_description("cached value", cache, "salt1")
+        assert "cached value" in cache
+        assert cache["cached value"] == token
+
+    def test_different_salt_different_result(self):
+        """Changing only the salt changes the token."""
+        with_salt_a = anonymize_description("same value", {}, "salt_a")
+        with_salt_b = anonymize_description("same value", {}, "salt_b")
+        assert with_salt_a != with_salt_b
+
+    def test_different_values_different_result(self):
+        """Changing only the description value changes the token."""
+        token_one = anonymize_description("value_one", {}, "salt1")
+        token_two = anonymize_description("value_two", {}, "salt1")
+        assert token_one != token_two
+
+
+class TestRegexMatching:
+    """Checks on what the quoted and unquoted description patterns capture."""
+
+    @pytest.fixture
+    def regexes(self):
+        """Supply the pattern list: index 0 is quoted, index 1 is unquoted."""
+        return generate_description_regexes()
+
+    @pytest.mark.parametrize(
+        ("line", "expected_desc"),
+        (
+            (
+                'description "server.example.net (port14)"',
+                "server.example.net (port14)",
+            ),
+            ('description "Core Router - Site A"', "Core Router - Site A"),
+            (' description "indented quoted"', "indented quoted"),
+        ),
+        ids=("quoted-basic", "quoted-spaces", "quoted-indented"),
+    )
+    def test_quoted_regex_matches(self, regexes, line, expected_desc):
+        """The quoted pattern's ``desc`` group holds the text between the quotes."""
+        quoted_regex = regexes[0]
+        found = quoted_regex.search(line)
+        assert found is not None and found.group("desc") == expected_desc
+
+    @pytest.mark.parametrize(
+        ("line", "expected_desc"),
+        (
+            ("description server.example.net", "server.example.net"),
+            ("description Link-to-upstream;", "Link-to-upstream"),
+            ("description multi word value", "multi word value"),
+            (" description indented-value", "indented-value"),
+            ("description trailing-semi ;", "trailing-semi"),
+        ),
+        ids=(
+            "unquoted-simple",
+            "unquoted-semicolon",
+            "unquoted-multiword",
+            "unquoted-indented",
+            "unquoted-space-before-semi",
+        ),
+    )
+    def test_unquoted_regex_matches(self, regexes, line, expected_desc):
+        """The unquoted pattern's ``desc`` group excludes the trailing semicolon."""
+        unquoted_regex = regexes[1]
+        found = unquoted_regex.search(line)
+        assert found is not None and found.group("desc") == expected_desc
+
+    @pytest.mark.parametrize(
+        "line",
+        (
+            "hostname router1",
+            "interface GigabitEthernet0/0",
+            "ip address 10.0.0.1 255.255.255.0",
+            "set description-limit 100",
+        ),
+        ids=(
+            "hostname",
+            "interface",
+            "ip-address",
+            "set-description-limit",
+        ),
+    )
+    def test_no_false_positives(self, regexes, line):
+        """Neither pattern may match a line that carries no description."""
+        hits = [regex for regex in regexes if regex.search(line) is not None]
+        assert hits == []
+
+
+class TestReplaceDescriptions:
+    """Line-level checks for ``replace_descriptions``."""
+
+    @pytest.fixture
+    def regexes(self):
+        """Supply the compiled description patterns under test."""
+        return generate_description_regexes()
+
+    def test_quoted_replacement(self, regexes):
+        """A quoted description is swapped for a token inside the same quotes."""
+        source_line = 'description "server.example.net (port14)"'
+        anonymized = replace_descriptions(regexes, source_line, {}, "salt1")
+        starts_ok = anonymized.startswith('description "descr_')
+        ends_ok = anonymized.endswith('"')
+        assert starts_ok and ends_ok
+        assert "server.example.net" not in anonymized
+
+    def test_unquoted_replacement(self, regexes):
+        """An unquoted description is swapped for a bare token."""
+        source_line = "description server.example.net"
+        cache = {}
+        anonymized = replace_descriptions(regexes, source_line, cache, "salt1")
+        assert anonymized.startswith("description descr_")
+        assert "server.example.net" not in anonymized
+
+    def test_semicolon_preserved(self, regexes):
+        """The terminating semicolon of an unquoted description is kept."""
+        source_line = "description Link-to-upstream;"
+        cache = {}
+        anonymized = replace_descriptions(regexes, source_line, cache, "salt1")
+        assert anonymized.endswith(";")
+        assert "Link-to-upstream" not in anonymized
+
+    def test_quoted_semicolon_preserved(self, regexes):
+        """A semicolon after the closing quote is kept with the quote."""
+        source_line = 'description "Core Router - Site A";'
+        cache = {}
+        anonymized = replace_descriptions(regexes, source_line, cache, "salt1")
+        assert anonymized.endswith('";')
+        assert "Core Router" not in anonymized
+
+    def test_non_matching_unchanged(self, regexes):
+        """A line with no description comes back exactly as given."""
+        source_line = "ip address 10.0.0.1 255.255.255.0"
+        cache = {}
+        anonymized = replace_descriptions(regexes, source_line, cache, "salt1")
+        assert anonymized == source_line
+
+    def test_deterministic_replacement(self, regexes):
+        """Independent lookups give the same replacement for the same line."""
+        source_line = 'description "test value"'
+        outputs = {
+            replace_descriptions(regexes, source_line, fresh_cache, "salt1")
+            for fresh_cache in ({}, {})
+        }
+        assert len(outputs) == 1
+
+    def test_context_preserved(self, regexes):
+        """Indentation in front of the keyword survives the replacement."""
+        source_line = '  description "indented value"'
+        cache = {}
+        anonymized = replace_descriptions(regexes, source_line, cache, "salt1")
+        assert anonymized.startswith("  description ")
+
+    def test_set_style_description(self, regexes):
+        """A description at the end of a ``set ...`` command is anonymized too."""
+        source_line = "set interfaces ge-0/0/0 description upstream-link"
+        cache = {}
+        anonymized = replace_descriptions(regexes, source_line, cache, "salt1")
+        assert "descr_" in anonymized
+        assert "upstream-link" not in anonymized
diff --git a/tests/unit/test_file_anonymizer_descriptions.py b/tests/unit/test_file_anonymizer_descriptions.py
new file mode 100644
index 0000000..bde906a
--- /dev/null
+++ b/tests/unit/test_file_anonymizer_descriptions.py
@@ -0,0 +1,73 @@
+"""Tests for description anonymization integration in the FileAnonymizer pipeline."""
+
+import io
+
+from netconan.anonymize_files import FileAnonymizer
+
+
+class TestFileAnonymizerDescriptions:
+    """Checks that FileAnonymizer applies description anonymization to its IO."""
+
+    def _anonymize_line(
+        self, line, anon_descriptions=True, anon_pwd=False, salt="test"
+    ):
+        """Feed ``line`` through a new FileAnonymizer and return what it wrote."""
+        # IP anonymization is always off here; the other switches come from args
+        options = {
+            "anon_pwd": anon_pwd,
+            "anon_ip": False,
+            "salt": salt,
+            "anon_descriptions": anon_descriptions,
+        }
+        sink = io.StringIO()
+        FileAnonymizer(**options).anonymize_io(io.StringIO(line), sink)
+        return sink.getvalue()
+
+    def test_description_only(self):
+        """With the feature on, a description line gets a token."""
+        anonymized = self._anonymize_line('description "sensitive host"\n')
+        assert "descr_" in anonymized
+        assert "sensitive host" not in anonymized
+
+    def test_descriptions_disabled(self):
+        """With the feature off, a description line passes through verbatim."""
+        source_text = 'description "sensitive host"\n'
+        anonymized = self._anonymize_line(source_text, anon_descriptions=False)
+        assert anonymized == source_text
+
+    def test_description_with_passwords(self):
+        """Description and password anonymization both apply in one run."""
+        source_text = 'description "link to core"\npassword foobar\n'
+        switches = {"anon_descriptions": True, "anon_pwd": True}
+        anonymized = self._anonymize_line(source_text, **switches)
+        assert "descr_" in anonymized
+        assert "link to core" not in anonymized and "foobar" not in anonymized
+
+    def test_non_description_unchanged(self):
+        """A line without a description is written back unmodified."""
+        source_text = "ip address 10.0.0.1 255.255.255.0\n"
+        anonymized = self._anonymize_line(source_text)
+        assert anonymized == source_text
+
+    def test_deterministic_with_salt(self):
+        """Two runs sharing a salt write identical output."""
+        source_text = 'description "test value"\n'
+        first_run = self._anonymize_line(source_text, salt="mysalt")
+        second_run = self._anonymize_line(source_text, salt="mysalt")
+        assert first_run == second_run
+
+    def test_multiline_file(self):
+        """In a multi-line input only the description line is rewritten."""
+        source_rows = [
+            "interface GigabitEthernet0/0",
+            ' description "uplink to ISP"',
+            " ip address 10.0.0.1 255.255.255.0",
+            "!",
+        ]
+        source_text = "".join(row + "\n" for row in source_rows)
+        out_rows = self._anonymize_line(source_text).split("\n")
+        header, description, address, bang = out_rows[:4]
+        assert header == source_rows[0]
+        assert "descr_" in description and "uplink to ISP" not in description
+        assert address == source_rows[2]
+        assert bang == source_rows[3]