diff --git a/README.rst b/README.rst index 34c46e0..cd9c113 100644 --- a/README.rst +++ b/README.rst @@ -64,6 +64,7 @@ Netconan can anonymize *many types of sensitive information*: * IPv4 and IPv6 addresses (``--anonymize-ips``, ``-a``). * User-specified sensitive words (``--sensitive-words``, ``-w``). *Note that any occurrence of a specified sensitive word will be replaced regardless of context, even if it is part of a larger string.* * User-specified AS numbers (``--as-numbers``, ``-n``). *Note that any number matching a specified AS number will be anonymized.* +* Description fields (``--anonymize-descriptions``). *Description text is replaced with a deterministic hash, preserving surrounding quotes and semicolons.* Netconan attempts to *preserve useful structure*. For example, diff --git a/netconan/anonymize_files.py b/netconan/anonymize_files.py index 4a0334a..8d78ce4 100644 --- a/netconan/anonymize_files.py +++ b/netconan/anonymize_files.py @@ -18,12 +18,17 @@ import logging import os import random +import re import string import sys from collections.abc import Sequence from typing import IO from .default_reserved_words import default_reserved_words +from .description_anonymization import ( + generate_description_regexes, + replace_descriptions, +) from .ip_anonymization import IpAnonymizer, IpV6Anonymizer, anonymize_ip_addr from .sensitive_item_removal import ( AsNumberAnonymizer, @@ -54,6 +59,7 @@ def __init__( preserve_networks: Sequence[str] | None = None, preserve_suffix_v4: int | None = None, preserve_suffix_v6: int | None = None, + anon_descriptions: bool = False, ) -> None: """Creates anonymizer classes.""" self.undo_ip_anon = undo_ip_anon @@ -64,6 +70,8 @@ def __init__( self.anonymizer_sensitive_word: SensitiveWordAnonymizer | None = None self.compiled_regexes: list[list[CompiledRegexRule]] | None = None self.pwd_lookup: dict[str, str] | None = None + self.description_regexes: list[re.Pattern[str]] | None = None + self.description_lookup: dict[str, 
"""Anonymize description fields in network configuration files.

A description line is recognised in two shapes:

* quoted:   ``description "some text here"``
* unquoted: ``description some text here`` (optionally followed by ``;``)

The description text is replaced with ``descr_`` followed by eight lowercase
base32 characters derived from a salted SHA-256 hash, so the same text and
salt always produce the same replacement.
"""

import base64
import hashlib
import re

# Prefix put in front of every anonymized description.
_DESCRIPTION_PREFIX = "descr_"

# Number of digest bytes kept; 5 bytes encode to exactly 8 base32 characters.
_DIGEST_BYTES = 5

# Matches: description "some text here"
# Groups: ``pre`` = keyword, whitespace and opening quote; ``desc`` = the text
# between the quotes; ``post`` = the closing quote.
_DESCRIPTION_QUOTED_REGEX = re.compile(
    r'(?P<pre>description\s+")(?P<desc>[^"]+)(?P<post>")'
)

# Matches: description some text here (with optional trailing semicolon)
# ``desc`` may not start with a quote, a semicolon or whitespace. It is matched
# lazily so that trailing whitespace and an optional ";" stay outside of it.
# ``.*?`` (rather than ``.+?``) lets a one-character description match too.
_DESCRIPTION_UNQUOTED_REGEX = re.compile(
    r"(?P<pre>description\s+)(?P<desc>[^\";\s].*?)\s*(?P<post>;?\s*)$"
)


def anonymize_description(value: str, lookup: dict[str, str], salt: str) -> str:
    """Return a deterministic anonymized replacement for a description value.

    Args:
        value: The original description text.
        lookup: Cache mapping original text to its replacement. It is read
            first and updated in place when a new replacement is computed.
        salt: String prepended to ``value`` before hashing.

    Returns:
        ``"descr_"`` followed by 8 lowercase base32 characters taken from the
        first 5 bytes of ``sha256(salt + value)``.
    """
    if value in lookup:
        return lookup[value]
    digest = hashlib.sha256((salt + value).encode("utf-8")).digest()
    anon = _DESCRIPTION_PREFIX + _base32_encode(digest[:_DIGEST_BYTES]).lower()
    lookup[value] = anon
    return anon


def _base32_encode(data: bytes) -> str:
    """Base32 encode bytes and strip any ``=`` padding."""
    return base64.b32encode(data).decode("ascii").rstrip("=")


def generate_description_regexes() -> list[re.Pattern[str]]:
    """Return the compiled description regexes, quoted form first."""
    return [_DESCRIPTION_QUOTED_REGEX, _DESCRIPTION_UNQUOTED_REGEX]


def replace_descriptions(
    regexes: list[re.Pattern[str]], line: str, lookup: dict[str, str], salt: str
) -> str:
    """Replace the description text in a line if any regex matches.

    The regexes are tried in order and the first match wins. Only the span of
    the ``desc`` group is substituted; every other character of ``line``
    (leading context, quotes, whitespace before a semicolon, the semicolon and
    a trailing newline) is returned exactly as it was passed in.

    Args:
        regexes: Compiled patterns, each defining a named group ``desc``.
        line: The input line, with or without its line terminator.
        lookup: Cache passed through to ``anonymize_description``.
        salt: Salt passed through to ``anonymize_description``.

    Returns:
        The line with the description text replaced, or ``line`` unchanged
        when no regex matches.
    """
    for regex in regexes:
        match = regex.search(line)
        if match is None:
            continue
        # Splice on the desc span instead of re-joining pre/post groups: text
        # matched outside any named group (e.g. whitespace or the newline
        # between the description and the end of the line) must not be lost.
        start, end = match.span("desc")
        anon_value = anonymize_description(match.group("desc"), lookup, salt)
        return line[:start] + anon_value + line[end:]
    return line
+""" + + +def test_e2e_descriptions(tmpdir): + """Test that --anonymize-descriptions replaces description content.""" + filename = "test.cfg" + input_dir = tmpdir.mkdir("input") + input_dir.join(filename).write(INPUT_CONTENTS) + + output_dir = tmpdir.mkdir("output") + args = [ + "-i", + str(input_dir), + "-o", + str(output_dir), + "-s", + "E2ESALT", + "--anonymize-descriptions", + ] + main(args) + + with open(str(output_dir.join(filename))) as f: + output = f.read() + + # Description content should be replaced + assert "uplink to core-router1 (port 14)" not in output + assert "link-to-provider" not in output + assert "descr_" in output + + # Non-description lines should be preserved + assert "interface GigabitEthernet0/0" in output + assert "ip address 10.0.0.1 255.255.255.0" in output + assert "interface GigabitEthernet0/1" in output + + +def test_e2e_descriptions_deterministic(tmpdir): + """Test that description anonymization is deterministic with same salt.""" + filename = "test.cfg" + + input_dir1 = tmpdir.mkdir("input1") + input_dir1.join(filename).write(INPUT_CONTENTS) + output_dir1 = tmpdir.mkdir("output1") + + input_dir2 = tmpdir.mkdir("input2") + input_dir2.join(filename).write(INPUT_CONTENTS) + output_dir2 = tmpdir.mkdir("output2") + + args_base = ["-s", "DETSALT", "--anonymize-descriptions"] + + main(args_base + ["-i", str(input_dir1), "-o", str(output_dir1)]) + main(args_base + ["-i", str(input_dir2), "-o", str(output_dir2)]) + + with ( + open(str(output_dir1.join(filename))) as f1, + open(str(output_dir2.join(filename))) as f2, + ): + assert f1.read() == f2.read() diff --git a/tests/unit/test_description_anonymization.py b/tests/unit/test_description_anonymization.py new file mode 100644 index 0000000..2f1b9fd --- /dev/null +++ b/tests/unit/test_description_anonymization.py @@ -0,0 +1,204 @@ +"""Tests for description anonymization module.""" + +import pytest + +from netconan.description_anonymization import ( + anonymize_description, + 
generate_description_regexes, + replace_descriptions, +) + + +class TestAnonymizeDescription: + """Tests for the anonymize_description function.""" + + def test_deterministic(self): + """Same input+salt always produces the same output.""" + lookup = {} + result1 = anonymize_description("server.example.net", lookup, "salt1") + lookup2 = {} + result2 = anonymize_description("server.example.net", lookup2, "salt1") + assert result1 == result2 + + def test_format_prefix(self): + """Output starts with 'descr_'.""" + lookup = {} + result = anonymize_description("test value", lookup, "salt1") + assert result.startswith("descr_") + + def test_format_length(self): + """Output has 8 chars after the prefix.""" + lookup = {} + result = anonymize_description("test value", lookup, "salt1") + suffix = result[len("descr_") :] + assert len(suffix) == 8 + + def test_format_lowercase_alphanumeric(self): + """Output suffix is lowercase alphanumeric (base32).""" + lookup = {} + result = anonymize_description("test value", lookup, "salt1") + suffix = result[len("descr_") :] + assert suffix == suffix.lower() + assert suffix.isalnum() + + def test_caching_in_lookup(self): + """Once computed, the result is cached in the lookup dict.""" + lookup = {} + result = anonymize_description("cached value", lookup, "salt1") + assert "cached value" in lookup + assert lookup["cached value"] == result + + def test_different_salt_different_result(self): + """Different salts produce different results.""" + result1 = anonymize_description("same value", {}, "salt_a") + result2 = anonymize_description("same value", {}, "salt_b") + assert result1 != result2 + + def test_different_values_different_result(self): + """Different description values produce different results.""" + result1 = anonymize_description("value_one", {}, "salt1") + result2 = anonymize_description("value_two", {}, "salt1") + assert result1 != result2 + + +class TestRegexMatching: + """Tests for description regex patterns.""" + + 
@pytest.fixture + def regexes(self): + """Return compiled description regexes.""" + return generate_description_regexes() + + @pytest.mark.parametrize( + "line,expected_desc", + [ + ( + 'description "server.example.net (port14)"', + "server.example.net (port14)", + ), + ('description "Core Router - Site A"', "Core Router - Site A"), + (' description "indented quoted"', "indented quoted"), + ], + ids=["quoted-basic", "quoted-spaces", "quoted-indented"], + ) + def test_quoted_regex_matches(self, regexes, line, expected_desc): + """Quoted description regex captures the description content.""" + match = regexes[0].search(line) + assert match is not None + assert match.group("desc") == expected_desc + + @pytest.mark.parametrize( + "line,expected_desc", + [ + ("description server.example.net", "server.example.net"), + ("description Link-to-upstream;", "Link-to-upstream"), + ("description multi word value", "multi word value"), + (" description indented-value", "indented-value"), + ("description trailing-semi ;", "trailing-semi"), + ], + ids=[ + "unquoted-simple", + "unquoted-semicolon", + "unquoted-multiword", + "unquoted-indented", + "unquoted-space-before-semi", + ], + ) + def test_unquoted_regex_matches(self, regexes, line, expected_desc): + """Unquoted description regex captures the description content.""" + match = regexes[1].search(line) + assert match is not None + assert match.group("desc") == expected_desc + + @pytest.mark.parametrize( + "line", + [ + "hostname router1", + "interface GigabitEthernet0/0", + "ip address 10.0.0.1 255.255.255.0", + "set description-limit 100", + ], + ids=[ + "hostname", + "interface", + "ip-address", + "set-description-limit", + ], + ) + def test_no_false_positives(self, regexes, line): + """Lines that are not descriptions should not match.""" + for regex in regexes: + assert regex.search(line) is None + + +class TestReplaceDescriptions: + """Tests for the replace_descriptions function.""" + + @pytest.fixture + def regexes(self): + 
"""Return compiled description regexes.""" + return generate_description_regexes() + + def test_quoted_replacement(self, regexes): + """Quoted description content is replaced, quotes preserved.""" + lookup = {} + line = 'description "server.example.net (port14)"' + result = replace_descriptions(regexes, line, lookup, "salt1") + assert result.startswith('description "descr_') + assert result.endswith('"') + assert "server.example.net" not in result + + def test_unquoted_replacement(self, regexes): + """Unquoted description content is replaced.""" + lookup = {} + line = "description server.example.net" + result = replace_descriptions(regexes, line, lookup, "salt1") + assert result.startswith("description descr_") + assert "server.example.net" not in result + + def test_semicolon_preserved(self, regexes): + """Trailing semicolons are preserved after replacement.""" + lookup = {} + line = "description Link-to-upstream;" + result = replace_descriptions(regexes, line, lookup, "salt1") + assert result.endswith(";") + assert "Link-to-upstream" not in result + + def test_quoted_semicolon_preserved(self, regexes): + """Quoted description with trailing semicolon preserved.""" + lookup = {} + line = 'description "Core Router - Site A";' + result = replace_descriptions(regexes, line, lookup, "salt1") + assert result.endswith('";') + assert "Core Router" not in result + + def test_non_matching_unchanged(self, regexes): + """Lines that don't match are returned unchanged.""" + lookup = {} + line = "ip address 10.0.0.1 255.255.255.0" + result = replace_descriptions(regexes, line, lookup, "salt1") + assert result == line + + def test_deterministic_replacement(self, regexes): + """Same description produces the same replacement.""" + lookup1 = {} + lookup2 = {} + line = 'description "test value"' + result1 = replace_descriptions(regexes, line, lookup1, "salt1") + result2 = replace_descriptions(regexes, line, lookup2, "salt1") + assert result1 == result2 + + def 
test_context_preserved(self, regexes): + """Leading whitespace/context is preserved.""" + lookup = {} + line = ' description "indented value"' + result = replace_descriptions(regexes, line, lookup, "salt1") + assert result.startswith(" description ") + + def test_set_style_description(self, regexes): + """Set description ... style lines are handled.""" + lookup = {} + line = "set interfaces ge-0/0/0 description upstream-link" + result = replace_descriptions(regexes, line, lookup, "salt1") + assert "descr_" in result + assert "upstream-link" not in result diff --git a/tests/unit/test_file_anonymizer_descriptions.py b/tests/unit/test_file_anonymizer_descriptions.py new file mode 100644 index 0000000..bde906a --- /dev/null +++ b/tests/unit/test_file_anonymizer_descriptions.py @@ -0,0 +1,73 @@ +"""Tests for description anonymization integration in the FileAnonymizer pipeline.""" + +import io + +from netconan.anonymize_files import FileAnonymizer + + +class TestFileAnonymizerDescriptions: + """Tests for description anonymization through the FileAnonymizer pipeline.""" + + def _anonymize_line( + self, line, anon_descriptions=True, anon_pwd=False, salt="test" + ): + """Helper: run a single line through the anonymizer pipeline.""" + anonymizer = FileAnonymizer( + anon_pwd=anon_pwd, + anon_ip=False, + salt=salt, + anon_descriptions=anon_descriptions, + ) + in_io = io.StringIO(line) + out_io = io.StringIO() + anonymizer.anonymize_io(in_io, out_io) + return out_io.getvalue() + + def test_description_only(self): + """Description line is anonymized when anon_descriptions is enabled.""" + result = self._anonymize_line('description "sensitive host"\n') + assert "descr_" in result + assert "sensitive host" not in result + + def test_descriptions_disabled(self): + """Description line is unchanged when anon_descriptions is disabled.""" + line = 'description "sensitive host"\n' + result = self._anonymize_line(line, anon_descriptions=False) + assert result == line + + def 
test_description_with_passwords(self): + """Both description and password anonymization work together.""" + lines = 'description "link to core"\npassword foobar\n' + result = self._anonymize_line(lines, anon_descriptions=True, anon_pwd=True) + assert "descr_" in result + assert "link to core" not in result + assert "foobar" not in result + + def test_non_description_unchanged(self): + """Non-description lines are not modified.""" + line = "ip address 10.0.0.1 255.255.255.0\n" + result = self._anonymize_line(line) + assert result == line + + def test_deterministic_with_salt(self): + """Same salt produces same anonymized output.""" + line = 'description "test value"\n' + result1 = self._anonymize_line(line, salt="mysalt") + result2 = self._anonymize_line(line, salt="mysalt") + assert result1 == result2 + + def test_multiline_file(self): + """Only description lines are modified in a multi-line file.""" + content = ( + "interface GigabitEthernet0/0\n" + ' description "uplink to ISP"\n' + " ip address 10.0.0.1 255.255.255.0\n" + "!\n" + ) + result = self._anonymize_line(content) + lines = result.split("\n") + assert lines[0] == "interface GigabitEthernet0/0" + assert "descr_" in lines[1] + assert "uplink to ISP" not in lines[1] + assert lines[2] == " ip address 10.0.0.1 255.255.255.0" + assert lines[3] == "!"