"""
Tests for identifier extraction logic.

Tests the extract_identifier function which extracts clean identifier
values from raw input using country-specific PEPPOL schemes.
"""

import unittest
import warnings

from ubl.identifiers.extractor import extract_identifier


class TestIdentifierExtraction(unittest.TestCase):
    """Test identifier extraction with regex patterns.

    Every test calls ``extract_identifier`` with three positional arguments
    as used throughout this class: the raw input value, a country code
    (e.g. ``"BE"``) and an identifier type (``"vat"`` or ``"registration"``).

    Two groups of expectations are covered:

    * Schemes without a regex (the VAT cases here) are expected to hand the
      raw value back untouched - prefixes, whitespace, case and even an
      empty string included.
    * Schemes with a regex (the registration cases here) are expected to
      return only the matching part of the input, or ``None`` when nothing
      matches.
    """

    def test_extract_vat_no_regex_belgium(self):
        """VAT schemes have no regex - accept as-is."""
        result = extract_identifier("BE0867709540", "BE", "vat")
        self.assertEqual(result, "BE0867709540")

    def test_extract_vat_no_regex_netherlands(self):
        """Netherlands VAT - no regex."""
        result = extract_identifier("NL855934682B01", "NL", "vat")
        self.assertEqual(result, "NL855934682B01")

    def test_extract_vat_with_prefix(self):
        """VAT with 'vat:' prefix - still accepted (no regex)."""
        result = extract_identifier("vat: BE0867709540", "BE", "vat")
        # Without a regex there is nothing to strip the prefix with, so the
        # whole raw string is expected back.
        self.assertEqual(result, "vat: BE0867709540")

    def test_extract_kvk_fails_regex(self):
        """Netherlands KVK extraction fails (8 digits vs 17 expected)."""
        # KVK in real world is 8 digits, but PEPPOL scheme expects 17 (padded?)
        # OINO also expects 20 digits. Both regexes fail for 8-digit number.
        # For now, extraction returns None - padding logic can be added later.
        result = extract_identifier("kvk: 64985636", "NL", "registration")
        self.assertIsNone(result)  # No scheme matches 8 digits

    def test_extract_belgian_enterprise_number(self):
        """Belgian enterprise number with regex."""
        result = extract_identifier("0597601756", "BE", "registration")
        self.assertEqual(result, "0597601756")

    def test_extract_belgian_enterprise_with_prefix(self):
        """Belgian enterprise number with text prefix."""
        result = extract_identifier("enterprise: 0597601756", "BE", "registration")
        # BE:EN regex is 0[0-9]{9}; only the matching digits are expected
        # back, without the "enterprise: " prefix.
        self.assertEqual(result, "0597601756")

    def test_extract_no_matching_scheme(self):
        """No scheme for country+type returns None."""
        result = extract_identifier("12345", "XX", "vat")  # Invalid country
        self.assertIsNone(result)

    def test_extract_invalid_format_with_regex(self):
        """Invalid format for scheme with regex returns None and warns."""
        with warnings.catch_warnings(record=True) as caught:
            # Record every warning, including ones the default filters would
            # hide or de-duplicate across tests.
            warnings.simplefilter("always")
            # Belgian enterprise number must start with 0
            result = extract_identifier("9597601756", "BE", "registration")

        # Should fail regex validation
        self.assertIsNone(result)

        # Should have warning. Because the "always" filter also records
        # unrelated warnings (e.g. deprecations raised along the way), the
        # extraction warning is not guaranteed to be the first entry - look
        # through all of them instead of only the first one.
        messages = [str(item.message) for item in caught]
        self.assertTrue(
            any("Could not extract" in text for text in messages),
            "expected a 'Could not extract' warning, got: %r" % (messages,),
        )

    def test_extract_vat_germany(self):
        """German VAT - no regex."""
        result = extract_identifier("DE123456789", "DE", "vat")
        self.assertEqual(result, "DE123456789")

    def test_extract_vat_france(self):
        """French VAT - no regex."""
        result = extract_identifier("FR12345678901", "FR", "vat")
        self.assertEqual(result, "FR12345678901")

    def test_extract_french_sirene(self):
        """French SIRENE (9 digits) with regex."""
        result = extract_identifier("784301772", "FR", "registration")
        # FR:SIRENE regex: [0-9]{9}([0-9]{5})?
        self.assertEqual(result, "784301772")

    def test_extract_french_siret(self):
        """French SIRET (14 digits) with regex."""
        result = extract_identifier("78430177200025", "FR", "registration")
        # Should extract 14-digit number
        self.assertEqual(result, "78430177200025")

    def test_extract_empty_value(self):
        """Empty value with no regex returns empty string."""
        result = extract_identifier("", "BE", "vat")
        # No regex means accept as-is, even empty
        self.assertEqual(result, "")

    def test_extract_whitespace_trimmed(self):
        """Surrounding whitespace is preserved (not trimmed) with no regex.

        Despite the method name, the expectation pinned here is that the
        value comes back unchanged; the name is kept so existing test IDs
        stay stable.
        """
        result = extract_identifier("  BE0867709540  ", "BE", "vat")
        # No regex means accept as-is, whitespace included
        self.assertEqual(result, "  BE0867709540  ")

    def test_extract_mixed_case(self):
        """Lower-case input with no regex is returned without case changes."""
        result = extract_identifier("be0867709540", "BE", "vat")
        # No regex means accept as-is
        self.assertEqual(result, "be0867709540")


# Allow running this file directly as a script; unittest.main() then loads
# and runs the tests defined in this module.
if __name__ == "__main__":
    unittest.main()
