Special Character Classes Explained with Examples

Special Character Classes Explained with Examples

1. [\\\^\-\]] – Escaped special characters in brackets

Description: Matches literal backslash, caret, hyphen, or closing bracket characters inside character classes

Example 1: Matching literal special characters

python

import re

# The class lists four escaped literals: backslash, caret, hyphen, and closing
# bracket. The opening bracket at the end of the sample is not in the class.
text = "Special chars: \\ ^ - ] ["
escaped_specials = re.compile(r'[\\\^\-\]]')
result = escaped_specials.findall(text)
print(result)  # ['\\', '^', '-', ']']

Example 2: Extracting file paths with backslashes

python

text = "Paths: C:\\Windows\\System32, /usr/bin/, D:\\Program Files\\"
# After the drive letter and colon, the class [\\\w] accepts both backslashes
# and word characters, so a match keeps going across path separators and only
# stops at the first character outside the class (comma, space, slash).
result = re.findall(r'[A-Z]:[\\\w]+', text)
print(result)  # ['C:\\Windows\\System32', 'D:\\Program']
# Matches Windows paths with literal backslashes; the second path is cut at
# the space in "Program Files" because a space is not part of the class

Example 3: Finding mathematical ranges

python

text = "Ranges: 1-10, 20-30, A-Z, 5-10, a-z"
# One alphanumeric, an escaped (literal) hyphen, one alphanumeric: every hit
# is exactly three characters, so multi-digit bounds are cut to one digit.
range_pattern = re.compile(r'[A-Za-z0-9]\-[A-Za-z0-9]')
result = range_pattern.findall(text)
print(result)  # ['1-1', '0-3', 'A-Z', '5-1', 'a-z']

Example 4: Escaping regex metacharacters in search

python

text = "Regex specials: [group], ^start, end$, .any, *star"
# Inside the class each metacharacter is escaped, so it is matched literally;
# hits are returned in the order they appear in the text.
metacharacters = re.compile(r'[\^\.\*\$\[\]]')
result = metacharacters.findall(text)
print(result)  # ['[', ']', '^', '$', '.', '*']

2. [\n\t\r] – Common whitespace characters

Description: Matches newline, tab, or carriage return characters

Example 1: Finding all whitespace characters

python

text = "Hello\tWorld\nHow are you?\rGoodbye"
# Only newline, tab, and carriage return are in the class; the ordinary
# spaces in the sample are not matched.
control_whitespace = re.compile(r'[\n\t\r]')
result = control_whitespace.findall(text)
print(result)  # ['\t', '\n', '\r']
print("Whitespace count:", len(result))  # Whitespace count: 3

Example 2: Normalizing different line endings

python

text = "Line 1\r\nLine 2\nLine 3\rLine 4"
# Collapse every run of CR and/or LF characters into a single Unix-style \n
line_break_run = re.compile(r'[\r\n]+')
normalized = line_break_run.sub('\n', text)
print(repr(normalized))  # 'Line 1\nLine 2\nLine 3\nLine 4'

Example 3: Counting tab characters used for indentation

python

code = "def example():\n\tprint('Hello')\n\t\tprint('Indented')\n\treturn"
# findall returns one entry per tab character, so len(tabs) is the total
# number of tabs in the snippet (1 + 2 + 1), not the deepest nesting level.
tabs = re.findall(r'\t', code)
print("Indentation levels found:", len(tabs))  # Indentation levels found: 4

Example 4: Splitting on any whitespace including newlines

python

text = "Hello\tWorld\nHow  are\ryou today?"
# Each run of whitespace (spaces, tab, newline, carriage return) acts as one
# separator, so the double space does not produce an empty string.
separator = re.compile(r'[\s\n\t\r]+')
words = separator.split(text)
print(words)  # ['Hello', 'World', 'How', 'are', 'you', 'today?']

3. [\x00-\x7F] – ASCII characters

Description: Matches any character in the ASCII range (0-127)

Example 1: Filter ASCII characters only

python

text = "Hello 世界! 123 ñ Café"
# findall yields one string per ASCII character (code points 0-127)
ascii_only = re.findall(r'[\x00-\x7F]', text)
print(''.join(ascii_only))  # "Hello ! 123  Caf"
# Removes non-ASCII characters (世界, ñ, é); the spaces around them are ASCII
# and stay, which is why two spaces appear between "123" and "Caf"

Example 2: Validate ASCII-only text

python

def is_ascii_only(text):
    """Return True when every character of text is in the ASCII range 0-127."""
    # The negated class finds the first character outside \x00-\x7F, if any
    non_ascii = re.search(r'[^\x00-\x7F]', text)
    return non_ascii is None

for sample in (
    "Hello World",  # True
    "Hello 世界",    # False
    "Café",         # False
    "123!@#",       # True
):
    print(is_ascii_only(sample))

Example 3: Remove control characters (non-printable ASCII)

python

text = "Hello\x00World\x07\x1BTest\nNormal"
# \x20-\x7E spans space through tilde, i.e. printable ASCII (32-126); the NUL,
# BEL, ESC, and newline characters in the sample fall outside it.
printable_char = re.compile(r'[\x20-\x7E]')
printable = printable_char.findall(text)
print(''.join(printable))  # "HelloWorldTestNormal"

Example 4: Extract ASCII strings from mixed content

python

text = "ASCII: Hello, Non-ASCII: 中文, Emoji: 😊, Numbers: 123"
# The + quantifier groups consecutive printable ASCII characters, so every
# non-ASCII character splits the text into a new chunk.
printable_run = re.compile(r'[\x20-\x7E]+')
ascii_parts = printable_run.findall(text)
print(ascii_parts)  # ['ASCII: Hello, Non-ASCII: ', ', Emoji: ', ', Numbers: 123']

4. [\u0000-\uFFFF] – Unicode characters

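Description: Matches any character in the Basic Multilingual Plane (U+0000–U+FFFF, which covers most commonly used scripts). Characters in the supplementary planes, including most emoji, lie above U+FFFF and need eight-digit \U escapes such as \U0001F300.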

Example 1: Working with multilingual text

python

text = "Hello 世界! 🌍 Bonjour ñ Café 🎉"
# The range stops at U+FFFF, the end of the Basic Multilingual Plane
all_chars = re.findall(r'[\u0000-\uFFFF]', text)
print(all_chars)  # ['H', 'e', 'l', 'l', 'o', ' ', '世', '界', '!', ' ', ' ', 'B', 'o', 'n', 'j', 'o', 'u', 'r', ' ', 'ñ', ' ', 'C', 'a', 'f', 'é', ' ']
# Matches every BMP character (ASCII, accented Latin, CJK); the emoji 🌍 and 🎉
# are above U+FFFF, so they are skipped

Example 2: Finding specific Unicode ranges

python

text = "中文 Chinese, 日本語 Japanese, 한국어 Korean, English"
# Find CJK characters (approx range): U+4E00-U+9FFF holds the CJK Unified
# Ideographs used by Chinese and by Japanese kanji. Korean Hangul syllables
# live in a different block, so 한국어 is not matched.
cjk_chars = re.findall(r'[\u4E00-\u9FFF]', text)
print(''.join(cjk_chars))  # "中文日本語"

Example 3: Validating Unicode input

python

def contains_unicode(text):
    """Return True when text holds at least one character above U+007F."""
    # A hit on the negated ASCII range means a non-ASCII character is present
    return re.search(r'[^\u0000-\u007F]', text) is not None

for sample in (
    "ASCII only",   # False
    "Café",         # True
    "Hello 世界",    # True
    "123!@#",       # False
):
    print(contains_unicode(sample))

Example 4: Extracting emojis and symbols

python

text = "I love Python! 🐍🚀 It's amazing! 💻✨ 🎯"
# Approximate emoji/symbol range. Most emoji sit in the supplementary planes
# (above U+FFFF), so the upper bound must be written with an eight-digit \U
# escape; a class ending at \uFFFF would only catch BMP symbols such as ✨.
symbols = re.findall(r'[\u2000-\U0010FFFF]', text)
print(symbols)  # ['🐍', '🚀', '💻', '✨', '🎯']
# Matches emojis and other symbols from U+2000 upward

Bonus: Advanced Examples

Example: Mixed character class usage

python

text = "File: C:\\Users\\文档\\file.txt\nSize: 1.5MB\r\nUnicode: 中文 🎉"

# Extract different components
# Drive letter, colon, then backslashes / word characters / CJK ideographs;
# the match stops at the '.' in front of the file extension
paths = re.findall(r'[A-Z]:[\\\w\u4E00-\u9FFF]+', text)
# Require a leading digit (with an optional decimal part) before the unit so
# that ".txt" from the file name is not reported as a size
sizes = re.findall(r'\d+(?:\.\d+)?[A-Za-z]+', text)
# One entry per CJK ideograph or emoji anywhere in the text, which includes
# the two ideographs inside the file path
unicode_content = re.findall(r'[\u4E00-\u9FFF\U0001F300-\U0001F9FF]', text)

print("Paths:", paths)            # ['C:\\Users\\文档\\file']
print("Sizes:", sizes)            # ['1.5MB']
print("Unicode:", unicode_content) # ['文', '档', '中', '文', '🎉']

Example: Cleaning text with multiple character classes

python

def clean_text(text):
    """Strip control characters and collapse whitespace runs to single spaces.

    Args:
        text: The string to clean. Non-ASCII characters are left untouched.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    # Remove control characters but keep Unicode. Tab, newline, and carriage
    # return are deliberately left out of this class so the next step can
    # turn them into spaces instead of gluing neighbouring words together.
    text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
    # Normalize whitespace
    text = re.sub(r'[\n\t\r]+', ' ', text)
    # Remove excessive spaces
    text = re.sub(r' +', ' ', text)
    return text.strip()

dirty_text = "Hello\t\tWorld\n\n\nUnicode: 中文\r\x00Control chars"
clean = clean_text(dirty_text)
# Only the NUL byte is deleted; the words after it are ordinary text and stay
print(repr(clean))  # 'Hello World Unicode: 中文 Control chars'

Example: Password complexity checker

python

def check_password_complexity(password):
    """Report which character categories appear in password.

    Returns a dict of booleans keyed 'has_upper', 'has_lower', 'has_digit',
    'has_special', 'has_unicode', and 'is_strong'. A password is strong when
    it has an ASCII uppercase letter, an ASCII lowercase letter, a digit, and
    at least 8 characters; the special and unicode flags are informational.
    """
    # 'has_special' uses [^\w], so the underscore and non-ASCII letters such
    # as 'ä' do not count as special characters.
    category_patterns = {
        'has_upper': r'[A-Z]',
        'has_lower': r'[a-z]',
        'has_digit': r'[0-9]',
        'has_special': r'[^\w]',
        'has_unicode': r'[^\x00-\x7F]',
    }
    report = {
        flag: re.search(pattern, password) is not None
        for flag, pattern in category_patterns.items()
    }
    report['is_strong'] = (
        report['has_upper']
        and report['has_lower']
        and report['has_digit']
        and len(password) >= 8
    )
    return report

for candidate in ("Pass123!", "password", "Pässwörd123!"):
    print(check_password_complexity(candidate))

Similar Posts

Leave a Reply

Your email address will not be published. Required fields are marked *