Special Sequences in Python
Special Sequences in Python Regular Expressions – Detailed Explanation
Special sequences are escape sequences that represent specific character types or positions in regex patterns.
1. \A – Start of String Anchor
Description: Matches only at the absolute start of the string (unaffected by re.MULTILINE flag)
Example 1: Match only at absolute beginning
python
import re

# \A anchors the match to the very first character of the whole string,
# so the second "Start" (after the newline) is never considered.
text = "Start here\nStart again"
result = re.findall(r'\AStart', text)
print(result)  # ['Start'] - Only matches at very beginning
Example 2: Comparison with ^ in multiline mode
python
# Contrast ^ (which re.MULTILINE lets match at every line start) with \A
# (which only ever matches at the start of the whole string).
text = "First line\nSecond line"
caret_pattern = re.compile(r'^.*', re.MULTILINE)
result_caret = caret_pattern.findall(text)
result_A = re.compile(r'\A.*').findall(text)
print("^ matches:", result_caret)  # ['First line', 'Second line']
print(r"\A matches:", result_A)  # ['First line'] - Only first line
2. \Z – End of String Anchor
Description: Matches only at the absolute end of the string (unaffected by re.MULTILINE flag). Unlike $, it does not match just before a trailing newline.
Example 1: Match only at absolute end
python
# \Z anchors the match to the very end of the whole string.
text = "Line one\nLine two\nEnd"
result = re.findall(r'End\Z', text)
print(result)  # ['End'] - Only matches at very end
Example 2: Validate file extension at end
python
# Keep only the names whose final characters are exactly ".txt";
# "script.py.backup" is rejected because \Z pins the match to the end.
filenames = ["document.txt", "image.jpg", "script.py.backup"]
valid_extensions = [f for f in filenames if re.search(r'\.txt\Z', f)]
print(valid_extensions)  # ['document.txt'] - Only exact .txt at end
3. \n – Newline Character
Description: Matches a newline character (line feed)
Example 1: Count lines in text
python
# Lines are separated by newlines, so N separators means N + 1 lines.
text = "Line 1\nLine 2\nLine 3\nLine 4"
newline_hits = re.findall(r'\n', text)
line_count = len(newline_hits) + 1
print("Number of lines:", line_count)  # 4
Example 2: Split on newlines
python
# Split the text into its individual lines on each newline character.
text = "Hello\nWorld\nPython\nProgramming"
lines = re.split(r'\n', text)
print(lines)  # ['Hello', 'World', 'Python', 'Programming']
4. \t – Tab Character
Description: Matches a tab character
Example 1: Find tab-separated values
python
# Break the text into rows on newlines, then each row into columns on tabs.
text = "Name\tAge\tCity\nJohn\t25\tNYC\nJane\t30\tLA"
rows = re.split(r'\n', text)
for row in rows:
    columns = re.split(r'\t', row)
    print(columns)
# Output: ['Name', 'Age', 'City'], ['John', '25', 'NYC'], ['Jane', '30', 'LA']
Example 2: Replace tabs with spaces
python
text = "Column1\tColumn2\tColumn3"
# Replace each tab with 4 spaces.
spaced_text = re.sub(r'\t', '    ', text)
print(spaced_text)  # "Column1    Column2    Column3"
5. \r – Carriage Return
Description: Matches a carriage return character
Example 1: Handle Windows line endings
python
text = "Line 1\r\nLine 2\r\nLine 3"
# Remove carriage returns, leaving Unix-style "\n" line endings.
clean_text = re.sub(r'\r', '', text)
print(repr(clean_text))  # 'Line 1\nLine 2\nLine 3'
Example 2: Count carriage returns
python
# Every \r counts, whether it stands alone or is part of a \r\n pair.
text = "Hello\rWorld\r\nTest\r"
cr_hits = re.compile(r'\r').findall(text)
cr_count = len(cr_hits)
print("Carriage returns:", cr_count)  # 3
6. \f – Form Feed
Description: Matches a form feed character (rarely used in modern text)
Example 1: Find form feed characters
python
# Collect every form feed (page break) character in the text.
text = "Page 1\fPage 2\fPage 3"
form_feed = re.compile(r'\f')
page_breaks = form_feed.findall(text)
print("Form feeds found:", len(page_breaks))  # 2
Example 2: Split on form feeds
python
# Treat each form feed as a section separator.
text = "Section 1\fSection 2\fSection 3"
sections = re.split(r'\f', text)
print(sections)  # ['Section 1', 'Section 2', 'Section 3']
7. \v – Vertical Tab
Description: Matches a vertical tab character (rarely used)
Example 1: Detect vertical tabs
python
# Collect every vertical tab character in the text.
text = "Row1\vRow2\vRow3"
vertical_tab = re.compile(r'\v')
vertical_tabs = vertical_tab.findall(text)
print("Vertical tabs:", len(vertical_tabs))  # 2
Example 2: Replace vertical tabs
python
text = "Item1\vItem2\vItem3"
# Replace vertical tabs with newlines.
clean_text = re.sub(r'\v', '\n', text)
# repr() shows the escapes; a plain print() would emit three separate lines.
print(repr(clean_text))  # 'Item1\nItem2\nItem3'
Practical Examples with Multiple Special Sequences
Example: Normalize different line endings
python
def normalize_line_endings(text):
    r"""Convert all line endings in *text* to Unix style (\n).

    Windows (\r\n) pairs and lone old-Mac (\r) carriage returns each
    become a single \n; existing \n characters are left untouched.

    Args:
        text: The string whose line endings should be normalized.

    Returns:
        A new string containing no \r characters.
    """
    # One pass: a \r optionally followed by \n is exactly one line break.
    # Matching the pair first means \r\n never turns into two newlines.
    return re.sub(r'\r\n?', '\n', text)
# One string mixing Windows, Unix and old-Mac line endings.
mixed_text = "Line 1\r\nLine 2\nLine 3\rLine 4"
normalized = normalize_line_endings(mixed_text)
# !r formats with repr(), so the escapes stay visible in the output.
print(f"{normalized!r}")  # 'Line 1\nLine 2\nLine 3\nLine 4'
Example: Parse CSV-like data with various whitespace
python
def parse_data(text):
    """Parse whitespace-separated rows of text into lists of fields.

    Each non-blank line becomes one row. Within a line, any run of spaces,
    tabs and vertical tabs (in any mixture) counts as a single separator,
    and leading or trailing whitespace is ignored, so no spurious empty
    fields are produced.

    Args:
        text: Newline-separated rows of data.

    Returns:
        A list of rows, each row being a list of field strings.
    """
    rows = []
    for line in re.split(r'\n', text):
        line = line.strip()  # drop edge whitespace so it cannot create empty fields
        if not line:
            continue  # skip blank lines
        # A single character class covers mixed runs such as " \t ", which
        # would otherwise be turned into several consecutive commas.
        normalized = re.sub(r'[ \t\v]+', ',', line)
        # Splitting on commas also keeps already comma-separated input working.
        rows.append(normalized.split(','))
    return rows
# Sample input that separates fields with both spaces and a tab.
data_text = "Name Age City\nJohn 25\tNYC\nJane 30 LA"
parsed = parse_data(data_text)
print(parsed)
# Output: [['Name', 'Age', 'City'], ['John', '25', 'NYC'], ['Jane', '30', 'LA']]
Example: Validate complete string patterns
python
def is_valid_pattern(text, pattern):
    """Report whether the entire *text* matches *pattern*.

    Args:
        text: The string to validate.
        pattern: A regular expression (string or compiled pattern).

    Returns:
        True if the pattern matches from the first to the last character
        of *text*, otherwise False.
    """
    # fullmatch() only succeeds when the match spans the whole string.
    return re.fullmatch(pattern, text) is not None
# Validate email (entire string must match)
emails = ["user@example.com", "invalid@", "@domain.com"]
# \A and \Z state the whole-string requirement explicitly in the pattern.
email_pattern = r'\A[\w\.-]+@[\w\.-]+\.\w+\Z'
for email in emails:
    valid = is_valid_pattern(email, email_pattern)
    print(f"'{email}': {valid}")
# Output: 'user@example.com': True, 'invalid@': False, '@domain.com': False
Example: Extract content between specific markers
python
def extract_sections(text, start_marker, end_marker):
    """Extract the content found between each pair of markers.

    Args:
        text: The text to search.
        start_marker: Literal string that opens a section.
        end_marker: Literal string that closes a section.

    Returns:
        A list with the text between each start/end marker pair, in order
        of appearance (markers themselves excluded).
    """
    # re.escape() makes the markers match literally even when they contain
    # regex metacharacters such as "[" or "]".
    opening = re.escape(start_marker)
    closing = re.escape(end_marker)
    # (.*?) is non-greedy so each section stops at the nearest end marker;
    # re.DOTALL lets "." span newlines inside a section.
    pattern = fr'{opening}(.*?){closing}'
    return re.findall(pattern, text, re.DOTALL)
text = """
[START_SECTION]
This is section 1
With multiple lines
[END_SECTION]
[START_SECTION]
Section 2 content
[END_SECTION]
"""
sections = extract_sections(text, "[START_SECTION]", "[END_SECTION]")
# Number the sections from 1 and trim the newlines that surround each body.
for i, section in enumerate(sections, 1):
    print(f"Section {i}:\n{section.strip()}\n")
Important Notes
- String vs. Pattern: Special sequences are interpreted by the regex engine inside patterns. \A and \Z have no meaning in ordinary strings, whereas \n, \t, \r, \f and \v are also ordinary Python string escapes for the same characters
- Raw strings: Always use raw strings (r'pattern') to avoid interpretation of backslashes
- Multiline mode: \A and \Z are unaffected by re.MULTILINE, unlike ^ and $
- Platform differences: Line ending characters vary by platform (\n on Unix, \r\n on Windows, \r on old Mac)
python
# Platform-aware line ending handling
def get_line_endings(text):
    """Detect the predominant line ending style in *text*.

    Args:
        text: The string to inspect.

    Returns:
        'Windows (\\r\\n)', 'Unix (\\n)' or 'Old Mac (\\r)' when that style
        is strictly more frequent than both others; 'Mixed or none' when
        there is a tie or the text contains no line endings at all.
    """
    counts = {
        'Windows (\\r\\n)': len(re.findall(r'\r\n', text)),
        'Unix (\\n)': len(re.findall(r'(?<!\r)\n', text)),  # \n not preceded by \r
        'Old Mac (\\r)': len(re.findall(r'\r(?!\n)', text)),  # \r not followed by \n
    }
    highest = max(counts.values())
    leaders = [style for style, count in counts.items() if count == highest]
    # A style wins only when it alone holds the highest count; ties
    # (including the all-zero case) have no predominant style.
    if len(leaders) == 1:
        return leaders[0]
    return 'Mixed or none'
# One line ending of each style, so no single style predominates.
text = "Line 1\r\nLine 2\nLine 3\rLine 4"
detected_style = get_line_endings(text)
print("Line ending style:", detected_style)  # Line ending style: Mixed or none