After years of working in localization, I’ve built up a collection of Python scripts that I use almost daily. Today, I’m sharing five of the most useful ones.
1. TMX Segment Counter
Need to quickly count segments in a TMX file without opening a CAT tool?
import xml.etree.ElementTree as ET
def count_tmx_segments(filepath):
    """Return the number of translation units (<tu> elements) in a TMX file.

    Parsing errors (missing file, malformed XML) propagate from ET.parse.
    """
    root = ET.parse(filepath).getroot()
    # Every <tu> lives under <body>; iterating the whole tree by tag name
    # finds exactly the same descendants as findall('.//tu').
    return sum(1 for _ in root.iter('tu'))
# Usage: point at a TMX file on disk. Any parse failure (missing file,
# malformed XML) propagates as an exception from count_tmx_segments.
count = count_tmx_segments('my_memory.tmx')
print(f"Total segments: {count}")
Simple, but I use this constantly for quick sanity checks.
2. Batch XLIFF to CSV Converter
When you need to analyze translation data in a spreadsheet:
import xml.etree.ElementTree as ET
import csv
import glob
def xliff_to_csv(xliff_path, csv_path):
    """Convert an XLIFF 1.2 file to a CSV with Source/Target/ID columns.

    Fix over the naive version: segments frequently contain inline markup
    (<g>, <bpt>, <x/>, ...), and Element.text only returns the text BEFORE
    the first child element, silently truncating such segments. itertext()
    gathers all nested text, dropping only the tags themselves.
    """
    tree = ET.parse(xliff_path)
    root = tree.getroot()
    # XLIFF 1.2 puts every element in this namespace; findall/find need it.
    ns = {'xliff': 'urn:oasis:names:tc:xliff:document:1.2'}

    def _flatten(elem):
        # Full text content of the element, or '' when the element is absent.
        return ''.join(elem.itertext()) if elem is not None else ''

    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Source', 'Target', 'ID'])
        for unit in root.findall('.//xliff:trans-unit', ns):
            source = unit.find('xliff:source', ns)
            target = unit.find('xliff:target', ns)
            writer.writerow([
                _flatten(source),
                _flatten(target),
                unit.get('id', '')
            ])
# Batch process all XLIFF files in the current working directory,
# writing a sibling .csv next to each one.
# NOTE(review): str.replace swaps EVERY '.xliff' occurrence in the name,
# not just the extension — fine for ordinary names, but something like
# 'a.xliff.xliff' would be mangled; Path.with_suffix would be stricter.
for xliff in glob.glob('*.xliff'):
    xliff_to_csv(xliff, xliff.replace('.xliff', '.csv'))
3. Terminology Consistency Checker
Find inconsistent translations of key terms:
from collections import defaultdict
import csv
def check_consistency(csv_path, terms_to_check):
    """Report key terms that were translated more than one way.

    Reads a CSV with 'Source' and 'Target' columns. For each term whose
    text occurs (case-insensitively, as a substring) in a source segment,
    the distinct target segments are collected; any term with two or more
    distinct translations is printed. Returns nothing.
    """
    seen = defaultdict(set)
    with open(csv_path, 'r', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            src_lower = record['Source'].lower()
            for term in terms_to_check:
                if term.lower() in src_lower:
                    seen[term].add(record['Target'])
    # Only terms with multiple distinct translations are worth flagging.
    for term, variants in seen.items():
        if len(variants) < 2:
            continue
        print(f"'{term}' has {len(variants)} translations:")
        for variant in variants:
            print(f" - {variant}")
# Usage: list the terms whose source-side occurrences should map to one
# consistent translation. Results are printed; nothing is returned.
terms = ['machine learning', 'neural network', 'training data']
check_consistency('translations.csv', terms)
4. Word Count by File Type
Get accurate word counts across multiple file formats:
import os
import re
from pathlib import Path
def count_words(text):
    """Return the number of word tokens (\\w+ runs) in *text*."""
    return len(re.findall(r'\b\w+\b', text))

def analyze_folder(folder_path):
    """Word-count every .txt/.md/.html file under *folder_path*, recursively.

    Prints the grand total and a per-file breakdown sorted by descending
    count, and returns a {path: word_count} dict.

    Fix over the naive version: rglob('*') also yields directories, so a
    directory named e.g. 'notes.txt' would crash the open() call — guard
    with is_file(). Files are read as UTF-8; a differently-encoded file
    still raises UnicodeDecodeError (unchanged behaviour).
    """
    results = {}
    for filepath in Path(folder_path).rglob('*'):
        if filepath.suffix in ('.txt', '.md', '.html') and filepath.is_file():
            results[str(filepath)] = count_words(
                filepath.read_text(encoding='utf-8'))
    total = sum(results.values())
    print(f"Total words: {total:,}")
    for path, count in sorted(results.items(), key=lambda x: -x[1]):
        print(f" {count:,} - {path}")
    return results
# Usage: recursively word-counts the .txt/.md/.html files under ./documents.
analyze_folder('./documents')
5. MT Output Comparator
Compare outputs from different MT engines:
import csv
from difflib import SequenceMatcher
def compare_mt_outputs(file1, file2, output_file):
    """Write a side-by-side comparison CSV for two MT engines' outputs.

    Both inputs are CSVs with 'Source' and 'Target' columns. Rows are
    paired positionally (zip stops at the shorter file, so trailing extra
    rows are ignored). Similarity is difflib's SequenceMatcher ratio of
    the two target strings, formatted as a percentage.
    """
    with open(file1, 'r', encoding='utf-8') as left, \
         open(file2, 'r', encoding='utf-8') as right, \
         open(output_file, 'w', newline='', encoding='utf-8') as sink:
        report = csv.writer(sink)
        report.writerow(['Source', 'MT1', 'MT2', 'Similarity'])
        for first, second in zip(csv.DictReader(left), csv.DictReader(right)):
            mt1, mt2 = first['Target'], second['Target']
            score = SequenceMatcher(None, mt1, mt2).ratio()
            report.writerow([first['Source'], mt1, mt2, f"{score:.2%}"])
# Usage: both input CSVs need 'Source' and 'Target' columns, and rows are
# paired by position — the files should be parallel (same segment order).
compare_mt_outputs('deepl_output.csv', 'google_output.csv', 'comparison.csv')
Bonus: Making Scripts Accessible
I keep all these scripts in a dedicated folder and add that folder to my PATH. On macOS/Linux, each script also needs a shebang line (`#!/usr/bin/env python3`) and execute permission (`chmod +x`); on Windows, the py launcher’s file associations do the job. That way, I can run them from anywhere:
# Instead of:
python /path/to/scripts/count_tmx.py file.tmx
# I can just run:
count_tmx file.tmx
What’s in Your Toolkit?
These are just a few examples—I have dozens more for specific tasks. The key is building your personal library over time.
Every time you find yourself doing a repetitive task, ask: “Could I script this?” Usually, the answer is yes, and future-you will be grateful.
Want the complete scripts with error handling and CLI arguments? Drop me a line and I’ll share my full toolkit.