forked from saturndec/waoowaoo
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_chinese.py
More file actions
91 lines (77 loc) · 3.09 KB
/
extract_chinese.py
File metadata and controls
91 lines (77 loc) · 3.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python3
"""
Extract hard-coded Chinese strings from React/TypeScript source code.
"""
import re
import os
from pathlib import Path
import json
def extract_chinese_strings(file_path):
    """Extract hard-coded Chinese strings from a single source file.

    The file is read as UTF-8 and scanned with four regular expressions
    aimed at JSX/TSX code: a quoted string wrapped in braces, text between
    ``>`` and ``<``, a fixed set of attributes (``placeholder``, ``title``,
    ...), and any quoted value following ``=``.  A capture is reported only
    if it contains at least one character in the range U+4E00-U+9FFF.

    Args:
        file_path: Path of the file to scan.

    Returns:
        A list of dicts with the keys ``'text'`` (the capture with
        surrounding whitespace stripped), ``'line'`` (1-based number of the
        line on which that text starts) and ``'category'`` (always
        ``'unknown'``).  Entries are de-duplicated on ``(text, line)`` and
        kept in first-seen order.  An empty list is returned when the file
        cannot be opened or is not valid UTF-8.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Best-effort scan: unreadable or non-UTF-8 files are skipped, while
        # unrelated exceptions (e.g. KeyboardInterrupt) still propagate.
        return []

    patterns = (
        # 1. A quoted string wrapped in JSX braces: {'...'} or {"..."}
        r'\{\s*[\'"]([^\'"\{\}]*[\u4e00-\u9fff]+[^\'"\{\}]*)[\'\"]\s*\}',
        # 2. Text between a closing '>' and the next '<'
        r'\>([^<\>]*[\u4e00-\u9fff]+[^<\>]*)\<',
        # 3. Selected attributes such as placeholder="..."
        r'(?:placeholder|title|alt|value|defaultValue|confirmText|cancelText|message)\s*=\s*[\'"]([^\'\"]*[\u4e00-\u9fff]+[^\'\"]*)[\'"]',
        # 4. Any quoted value after '=', e.g. a string default value
        r'=\s*[\'"]([^\'\"]*[\u4e00-\u9fff]+[^\'\"]*)[\'"]',
    )

    results = []
    seen = set()
    for pattern in patterns:
        for match in re.finditer(pattern, content):
            captured = match.group(1)
            chinese_text = captured.strip()
            if not chinese_text:
                continue

            # Work from the first non-blank character of the capture rather
            # than from the start of the whole match: the capture may begin
            # on a later line than the delimiter that precedes it.
            text_start = match.start(1) + len(captured) - len(captured.lstrip())
            line_begin = content.rfind('\n', 0, text_start) + 1
            line_end = content.find('\n', text_start)
            if line_end == -1:
                line_end = len(content)
            line = content[line_begin:line_end]
            line_num = content.count('\n', 0, line_begin) + 1

            # Skip text located after a `//` marker on its own line.
            # NOTE: any earlier `//` on the line counts, including one that
            # is part of a URL such as "http://".
            comment_at = line.find('//')
            if comment_at != -1 and comment_at < text_start - line_begin:
                continue

            # Several patterns can hit the same text on the same line
            # (pattern 4 overlaps pattern 3); keep only the first hit.
            key = (chinese_text, line_num)
            if key in seen:
                continue
            seen.add(key)
            results.append({
                'text': chinese_text,
                'line': line_num,
                'category': 'unknown',
            })

    return results
def scan_directory(base_path, exclude_patterns=('test-ui',)):
    """Scan a directory tree for hard-coded Chinese strings in TS/TSX files.

    Args:
        base_path: Root directory to walk.
        exclude_patterns: Directory names that are not descended into
            (matched with ``in`` against each directory name).  Directories
            whose name starts with ``.`` are always skipped.

    Returns:
        A dict mapping each scanned file's path, relative to ``base_path``,
        to the list returned by ``extract_chinese_strings`` for it.  Files
        with no findings are omitted.
    """
    all_findings = {}
    for root, dirs, files in os.walk(base_path):
        # Prune in place so os.walk does not descend into excluded
        # directories; sorting keeps the traversal order deterministic.
        dirs[:] = sorted(
            d for d in dirs
            if d not in exclude_patterns and not d.startswith('.')
        )
        for file in sorted(files):
            if not file.endswith(('.tsx', '.ts')):
                continue
            file_path = os.path.join(root, file)
            findings = extract_chinese_strings(file_path)
            if findings:
                all_findings[os.path.relpath(file_path, base_path)] = findings
    return all_findings
if __name__ == '__main__':
    base_dir = 'src'
    results = scan_directory(base_dir)

    # Print a per-file report (at most 10 entries per file) while tallying
    # the overall number of findings.
    total = 0
    for rel_path, entries in sorted(results.items()):
        if not entries:
            continue
        count = len(entries)
        print(f"\n## {rel_path} ({count} strings)")
        for entry in entries[:10]:  # show only the first 10
            print(f" Line {entry['line']}: {entry['text'][:60]}")
        total += count
        if count > 10:
            print(f" ... and {count - 10} more")

    # Summary line: number of files with findings and total findings.
    print(f"\n\n总计: {len(results)} 个文件, {total} 处硬编码中文")