-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathconvert.py
executable file
·83 lines (63 loc) · 2.01 KB
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Usage:
# convert.py input_filename
# input_filename is a file of Wikipedia article titles, one title per line.
import logging
import re
import sys
import opencc
from pypinyin import lazy_pinyin
# Require at least 2 characters
_MINIMUM_LEN = 2
# Title suffixes that identify list-style pages ('列表' = "list",
# '对照表' = "comparison table"); titles ending in one of these are skipped.
_LIST_PAGE_ENDINGS = [
'列表',
'对照表',
]
# A progress message is logged each time the number of generated words
# reaches a multiple of this value.
_LOG_EVERY = 1000
# String placed between the pinyin syllables of a single word.
_PINYIN_SEPARATOR = '\''
# Matches strings consisting solely of code points U+4E00..U+9FA5.
_HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
# OpenCC converter built from the 't2s.json' configuration; every title is
# passed through it before filtering.
_TO_SIMPLIFIED_CHINESE = opencc.OpenCC('t2s.json')
# Per-syllable replacements applied to the output of lazy_pinyin; syllables
# without an entry are kept unchanged.
_PINYIN_FIXES = {
'n': 'en', # https://github.com/felixonmars/fcitx5-pinyin-zhwiki/issues/13
}
# Configure the root logger at import time so INFO-level progress messages
# are emitted.
logging.basicConfig(level=logging.INFO)
def is_good_title(title, previous_title=None):
    """Return True when *title* should be turned into a dictionary word.

    A title is rejected when any of the following holds:

    * it does not match ``_HANZI_RE``;
    * it is shorter than ``_MINIMUM_LEN`` characters;
    * it ends with one of ``_LIST_PAGE_ENDINGS`` (list pages);
    * *previous_title* is at least 4 characters long and *title* starts
      with it.

    Args:
        title: Candidate article title.
        previous_title: Title to compare against for the prefix rule, or
            ``None`` (or an empty string) to disable that rule.

    Returns:
        ``True`` if the title passes every filter, ``False`` otherwise.
    """
    # Character-set and minimum-length filters.
    if _HANZI_RE.match(title) is None or len(title) < _MINIMUM_LEN:
        return False

    # List pages are recognised purely by their suffix.
    list_suffixes = tuple(_LIST_PAGE_ENDINGS)
    if title.endswith(list_suffixes):
        return False

    # A title that merely extends a sufficiently long previous title is
    # dropped; short previous titles (< 4 characters) never trigger this.
    extends_previous = (
        bool(previous_title)
        and len(previous_title) >= 4
        and title.startswith(previous_title)
    )
    return not extends_previous
def log_count(count):
    """Log, at INFO level, how many words have been generated so far.

    Args:
        count: Number of words generated up to this point.
    """
    message = f'{count} words generated'
    logging.info(message)
def make_output(word, pinyin):
    """Format one output line: word, pinyin and a constant '0' field.

    Args:
        word: The word (article title) as a string.
        pinyin: The pinyin string for *word*.

    Returns:
        The three fields joined by tab characters, without a trailing
        newline.
    """
    separator = '\t'
    fields = (word, pinyin, '0')
    return separator.join(fields)
def main():
    """Convert a file of article titles into pinyin dictionary lines.

    The input path is taken from the first command-line argument; the file
    holds one title per line.  Each title is stripped, passed through
    ``_TO_SIMPLIFIED_CHINESE``, filtered with ``is_good_title`` and, when
    accepted, printed to stdout in the format produced by ``make_output``.
    Progress is logged every ``_LOG_EVERY`` generated words and once more
    after the whole file has been processed.

    Raises:
        SystemExit: If no input filename was supplied.
        OSError: If the input file cannot be opened.
    """
    if len(sys.argv) < 2:
        sys.exit(f'Usage: {sys.argv[0]} input_filename')

    previous_title = None
    result_count = 0
    # Decode explicitly as UTF-8: without it open() falls back to the
    # locale's preferred encoding, which may not be able to decode the
    # Chinese titles.
    with open(sys.argv[1], encoding='utf-8') as f:
        for line in f:
            title = _TO_SIMPLIFIED_CHINESE.convert(line.strip())
            if not is_good_title(title, previous_title):
                continue

            # Apply the per-syllable fixes, then join the syllables.
            pinyin = _PINYIN_SEPARATOR.join(
                _PINYIN_FIXES.get(item, item) for item in lazy_pinyin(title))
            # An output identical to the input means nothing was converted
            # (presumably lazy_pinyin returned the characters unchanged).
            if pinyin == title:
                logging.info(
                    f'Failed to convert to Pinyin. Ignoring: {pinyin}')
                continue

            print(make_output(title, pinyin))
            result_count += 1
            if result_count % _LOG_EVERY == 0:
                log_count(result_count)
            # NOTE(review): the original indentation was lost; this assumes
            # previous_title only advances for titles that were emitted --
            # confirm against the upstream file.
            previous_title = title
    log_count(result_count)


if __name__ == '__main__':
    main()