-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparse_wikis.py
94 lines (76 loc) · 2.84 KB
/
parse_wikis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""
A utiliscript to process data pulled from List of Wikipedias:
http://en.wikipedia.org/wiki/List_of_wikipedias
I just ran the following jQuery barf in a Chrome console:
var x = $('table.sortable tbody tr:has(td)');
var y = [];
for(var i=0; i<x.length; ++i) {
cur_list = [$(x[i]).find('td:nth-child(2)').text(),
$(x[i]).find('td:nth-child(2) a').attr('title'),
$(x[i]).find('td:nth-child(4)').text(),
$(x[i]).find('td:nth-child(5)').text(),
$(x[i]).find('td:nth-child(10)').text(), $(x[i]).find('td:nth-child(12)').text()]
if(cur_list[2]) { y.push(cur_list); }
}
JSON.stringify(y);
which results in a list of lists, where each inner list holds
[name, article title, language code, article count, active users, depth], e.g.:
[u'English', u'w:English language', u'en', u'4,234,378', u'129,657', u'763']
"""
import os
import sys
import json
import re
from pyquery import PyQuery
from wapiti import WapitiClient
# The path to the scraped JSON dump is the first command-line argument.
wikis_json_filename = sys.argv[1]
# Read inside a context manager so the file handle is closed promptly
# instead of being left open for the lifetime of the script.
with open(wikis_json_filename, 'rb') as wikis_file:
    wikis_json = wikis_file.read()
wikis_list = json.loads(wikis_json)
# Shared client that get_desc() uses to fetch article pages.
wc = WapitiClient('[email protected]')
# Used with .match() to pull the first sentence off the start of a paragraph:
# the shortest run of characters ending in a period that is followed (after
# optional whitespace) by an uppercase letter, '[', a newline, or the end of
# the text. The sentence is captured in the named group 'intro'.
RE_INTRO = re.compile(r'(?P<intro>.*?\.)\s*([A-Z\[\n]{1}|\Z)')
# Bracketed numeric footnote markers such as "[3]" or "[127]". Any number of
# digits is accepted so markers above 99 are stripped too.
RE_FN = re.compile(r'\[[0-9]+\]')
def parse_count(count_text):
    """Convert a comma-grouped count string such as '4,234,378' to an int.

    Commas are removed and surrounding whitespace is stripped before the
    conversion.

    Raises:
        AttributeError: if count_text has no ``replace`` method (e.g. None).
        ValueError: if the cleaned text is not a valid integer (e.g. '').
    """
    # The previous try/except only re-raised AttributeError unchanged, so the
    # bare expression is equivalent.
    return int(count_text.replace(',', '').strip())
def get_desc(article_name):
    """Return the first sentence of an English Wikipedia article, or None.

    The page for ``article_name`` is fetched through the module-level client
    ``wc`` and parsed with PyQuery. Candidate intro paragraphs are looked up
    with progressively broader selectors; the text of the first candidate is
    matched against RE_INTRO and numeric footnote markers (RE_FN) are removed
    from the captured sentence.

    Returns None when no paragraph is found or when the paragraph text does
    not match RE_INTRO.
    """
    response = wc.web_request_operation('http://en.wikipedia.org/wiki/' + article_name)
    # The first item of the result is handed to PyQuery (presumably the page
    # HTML -- confirm against the wapiti client).
    doc = PyQuery(response[0])

    # Fall back from the most specific selector to the most general one; each
    # later lookup only runs when the previous one came back empty.
    paragraphs = (doc('#toc').prevAll('p')
                  or doc('#toc').parent().prevAll('p')
                  or doc('div#content p'))
    if not paragraphs:
        return None

    first_paragraph_text = ''.join(paragraphs[0].itertext())
    intro_match = RE_INTRO.match(first_paragraph_text)
    if intro_match is None:
        return None

    # remove footnotes
    return RE_FN.sub('', intro_match.group('intro'))
# Build one cleaned-up record per scraped wiki row:
# [name, article_name, langcode, num_articles, num_active_users, depth, desc]
processed_list = []
for winfo in wikis_list:
    name, article_name, langcode, num_articles, num_active_users, depth = winfo
    # Scraped link titles can carry a 'w:' prefix (e.g. 'w:English language').
    if article_name.startswith('w:'):
        article_name = article_name[2:]
    desc = get_desc(article_name)
    if not desc:
        # Report the miss on stderr rather than dropping into an interactive
        # debugger, so the script can also run unattended. The record is
        # still emitted, with whatever falsy value get_desc returned.
        sys.stderr.write('warning: no description found for %r\n' % (article_name,))
    print('\n * ' + repr(desc))
    num_articles = parse_count(num_articles)
    num_active_users = parse_count(num_active_users)
    # Dashes are stripped from the depth column; an empty result counts as 0.
    depth = depth.replace('-', '').strip()
    depth = int(depth) if depth else 0
    processed_list.append([name, article_name, langcode,
                           num_articles, num_active_users, depth, desc])

# Sanity check: every input row produced exactly one output record.
assert len(processed_list) == len(wikis_list)
print(processed_list[0])
print(processed_list[-1])

# Write the result next to the input file.
out_dir = os.path.dirname(os.path.abspath(wikis_json_filename))
out_path = os.path.join(out_dir, 'processed_wikis.json')
# NOTE(review): binary mode with a str payload matches this script's
# Python 2 usage; a Python 3 port would need text mode here.
with open(out_path, 'wb') as f:
    f.write(json.dumps(processed_list, indent=2))