-
Notifications
You must be signed in to change notification settings - Fork 0
/
wp_articles_length.py
53 lines (42 loc) · 1.24 KB
/
wp_articles_length.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# wp_articles_length.py
#
# (C) 2011 Alchimista <alchimistawp@gmail.com>
#
# Distributed under the terms of the GNU GPL license.
#
import wikipedia
import pagegenerators
import re
def main():
def pageFromList(lx):
for i in lx:
yield wikipedia.Page(site, i)
site = wikipedia.getSite()
listPage = wikipedia.Page(site, u"Wikipédia:Wikipédia na Universidade/Verbetes")
text = listPage.get()
# Para criar a lista de artigos
lr = re.compile(u'\#(?P<links>.*?)\s*(?:\\n|\|)', re.I | re.M | re.U)
# Para obter a associação dos id's a cada artigo
lr2 = re.compile(u'\#(?P<links>.*?)\s*\|\s*ids\s*\=\s*(?P<id1>\d{8})\,\s*(?P<id2>\d{8})', re.I | re.M | re.U)
# Para obter a lista de id's
lr3 = re.compile(u'\#(?:.*)\s*\|\s*ids\s*\=\s*(?P<id1>\d{8})\,\s*(?P<id2>\d{8})', re.I | re.M | re.U)
links = lr.findall(text)
ids = lr3.findall(text)
ids = map(int, ids[0])
gen = pageFromList(links)
pages = pagegenerators.NamespaceFilterPageGenerator(gen, [0])
for i in pages:
print i
for n in i.getVersionHistory():
if n[0] in ids:
#tit = i.title()
print u"\n", i.title(), n[0], n[4], u"\n"
#dict[tit][n[0]] = n[4]
if __name__ == "__main__":
    # Script entry point: run main() and always call wikipedia.stopme()
    # afterwards, whether main() returned normally or raised.
    try:
        main()
    finally:
        wikipedia.stopme()