#!/usr/bin/python
import popen2
import re
def getpage(url):
    """Download *url* with the external ``wget`` program.

    Parameters:
        url: the address to fetch; it is handed to wget as one argument.

    Returns:
        Everything wget wrote to stdout and stderr, as a single string.
        With ``--quiet`` that is normally just the document body.

    Raises:
        OSError: if the ``wget`` executable cannot be started.
    """
    # Imported here because the module header only pulls in the deprecated
    # popen2 module, which this function no longer needs.
    import subprocess

    # An argument list (no shell) passes the URL to wget verbatim, so
    # characters such as '"', '$' or '`' in a scraped link can neither break
    # the command line nor execute anything.
    proc = subprocess.Popen(
        ['wget', '--quiet', '-O', '-', url],
        stdout=subprocess.PIPE,
        # Keep stderr merged into stdout, as popen2.Popen4 did.
        stderr=subprocess.STDOUT
    )
    # communicate() drains the pipe while waiting for the child.  Calling
    # wait() before reading can deadlock: once the page is larger than the
    # pipe buffer, wget blocks on write and never exits.
    output = proc.communicate()[0]
    return output
# Compiled regular expressions used by the scraper, keyed by purpose.
# NOTE(review): several of the page-scraping patterns below look as if HTML
# markup was stripped out of them (no tags, no capture groups, a literal broken
# across lines).  Restore them from a pristine copy -- TODO confirm.
res={
# Matches the "(... previous ...) (next ...)" pager text of a category page.
# NOTE(review): this pattern has no capture group, but the crawl loop calls
# group(1) on its match, which would raise -- presumably the href group was lost.
'nextlink':re.compile(r'\(.*?previous.*?\) \(next .*?\)'),
# Meant to isolate the part of the page that holds the article links; the code
# that uses it reads group(1).
# NOTE(review): the literal below spans three physical lines, which a
# single-quoted string cannot do, so the module does not parse as written.
'links-section':re.compile(r'
.*?
(.*?)', re.DOTALL),
# Applied with findall() to the links section to collect link targets.
# NOTE(review): as written, '.*?' matches the empty string at every position,
# so findall() can only return empty strings -- the real pattern is missing.
'link':re.compile(r'.*?'),
# Grabs the album infobox template from raw wikitext: from "{{Infobox Album"
# up to the first "}}" (DOTALL, so it may span lines).  Because the match is
# lazy, a nested template's "}}" ends the match early.
'infobox':re.compile(r'{{Infobox Album.*?}}', re.DOTALL),
# One pattern per infobox field; group(1) is the text following "<field> =".
# Note the 'Release' key reads the field spelled "Released".
# NOTE(review): '[\n.*?\||}]' is a character class, so the capture stops at the
# first newline, '.', '*', '?', '|' or '}' -- a value such as "Dr. Dre" is cut
# at the dot.  An alternation was presumably intended; confirm before changing.
'infoboxdata':{
'Name':re.compile(r'Name\s*=\s*(.*?)[\n.*?\||}]', re.DOTALL),
'Artist':re.compile(r'Artist\s*=\s*(.*?)[\n.*?\||}]', re.DOTALL),
'Release':re.compile(r'Released\s*=\s*(.*?)[\n.*?\||}]', re.DOTALL),
}
}
# Walk the "Upcoming albums" category listing page by page and collect every
# link found in each page's links section.  `page` holds the site-relative URL
# of the listing page to fetch next; the loop ends when no "next" link is found.
links = []
page = '/wiki/Category:Upcoming_albums'
while page:
    # The next-page address is lifted out of HTML, where '&' is written as
    # '&amp;'.  Undo that escaping before using it as a URL.  (The previous
    # replace('&', '&') was a no-op.)
    page = page.replace('&amp;', '&')
    html = getpage('http://en.wikipedia.org' + page)

    # Work out the following page first; None terminates the loop.
    next_match = res['nextlink'].search(html)
    if next_match:
        page = next_match.group(1)
    else:
        page = None

    # Harvest the links from this page's listing section, if it has one.
    section_match = res['links-section'].search(html)
    if section_match:
        links.extend(res['link'].findall(section_match.group(1)))
# For every collected link, fetch the article's raw wikitext, locate its album
# infobox and pull out the Artist, Name and Release fields.  Each record is
# printed to stdout, written to ua.txt (three lines plus a blank separator)
# and appended to `releases`.
releases = []
# Compiled once up front instead of on every loop iteration.
wiki_path = re.compile(r'/wiki/(.*)')
f = open("ua.txt", "w")
try:
    for link in links:
        linkm = wiki_path.match(link)
        if not linkm:
            # Not an article path of the form /wiki/<title>; skip it.
            # (Single-argument print(...) behaves identically under Python 2.)
            print(link + ' rejected.')
            continue
        link = linkm.group(1)

        # action=raw returns the page's wikitext rather than rendered HTML.
        html = getpage('http://en.wikipedia.org/w/index.php?title=' + link + '&action=raw')
        infobox = res['infobox'].search(html)
        if not infobox:
            print("no infobox: " + link)
            print('')
            continue
        infobox = infobox.group(0)

        # Fields missing from the infobox stay as empty strings.
        data = {'Name': '', 'Artist': '', 'Release': ''}
        for field in res['infoboxdata']:
            fd = res['infoboxdata'][field].search(infobox)
            if fd:
                data[field] = fd.group(1).strip()

        print(data['Artist'])
        print(data['Name'])
        print(data['Release'])
        f.write(data['Artist'] + '\n')
        f.write(data['Name'] + '\n')
        f.write(data['Release'] + '\n')
        f.write('\n')
        print('')
        releases.append(data)
finally:
    # Close (and flush) the output file even if a fetch or a regex raises
    # part-way through the run.
    f.close()
# Disabled debug dump of the collected releases (keys match the data dicts).
#for item in releases:
# print item['Artist']
# print item['Name']
# print item['Release']
# print