#!/usr/bin/python
# Scrape Wikipedia's "Upcoming albums" category and dump the artist, title
# and release date of each album that has an {{Infobox Album}} into ua.txt.

import popen2
import re


def getpage(url):
    # Fetch a URL by shelling out to wget and returning the page body.
    f = popen2.Popen4('wget --quiet -O - "' + url + '"')
    s = f.fromchild
    f.wait()
    return s.read()


# NOTE: the HTML fragments inside the first three patterns are a guess at the
# old category-page markup and may need adjusting against the live HTML.
res = {
    # Capture the href of the "(next 200)" paging link.
    'nextlink': re.compile(r'\(.*?previous.*?\) \(<a href="(.*?)"[^>]*>next .*?</a>\)'),
    # Capture the chunk of the page that lists the category members.
    'links-section': re.compile(r'<h2>.*?in category.*?</h2>.*?<ul>(.*?)</ul>', re.DOTALL),
    # Capture each article href from a list item.
    'link': re.compile(r'<li><a href="(.*?)"'),
    'infobox': re.compile(r'{{Infobox Album.*?}}', re.DOTALL),
    'infoboxdata': {
        # Each field value runs up to the next newline, "|" or "}".
        'Name': re.compile(r'Name\s*=\s*(.*?)[\n|}]', re.DOTALL),
        'Artist': re.compile(r'Artist\s*=\s*(.*?)[\n|}]', re.DOTALL),
        'Release': re.compile(r'Released\s*=\s*(.*?)[\n|}]', re.DOTALL),
    }
}

# Walk the category pages, collecting links to album articles.
links = []
page = '/wiki/Category:Upcoming_albums'
while page:
    page = page.replace('&amp;', '&')
#    print 'fetching http://en.wikipedia.org' + page
    html = getpage('http://en.wikipedia.org' + page)
    page = res['nextlink'].search(html)
    if page:
        page = page.group(1)
    linkssection = res['links-section'].search(html)
    if linkssection:
        linkssection = linkssection.group(1)
        links += res['link'].findall(linkssection)
#    page = ''
#print links

# Fetch the raw wikitext of each article and pull out the infobox fields.
releases = []
f = open("ua.txt", "w")
for link in links:
    linkm = re.compile(r'/wiki/(.*)').match(link)
    if not linkm:
        print link + ' rejected.'
        continue
    link = linkm.group(1)
    html = getpage('http://en.wikipedia.org/w/index.php?title=' + link + '&action=raw')
    infobox = res['infobox'].search(html)
    if infobox:
        infobox = infobox.group(0)
        data = {'Name': '', 'Artist': '', 'Release': ''}
        for field in res['infoboxdata']:
            fd = res['infoboxdata'][field].search(infobox)
            if fd:
                data[field] = fd.group(1).strip()
        print data['Artist']
        print data['Name']
        print data['Release']
        f.write(data['Artist'] + '\n')
        f.write(data['Name'] + '\n')
        f.write(data['Release'] + '\n')
        f.write('\n')
        print
        releases.append(data)
#        print releases
    else:
        print "no infobox: " + link
        print
f.close()

#for item in releases:
#    print item['Artist']
#    print item['Name']
#    print item['Release']
#    print
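
# For reference, getpage() could also be written with the standard-library
# urllib2 module instead of shelling out to wget via popen2. This is only a
# sketch; the User-Agent value is an arbitrary placeholder, not anything the
# site requires:
#
#     import urllib2
#     def getpage(url):
#         req = urllib2.Request(url, headers={'User-Agent': 'ua-scraper/0.1'})
#         return urllib2.urlopen(req).read()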