# -*- coding: utf-8 -*-
# Copyright (C) 2009 Osama Khalid
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Please report bugs or help improve this program by contacting the author.
#
import wikipedia, okbot
import sys, time, webbrowser
class dup:
    """Find files listed both in a Commons category and in its subcategories.

    The bot collects the direct subcategories of the target category, the
    files placed directly in the target category, and the files of every
    subcategory.  Each file that appears in a subcategory as well as in the
    target category is reported and appended to the ``./getdup`` record file.
    """

    def __init__(self):
        # Opened in append mode so that results of earlier runs are kept.
        self.recordfile = open('./getdup', 'a')
        self.site = wikipedia.getSite(code='commons', fam='commons')
        # Category title without the "Category:" prefix; set by getTarget().
        self.targetCategory = None
        # Maps a subcategory title to the list of file titles it contains.
        self.childkey = {}
        # File titles found directly in the target category.
        self.mainCatFiles = []
        # Titles of the direct subcategories of the target category.
        self.subCats = []

    def _queryMembers(self, cmtitle, cmnamespace, cmcontinue):
        """Fetch one batch of category members through ``okbot.getAPI``.

        Equivalent request, for reference:
        api.php?action=query&list=categorymembers&cmtitle=...&cmnamespace=...

        cmtitle     -- full category title, including the "Category:" prefix.
        cmnamespace -- namespace number as a string ('14' is used for
                       subcategories and '6' for files by the callers).
        cmcontinue  -- continuation token of the batch to fetch.

        Returns a ``(titles, next_cmcontinue)`` tuple; ``next_cmcontinue`` is
        None when the response carries no 'query-continue' entry.
        """
        predata = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': cmtitle,
            'cmnamespace': cmnamespace,
            'cmcontinue': cmcontinue,
            'cmprop': 'title',
            'cmlimit': '500',
            'format': 'json',
        }
        # The request is repeated until getAPI returns something other than
        # None (presumably None signals a failed request -- TODO confirm
        # against okbot).
        while True:
            data = okbot.getAPI(self.site, predata)
            if data is not None:
                break
        titles = [unicode(member['title'])
                  for member in data['query']['categorymembers']]
        # NOTE(review): 'query-continue' is the legacy MediaWiki continuation
        # format; confirm that okbot.getAPI still receives it from the server.
        if 'query-continue' in data:
            return titles, data['query-continue']['categorymembers']['cmcontinue']
        return titles, None

    def _fetchAll(self, fetch, *args):
        """Call ``fetch(*args, cmcontinue)`` until it returns None.

        The first call uses the '!|' start token; each later call receives
        the token returned by the previous one.
        """
        cmcontinue = '!|'
        while cmcontinue is not None:
            cmcontinue = fetch(*(args + (cmcontinue,)))

    def getSubCats(self, cmcontinue):
        """Append one batch of subcategory titles of the target to subCats.

        Returns the token for the next batch, or None when there is none.
        """
        titles, nextContinue = self._queryMembers(
            'Category:' + self.targetCategory, '14', cmcontinue)
        self.subCats.extend(titles)
        return nextContinue

    def getMainCatFiles(self, cmcontinue):
        """Append one batch of the target category's files to mainCatFiles.

        Returns the token for the next batch, or None when there is none.
        """
        titles, nextContinue = self._queryMembers(
            'Category:' + self.targetCategory, '6', cmcontinue)
        self.mainCatFiles.extend(titles)
        return nextContinue

    def getSubCatsFiles(self, subCategory, cmcontinue):
        """Record one batch of files of ``subCategory`` in childkey.

        subCategory -- full title of the subcategory (as stored in subCats).
        cmcontinue  -- continuation token of the batch to fetch.

        Returns the token for the next batch, or None when there is none or
        when the batch came back empty (in which case a notice is output and
        nothing is recorded).
        """
        titles, nextContinue = self._queryMembers(subCategory, '6', cmcontinue)
        if not titles:
            wikipedia.output(subCategory + " is empty!")
            return None
        self.childkey.setdefault(subCategory, []).extend(titles)
        return nextContinue

    def findDup(self):
        """Report every subcategory file that is also in the target category.

        Each hit is output to the console and written, UTF-8 encoded and one
        title per line, to the record file.
        """
        # A set gives constant-time membership tests for the inner loop.
        mainFiles = set(self.mainCatFiles)
        for cat, titles in self.childkey.items():
            for title in titles:
                if title not in mainFiles:
                    continue
                # The file title was missing from this message originally.
                wikipedia.output(title + " in " + cat + " is also in "
                                 + self.targetCategory)
                self.recordfile.write(title.encode('utf-8') + '\n')
                #webbrowser.open("http://commons.wikimedia.org/wiki/"+title)

    def getTarget(self):
        """Set targetCategory from a ``--cat:NAME`` argument or by prompting.

        When several ``--cat:`` arguments are given the last one wins.  The
        user is prompted only when no such argument is present (previously a
        ``for ... else`` without ``break`` made the prompt run every time).
        """
        prefix = "--cat:"
        requested = [arg[len(prefix):] for arg in sys.argv
                     if arg.startswith(prefix)]
        if requested:
            self.targetCategory = requested[-1]
        else:
            self.targetCategory = raw_input(
                "You want to scan the category... ").strip()

    def run(self):
        """Run the whole scan and close the record file afterwards."""
        try:
            self.getTarget()
            self.recordfile.write("Running on " + self.targetCategory + ":\n")

            wikipedia.output("Getting " + self.targetCategory
                             + " subcategories...")
            self._fetchAll(self.getSubCats)

            wikipedia.output("Scanning " + self.targetCategory
                             + " for files...")
            self._fetchAll(self.getMainCatFiles)

            for subCategory in self.subCats:
                wikipedia.output("Scanning " + subCategory + "...")
                self._fetchAll(self.getSubCatsFiles, subCategory)

            self.findDup()
        finally:
            # Close the record file even when a step above raises.
            self.recordfile.close()
if __name__ == '__main__':
    # Script entry point: build the bot and run one scan.  stopme() is called
    # whether or not the scan succeeds (presumably it tells the pywikipedia
    # framework that this bot has finished -- TODO confirm).
    try:
        dup().run()
    finally:
        wikipedia.stopme()