# -*- coding: utf-8 -*-
# Copyright (C) 2009 Osama Khalid
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Please report bugs or help improving this program by contacting the
# author.
"""Report files that sit both in a category and in one of its subcategories.

The target category is taken from a ``--cat:<name>`` command-line argument
or, when none is given, asked for interactively.  Every duplicate found is
printed and appended to the ``./getdup`` record file.
"""

import wikipedia, okbot
import sys, time, webbrowser


class dup:
    """Collect the members of a category tree and record duplicated files."""

    def __init__(self):
        # Record file is opened in append mode so earlier runs are preserved.
        self.recordfile = open('./getdup', 'a')
        self.site = wikipedia.getSite(code='commons', fam='commons')
        # Maps a subcategory title to the list of file titles found in it.
        self.childkey = {}
        # File titles found directly in the target category.
        self.mainCatFiles = []
        # Titles of the target category's direct subcategories.
        self.subCats = []
        # Set by getTarget(); name of the category to scan, without the
        # "Category:" prefix.
        self.targetCategory = None

    def _getCategoryMembers(self, cmtitle, cmnamespace, cmcontinue):
        """Fetch one page of ``list=categorymembers`` results.

        Example of the equivalent request:
        http://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:User-created_public_domain_images&cmnamespace=6&cmcontinue=Shervin|&cmlimit=50&format=jsonfm

        cmtitle     -- full category title, including the "Category:" prefix
        cmnamespace -- namespace number as a string ('14' or '6' here)
        cmcontinue  -- continuation value sent as the ``cmcontinue`` parameter

        Returns a ``(titles, next_cmcontinue)`` tuple where ``titles`` is a
        list of unicode member titles and ``next_cmcontinue`` is the value
        found under ``query-continue``, or None when the response has none.
        """
        predata = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': cmtitle,
            'cmnamespace': cmnamespace,
            'cmcontinue': cmcontinue,
            'cmprop': 'title',
            'cmlimit': '500',
            'format': 'json',
        }

        # Repeat the request until okbot.getAPI returns something other
        # than None.
        # NOTE(review): there is no delay or retry limit here; whether
        # okbot.getAPI throttles on its own is not visible from this file.
        while True:
            data = okbot.getAPI(self.site, predata)
            if data is not None:
                break

        titles = [unicode(member['title'])
                  for member in data['query']['categorymembers']]

        if 'query-continue' in data:
            next_cmcontinue = \
                data['query-continue']['categorymembers']['cmcontinue']
        else:
            next_cmcontinue = None
        return titles, next_cmcontinue

    def getSubCats(self, cmcontinue):
        """Append one page of the target category's subcategories to
        self.subCats and return the next continuation value (None when the
        listing is finished)."""
        titles, next_cmcontinue = self._getCategoryMembers(
            'Category:' + self.targetCategory, '14', cmcontinue)
        self.subCats.extend(titles)
        return next_cmcontinue

    def getMainCatFiles(self, cmcontinue):
        """Append one page of the target category's files to
        self.mainCatFiles and return the next continuation value (None when
        the listing is finished)."""
        titles, next_cmcontinue = self._getCategoryMembers(
            'Category:' + self.targetCategory, '6', cmcontinue)
        self.mainCatFiles.extend(titles)
        return next_cmcontinue

    def getSubCatsFiles(self, subCategory, cmcontinue):
        """Append one page of subCategory's files to self.childkey.

        subCategory is used as the ``cmtitle`` as-is (the titles stored in
        self.subCats come straight from the API response).  Returns the next
        continuation value, or None when the page is empty or the listing is
        finished.
        """
        titles, next_cmcontinue = self._getCategoryMembers(
            subCategory, '6', cmcontinue)

        if not titles:
            wikipedia.output(subCategory + " is empty!")
            return None

        self.childkey.setdefault(subCategory, []).extend(titles)
        return next_cmcontinue

    def findDup(self):
        """Print and record every subcategory file that is also listed
        directly in the target category."""
        # Build the lookup set once instead of scanning the list per file.
        mainFiles = set(self.mainCatFiles)
        for cat, titles in self.childkey.items():
            for title in titles:
                if title not in mainFiles:
                    continue
                # Name the file in the message; without it the report
                # does not say which file is duplicated.
                print(title + " in " + cat + " is also in "
                      + self.targetCategory)
                self.recordfile.write(title.encode('utf-8') + '\n')
                #webbrowser.open("http://commons.wikimedia.org/wiki/"+title)

    def getTarget(self):
        """Set self.targetCategory from ``--cat:<name>`` or from a prompt.

        When several ``--cat:`` arguments are given the last one wins.  The
        user is prompted only when no such argument is present.
        """
        prefix = "--cat:"
        targets = [arg[len(prefix):] for arg in sys.argv
                   if arg.startswith(prefix)]
        if targets:
            self.targetCategory = targets[-1]
        else:
            self.targetCategory = raw_input(
                "You want to scan the category... ").strip()

    def run(self):
        """Scan the target category and its subcategories, then report
        duplicates.  The record file is closed even if scanning fails."""
        try:
            self.getTarget()
            self.recordfile.write(
                "Running on " + self.targetCategory + ":\n")

            # Each get* method handles one page and returns the value to
            # pass in for the next page; None ends the loop.
            print("Getting " + self.targetCategory + " subcategories...")
            cmcontinue = '!|'
            while cmcontinue is not None:
                cmcontinue = self.getSubCats(cmcontinue)

            print("Scanning " + self.targetCategory + " for files...")
            cmcontinue = '!|'
            while cmcontinue is not None:
                cmcontinue = self.getMainCatFiles(cmcontinue)

            for subCategory in self.subCats:
                print("Scanning " + subCategory + "...")
                cmcontinue = '!|'
                while cmcontinue is not None:
                    cmcontinue = self.getSubCatsFiles(subCategory, cmcontinue)

            self.findDup()
        finally:
            self.recordfile.close()


if __name__ == '__main__':
    try:
        bot = dup()
        bot.run()
    finally:
        wikipedia.stopme()