diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..273ac5b --- /dev/null +++ b/.gitignore @@ -0,0 +1,47 @@ +# datasets and models +datasets/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +ipython/.ipynb_checkpoints/ + +# Temporary text editor files +*~ + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +#lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Sphinx documentation +doc/_build/ +doc/.buildfile +*.toctree diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/bwds/__init__.py b/bwds/__init__.py new file mode 100644 index 0000000..94c6aeb --- /dev/null +++ b/bwds/__init__.py @@ -0,0 +1,5 @@ +from .about import (__name__, __version__, __author__, __author_email__, + __description__, __license__, __url__) + +__all__ = [__name__, __version__, __author__, __author_email__, + __description__, __license__, __url__] diff --git a/bwds/about.py b/bwds/about.py new file mode 100644 index 0000000..e8d798c --- /dev/null +++ b/bwds/about.py @@ -0,0 +1,8 @@ +__name__ = "bwds" +__version__ = "0.0.1" +__author__ = "Amir Sarabadani" +__author_email__ = "ladsgroup@gmail.com" +__description__ = "A library for performing automatic detection of the " + \ + "badwords added to Wikipedia articles" +__url__ = "https://github.com/wiki-ai/bwds" +__license__ = "MIT" diff --git a/bwds/bot.py b/bwds/bot.py new file mode 100644 index 0000000..7760c9c --- /dev/null +++ b/bwds/bot.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +# Copyright © 2014 He7d3r +# License: http://he7d3r.mit-license.org/ +""" +Extremely under construction.
+Some parts are copied from +https://gist.github.com/he7d3r/f99482f4f54f97895ccb/9205f3271fe8daa2f694f4ce3ba9b29213dbad6c +""" + +import re +import sys +import time + +import regex +from mw.lib import reverts +from nltk.tokenize import RegexpTokenizer + +import pywikibot +from bad_words_detection_system import Bot, Edit +from pywikibot import xmlreader + + +def page_info(dump, lang): + global tokenizer + c = 1 + di_old = [] + di = [] + nombre = '3,' if lang not in ['ja', 'zh'] else '1' + for entry in dump.parse(): + if entry.ns != '0': + continue + if c != entry.id: + if c != 1: + di_old = di[:] + di = [] + if entry.id and int(entry.id[-1]) == 0: + print('new page', entry.id) + di.append(entry) + else: + di.append(entry) + continue + c = entry.id + firstRev = True + history = {} + detector = reverts.Detector(radius=3) + for revision in di_old: + revision.text = re.sub( + r'\[\[(%s)\:' % '|'.join(languages_by_size), + '', + revision.text) + words = set() + if lang in chars: + token_pattern = r'[%s]{%s}' % (chars[lang], nombre) + tokenizer = RegexpTokenizer(token_pattern) + tokens = tokenizer.tokenize(revision.text) + else: + token_pattern = r'\p{alpha}+' + tokens = regex.findall(token_pattern, revision.text) + for w in tokens: + words.add(lower(w, lang)) + if firstRev: + prevIntersection = words + firstRev = False + added = words - prevIntersection + prevIntersection = words + history[revision.revisionid] = Edit( + revision.revisionid, added, False) + rev = detector.process(revision.text, + {'rev_id': revision.revisionid}) + if rev: + for reverted in rev.reverteds: + history[reverted['rev_id']].reverted = True + + yield history + + +def run(dumps): + number = 500000 + counter = 0 + start_time = time.time() + for casee in dumps: + lang = casee.split('/')[-1].split('wiki')[0] + dump = xmlreader.XmlDump(casee, True) + bot = Bot() + for case in page_info(dump, lang): + counter += 1 + if number and counter > number: + break + bot.parse_edits(case.values()) + bot.parse_bad_edits(250) + bot.dump() + print(time.time() - start_time) + site = pywikibot.Site('meta', fam='meta') + page = pywikibot.Page( + site, 'Research:Revision scoring as a service/Word lists/' + lang) + try: + text = page.get() + except pywikibot.NoPage: + text = ("{{Research:Revision scoring as a service/template/word list " + "data\n |lang=%s\n |gen=250\n |badwords=-\n |informal=-" + "\n |stopwords=-\n |dictionary=-\n |stemmer=-\n |contact=" + "\n |features=no\n |labels=requested\n |campaign=no\n " + "|needs=-\n |list-generated=\n |list-stop=\n}}\n" % lang) + except: + return False + new_text = text + if re.search(r'\|\s*?list\-generated\s*?\=\s*?', text): + if re.search(r'\|\s*?list\-generated\s*?\=\s*?(\||\}\})', text): + new_text = re.sub( + r'(\|\s*?list\-generated\s*?\=\s*?)(\||\}\})', + r'\1%s\2' % bot.bad_words_res_text, + new_text) + else: + new_text = re.sub( + r'\}\}', + r'|list-generated=%s\n}}' % bot.bad_words_res_text, + new_text) + if re.search(r'\|\s*?list\-stop\s*?\=\s*?', text): + if re.search(r'\|\s*?list\-stop\s*?\=\s*?(\||\}\})', text): + new_text = re.sub( + r'(\|\s*?list\-stop\s*?\=\s*?)(\||\}\})', + r'\1%s\2' % bot.stop_words_res_text, + new_text) + else: + new_text = re.sub( + r'\}\}', + r'|list-stop=%s\n}}' % bot.stop_words_res_text, + new_text) + if new_text != text: + page.text = new_text + page.save('Bot: update results') +if __name__ == "__main__": + dumps = sys.argv[1:] + run(dumps) diff --git a/bwds/bwds.py b/bwds/bwds.py new file mode 100644 index 0000000..71590e1 --- /dev/null +++ b/bwds/bwds.py @@ 
-0,0 +1,47 @@ +""" +This script provides access to a set of utilities for extracting features and +building edit quality predictors. + +* process_api -- Processes a sample of revisions using the API +* process_dump -- Processes an XML dump + +Usage: + bwds (-h | --help) + bwds <utility> [-h | --help] + +Options: + -h | --help Prints this documentation + <utility> The name of the utility to run +""" +import sys +import traceback +from importlib import import_module + + +USAGE = """Usage: + bwds (-h | --help) + bwds <utility> [-h | --help]\n""" + + +def main(): + + if len(sys.argv) < 2: + sys.stderr.write(USAGE) + sys.exit(1) + elif sys.argv[1] in ("-h", "--help"): + sys.stderr.write(__doc__ + "\n") + sys.exit(1) + elif sys.argv[1][:1] == "-": + sys.stderr.write(USAGE) + sys.exit(1) + + module_name = sys.argv[1] + try: + module = import_module(".utilities." + module_name, + package="bwds") + except ImportError: + sys.stderr.write(traceback.format_exc()) + sys.stderr.write("Could not load utility {0}.\n".format(module_name)) + sys.exit(1) + + module.main(sys.argv[2:]) diff --git a/bwds/utilities/process_api.py b/bwds/utilities/process_api.py new file mode 100644 index 0000000..c5a5213 --- /dev/null +++ b/bwds/utilities/process_api.py @@ -0,0 +1,216 @@ +""" +Generate a datafile of potential badwords, likely stopwords, and other +features of a set of input revisions by comparing words added in edits +that are reverted with words added in edits that are not reverted. Assumes +that revisions have a wikitext content model. + +Produces a json BLOB file with a few fields: + - badwords -- A list of potential bad words (most common to reverted edits) + - stopwords -- A list of potential stopwords (most common overall) + +:Usage: + process_api -h | --help + process_api --host=<url> + [--word-limit=<num>] + [--token-type=<type>] + [--lang=<lang>] + [--norm-lower] + [--norm-derepeat] + [--norm-de1337] + [--norm-stem=<lang>] + [--grams=<num>] + [--processes=<num>] + [--input=<path>] + [--output=<path>] + [--verbose] + [--debug] + +:Options: + --host=<url> The host URL of the MediaWiki install where an API + can be found. + --word-limit=<num> Limit the number of words output in word lists to + this number [default: 1000] + --token-type=<type> Limit the tokens processed to this type + [default: word] + --lang=<lang> Limit tokens to those with at least one character + from the appropriate alphabet for <lang> + --norm-stem=<lang> If set, use the stemmer for <lang> on words before + processing + --norm-lower Use `lower()` to normalize all words before + processing + --norm-derepeat Singularize repeated characters within the word + before processing + --grams=<num> Produce ngrams of words of this length + [default: 1] + --processes=<num> The number of parallel processes to start for + processing edits. [default: <cpu_count>] + --input=<path> The path to a file containing rev_ids to process + [default: <stdin>] + --output=<path> The path to write output to [default: <stdout>] + --verbose Prints dots and stuff to <stderr> + --debug Prints debug logs to stderr + +""" +import json +import logging +import sys +from collections import defaultdict +from itertools import islice +from multiprocessing import cpu_count + +import docopt +import mwapi +import para +from nltk.stem.snowball import SnowballStemmer +from revscoring.dependencies import draw +from revscoring.extractors import api + +from . import util + +logger = logging.getLogger(__name__) + + +def main(argv=None): + args = docopt.docopt(__doc__, argv=argv) + + logging.basicConfig( + format='%(asctime)s %(levelname)s:%(name)s -- %(message)s' + ) + logger.setLevel(logging.DEBUG if args['--debug'] else logging.WARNING) + + api_host = args['--host'] + + if args['--input'] == "<stdin>": + revisions = (json.loads(line) for line in sys.stdin) + else: + revisions = (json.loads(line) for line in open(args['--input'])) + + if args['--output'] == "<stdout>": + wordstats_f = sys.stdout + else: + wordstats_f = open(args['--output'], 'w') + + word_limit = int(args['--word-limit']) + token_type = args['--token-type'] + lang_code = args['--lang'] + norm_lower = bool(args['--norm-lower']) + norm_derepeat = bool(args['--norm-derepeat']) + norm_de1337 = bool(args['--norm-de1337']) + if args['--norm-stem'] is not None: + stemmer = SnowballStemmer(args['--norm-stem']) + else: + stemmer = None + + grams = int(args['--grams']) + + if args['--processes'] == "<cpu_count>": + processes = cpu_count() + else: + processes = int(args['--processes']) + + verbose = bool(args['--verbose']) + run(api_host, revisions, wordstats_f, word_limit, token_type, + lang_code, norm_lower, norm_derepeat, norm_de1337, stemmer, grams, + processes, verbose) + + +def run(api_host, revisions, wordstats_f, word_limit, token_type, lang_code, + norm_lower, norm_derepeat, norm_de1337, stemmer, grams, processes, + verbose): + + # Construct our API session + session = mwapi.Session( + api_host, user_agent="wiki-ai/editquality -- bwds script") + extractor = api.Extractor(session) + + # Construct the revision processor + process_revisions = revision_processor( + extractor, token_type, lang_code, norm_lower, norm_derepeat, + norm_de1337, stemmer, grams) + + # Construct dictionaries for tracking frequencies and mappings between + # processed revisions + bad_grams = defaultdict(int) + grams = defaultdict(int) + norms = defaultdict(set) + + logging.info("Processing revisions") + revision_chunks = chunk(revisions, 50) + revision_delta_norms = para.map(process_revisions, revision_chunks, + mappers=processes) + + for rev, freq_delta, norm_map in revision_delta_norms: + if rev['reverted_for_damage']: + logger.debug( + str(rev['rev_id']) + + " was reverted for damage and added the following words " + + str([str(k[0]) for k in freq_delta.keys()])) + for gram, delta in freq_delta.items(): + if rev['reverted_for_damage']: + bad_grams[gram] += delta + grams[gram] += delta + + for gram, originals in norm_map.items(): + for o in originals: + norms[gram].add(o) + + logging.info("Sorting grams by badness and frequency") + gram_badness = (((freq + 1) / (grams[gram] + 10), gram) + for gram, freq in bad_grams.items()) + limited_badword_grams = \ + islice(sorted(gram_badness, reverse=True), word_limit) + + gram_freq = ((freq, gram) for gram, freq in grams.items()) + limited_stop_grams = islice(sorted(gram_freq, reverse=True), word_limit) + + badwords = [{'gram': list(gram), + 'originals': [list(g) for g in norms[gram]], + 'badness': round(badness, 3)} + for badness, gram in limited_badword_grams] + stopwords = [{'gram': list(gram), + 'originals': [list(g) for g in norms[gram]], + 'freq': freq} + for freq, gram in limited_stop_grams] + + json.dump({'badwords': badwords, 'stopwords': stopwords}, wordstats_f, + indent=2) + + +def revision_processor(extractor, token_type, lang_code, norm_lower, + norm_derepeat, norm_de1337, stemmer, grams): + + orig_r_grams, norm_r_grams, norm_gram_delta = \
util.build_token_gram_types(token_type, lang_code, norm_lower, + norm_derepeat, norm_de1337, stemmer, grams) + logger.info("Normalized grams: ") + logger.info(draw(norm_r_grams)) + + def _process_revisions(revisions): + rev_ids = [r['rev_id'] for r in revisions] + + error_values = extractor.extract( + rev_ids, [orig_r_grams, norm_r_grams, norm_gram_delta], + context=util.solving_context) + + for (error, values), revision in zip(error_values, revisions): + if error is None: + orig_r, norm_r, gram_delta = values + norm_map = defaultdict(set) + for original, gram in zip(orig_r, norm_r): + norm_map[gram].add(original) + + yield revision, gram_delta, norm_map + else: + logger.warning("{0} while solving for {1}" + .format(error, revision['rev_id'])) + + return _process_revisions + + +def chunk(iterable, size): + while True: + batch = list(islice(iterable, size)) + if len(batch) == 0: + break + else: + yield batch diff --git a/bad_words_detection_system.py b/bwds/utilities/process_dump.py similarity index 100% rename from bad_words_detection_system.py rename to bwds/utilities/process_dump.py diff --git a/bwds/utilities/util.py b/bwds/utilities/util.py new file mode 100644 index 0000000..1f549b3 --- /dev/null +++ b/bwds/utilities/util.py @@ -0,0 +1,66 @@ +import logging + +from revscoring.datasources import Datasource, revision_oriented +from revscoring.datasources.meta import filters, frequencies, gramming, mappers +from revscoring.features import wikitext + +from ..wikitext import strip_interwikilink_prefixes, token_contains_lang + +logger = logging.getLogger(__name__) + + +def build_token_gram_types(token_type, lang_code, lower, derepeat, de1337, + stemmer, grams): + orig_r_tokens = wikitext.revision.datasources.tokens_in_types({token_type}) + orig_p_tokens = \ + wikitext.revision.parent.datasources.tokens_in_types({token_type}) + + norm_r_tokens = orig_r_tokens + norm_p_tokens = orig_p_tokens + + if lang_code is not None: + logger.info("Filtering tokens that do not contain {0}" + .format(lang_code)) + norm_r_tokens = filters.filter( + lambda t: token_contains_lang(t, lang_code), norm_r_tokens) + norm_p_tokens = filters.filter( + lambda t: token_contains_lang(t, lang_code), norm_p_tokens) + if lower: + norm_r_tokens = mappers.lower_case(norm_r_tokens) + norm_p_tokens = mappers.lower_case(norm_p_tokens) + if de1337: + norm_r_tokens = mappers.de1337(norm_r_tokens) + norm_p_tokens = mappers.de1337(norm_p_tokens) + if stemmer: + norm_r_tokens = mappers.map(stemmer.stem, norm_r_tokens) + norm_p_tokens = mappers.map(stemmer.stem, norm_p_tokens) + if derepeat: + norm_r_tokens = mappers.derepeat(norm_r_tokens) + norm_p_tokens = mappers.derepeat(norm_p_tokens) + + orig_r_grams = gramming.gram(orig_r_tokens, grams=[tuple(range(0, grams))]) + # p_grams = gramming.gram(p_tokens) + norm_r_grams = gramming.gram(norm_r_tokens, grams=[tuple(range(0, grams))]) + norm_p_grams = gramming.gram(norm_p_tokens, grams=[tuple(range(0, grams))]) + + norm_r_gram_table = frequencies.table(norm_r_grams) + norm_p_gram_table = frequencies.table(norm_p_grams) + + norm_gram_delta = frequencies.positive( + frequencies.delta(norm_p_gram_table, norm_r_gram_table)) + + return orig_r_grams, norm_r_grams, norm_gram_delta + + +revision_tokens = Datasource( + str(wikitext.revision.tokens), + lambda t: wikitext.revision.tokens.process( + strip_interwikilink_prefixes(t)), + depends_on=[revision_oriented.revision.text]) +parent_tokens = Datasource( + str(wikitext.revision.tokens), + lambda t: wikitext.revision.parent.tokens.process( + 
strip_interwikilink_prefixes(t)), + depends_on=[revision_oriented.revision.parent.text]) + +solving_context = {revision_tokens, parent_tokens} diff --git a/bwds/wikitext.py b/bwds/wikitext.py new file mode 100644 index 0000000..5fd5f7f --- /dev/null +++ b/bwds/wikitext.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +# Copyright © 2014 He7d3r +# License: http://he7d3r.mit-license.org/ +""" +Extermely under construction. +Some parts are copied from +https://gist.github.com/he7d3r/f99482f4f54f97895ccb/9205f3271fe8daa2f694f4ce3ba9b29213dbad6c +""" +import re + +LANGUAGE_CODES = [ + 'en', 'sv', 'nl', 'de', 'fr', 'war', 'ru', 'ceb', 'it', 'es', 'vi', + 'pl', 'ja', 'pt', 'zh', 'uk', 'ca', 'fa', 'no', 'sh', 'fi', 'ar', + 'id', 'cs', 'sr', 'ro', 'ko', 'hu', 'ms', 'tr', 'min', 'eo', 'kk', + 'eu', 'sk', 'da', 'bg', 'he', 'lt', 'hy', 'hr', 'sl', 'et', 'uz', + 'gl', 'nn', 'vo', 'la', 'simple', 'el', 'hi', 'az', 'th', 'ka', + 'ce', 'oc', 'be', 'mk', 'mg', 'new', 'ur', 'tt', 'ta', 'pms', 'cy', + 'tl', 'lv', 'bs', 'te', 'be-tarask', 'br', 'ht', 'sq', 'jv', 'lb', + 'mr', 'is', 'ml', 'zh-yue', 'bn', 'af', 'ba', 'ga', 'pnb', 'cv', + 'fy', 'lmo', 'tg', 'sco', 'my', 'yo', 'an', 'ky', 'sw', 'io', 'ne', + 'gu', 'scn', 'bpy', 'nds', 'ku', 'ast', 'qu', 'als', 'su', 'pa', + 'kn', 'ckb', 'ia', 'mn', 'nap', 'bug', 'arz', 'bat-smg', 'wa', + 'zh-min-nan', 'am', 'map-bms', 'gd', 'yi', 'mzn', 'si', 'fo', + 'bar', 'vec', 'nah', 'sah', 'os', 'sa', 'roa-tara', 'li', 'hsb', + 'pam', 'mrj', 'mhr', 'se', 'mi', 'ilo', 'hif', 'bcl', 'gan', 'rue', + 'ps', 'glk', 'nds-nl', 'bo', 'vls', 'diq', 'fiu-vro', 'bh', 'xmf', + 'tk', 'gv', 'sc', 'co', 'csb', 'hak', 'km', 'kv', 'vep', 'zea', + 'crh', 'zh-classical', 'frr', 'eml', 'ay', 'stq', 'udm', 'wuu', + 'nrm', 'kw', 'rm', 'szl', 'so', 'koi', 'as', 'lad', 'fur', 'mt', + 'dv', 'gn', 'dsb', 'ie', 'pcd', 'sd', 'lij', 'cbk-zam', 'cdo', + 'ksh', 'ext', 'mwl', 'gag', 'ang', 'ug', 'ace', 'pi', 'pag', 'nv', + 'lez', 'frp', 'sn', 'kab', 'ln', 'myv', 'pfl', 'xal', 'krc', 'haw', + 'rw', 'pdc', 'kaa', 'to', 'kl', 'arc', 'nov', 'kbd', 'av', 'bxr', + 'lo', 'bjn', 'ha', 'tet', 'tpi', 'na', 'pap', 'lbe', 'jbo', 'ty', + 'mdf', 'roa-rup', 'wo', 'tyv', 'ig', 'srn', 'nso', 'kg', 'ab', + 'ltg', 'zu', 'om', 'za', 'chy', 'cu', 'rmy', 'tw', 'tn', 'chr', + 'mai', 'pih', 'got', 'xh', 'bi', 'sm', 'ss', 'rn', 'ki', 'pnt', + 'bm', 'iu', 'ee', 'lg', 'ts', 'fj', 'ak', 'ik', 'st', 'sg', 'ff', + 'dz', 'ny', 'ch', 'ti', 've', 'ks', 'tum', 'cr', 'gom', 'lrc', + 'azb', 'or' +] + + +def strip_interwikilink_prefixes(text): + return re.sub(r'\[\[\:?' 
+ r'|'.join(LANGUAGE_CODES) + r'\:', '', + text) + +ALPHABETS = {lang: re.compile(r'[' + alpha + r']') + for lang, alpha in { + 'az': r'A-Za-zÇçƏəĞğıİÖöŞşÜü', + 'af': r'A-Za-züûöôïîëêè', + 'ar': r'غظضذخثتشرقصفعسنملكيطحزوهدجبا', + 'cs': r'A-Za-zÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž', + 'de': r'A-Za-zÄäÖöÜüß', + 'en': r'A-Za-z', + 'es': r'A-Za-zÑñéÉüÜóÓ', + 'et': r'A-Za-zŠšŽžÕõÄäÖöÜü', + 'fa': r'ابپتثجچحخدذرزژسشصآضطظعغفقکگلمنوهی‌يك', + 'fr': r'A-Za-zÀàÂâÆæÄäÇçÉéÈèÊêËëÎîÏïÔôŒœÖöÙùÛûÜüŸÿ', + 'hi': r'कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहळक्षज्ञ:अपआपाइपिईपीउपुऊपूऋपृॠप' + r'ॄऌपॢॡपॣएपेऐपैओपोऔपौअंपंअःपः', + 'he': r'למנסעפצקרשתםןףץאבגדהוזחטיכך', + 'hu': r'A-Za-zËëÉéÓóÖöŐőÚúÜüŰűÁá', + 'hy': r'ԱաԲբԳգԴդԵեԶզԷէԸըԹթԺժԻիԼլԽխԾծԿկՀհՁձՂղՃճՄմՅյՆնՇշՈոՉչՊպՋջՌռՍսՎվՏտՐր' + r'ՑցՈՒՈւուՒւՓփՔքևևՕօՖֆ', + 'id': r'A-Za-z', + 'ja': r'\u3000-\u303F' # Japanese punctuation + r'\u3040-\u309F' # Hiragana + r'\u30A0-\u30FF' # Katakana + r'\uFF00-\uFFEF' # Roman characters and half-width katakana + r'\u4E00-\u9FCC' # Unified Ideographs + r'\u3400-\u4DFF', # Unified Ideographs Ext A + 'ko': r'\uAC00-\uD7AF' # Hangul Syllables + r'\u1100-\u11FF' # Hangul Jamo + r'\u3130-\u318F' # Hangul Compatibility Jamo + r'\u3200-\u32FF' # Enclosed CJK Letters and Months + r'\uA960-\uA97F' # Hangul Jamo Extended-A + r'\uD7B0-\uD7FF' # Hangul Jamo Extended-B + r'\uFF00-\uFFEF', # Halfwidth and Fullwidth Forms + 'no': r'A-Za-zÆØÅæøåéèêóòâôüáàé', + 'pl': r'AaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż', + 'pt': r'A-Za-záàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ', + 'sv': r'A-Za-zÅÄÖåäö', + 'ta': r'௰௱௲௳௴௵௶௷௸௹௺ௗௐொோௌ்ெேைீுூாிரறலளழவஶஷஸஹணதநனபம' + r'யஐஒஓஔகஙசஜஞடஂஃஅஆஇஈஉஊஎஏ', + 'tr': r'A-Za-zÇĞİÖŞÜçğıöşüâîûÂÎÛ', + 'uk': r'АаБбВвГ㥴ДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЬ' + r'ьЮюЯя', + 'ur': r'ابپتٹثجچحخدڈذرڑزژسشصضطظعغفقکگلمنوهھءیےٹڈڑ‌آّْیٰوَُِٗ', + 'uz': r'A-Za-zʻ', + 'vi': r'AaĂăÂâBbCcDdĐđEeÊêGgHhIiKkLlMmNnOoÔôƠơPpQqRrSsTtUuƯưVvXxYy', + 'zh': r'\u4E00-\u9FCC' # Unified Ideographs + r'\u3400-\u4DFF' # Unified Ideographs Ext A + r'\U00020000-\U0002A6DF' # Unified Ideographs Ext. B + r'\uF900-\uFAFF' # Compatibility Ideographs + r'\U0002F800-\U0002FA1F' # Compatibility Ideographs Suppl. +}.items()} + + +def token_contains_lang(token, lang_code): + return ALPHABETS[lang_code].search(token) is not None diff --git a/datasets/.gitkeep b/datasets/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/dump_based_detection.py b/dump_based_detection.py deleted file mode 100644 index d1ccfd9..0000000 --- a/dump_based_detection.py +++ /dev/null @@ -1,224 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -# Copyright © 2014 He7d3r -# License: http://he7d3r.mit-license.org/ -""" -Extermely under construction. 
-Some parts are copied from -https://gist.github.com/he7d3r/f99482f4f54f97895ccb/9205f3271fe8daa2f694f4ce3ba9b29213dbad6c -""" -from nltk.tokenize import RegexpTokenizer -import sys -from mw.lib import reverts -from pywikibot import xmlreader -import pywikibot -import re -import time -import regex - -from bad_words_detection_system import Edit, Bot - -cache = {} - -languages_by_size = [ - 'en', 'sv', 'nl', 'de', 'fr', 'war', 'ru', 'ceb', 'it', 'es', 'vi', - 'pl', 'ja', 'pt', 'zh', 'uk', 'ca', 'fa', 'no', 'sh', 'fi', 'ar', - 'id', 'cs', 'sr', 'ro', 'ko', 'hu', 'ms', 'tr', 'min', 'eo', 'kk', - 'eu', 'sk', 'da', 'bg', 'he', 'lt', 'hy', 'hr', 'sl', 'et', 'uz', - 'gl', 'nn', 'vo', 'la', 'simple', 'el', 'hi', 'az', 'th', 'ka', - 'ce', 'oc', 'be', 'mk', 'mg', 'new', 'ur', 'tt', 'ta', 'pms', 'cy', - 'tl', 'lv', 'bs', 'te', 'be-tarask', 'br', 'ht', 'sq', 'jv', 'lb', - 'mr', 'is', 'ml', 'zh-yue', 'bn', 'af', 'ba', 'ga', 'pnb', 'cv', - 'fy', 'lmo', 'tg', 'sco', 'my', 'yo', 'an', 'ky', 'sw', 'io', 'ne', - 'gu', 'scn', 'bpy', 'nds', 'ku', 'ast', 'qu', 'als', 'su', 'pa', - 'kn', 'ckb', 'ia', 'mn', 'nap', 'bug', 'arz', 'bat-smg', 'wa', - 'zh-min-nan', 'am', 'map-bms', 'gd', 'yi', 'mzn', 'si', 'fo', - 'bar', 'vec', 'nah', 'sah', 'os', 'sa', 'roa-tara', 'li', 'hsb', - 'pam', 'mrj', 'mhr', 'se', 'mi', 'ilo', 'hif', 'bcl', 'gan', 'rue', - 'ps', 'glk', 'nds-nl', 'bo', 'vls', 'diq', 'fiu-vro', 'bh', 'xmf', - 'tk', 'gv', 'sc', 'co', 'csb', 'hak', 'km', 'kv', 'vep', 'zea', - 'crh', 'zh-classical', 'frr', 'eml', 'ay', 'stq', 'udm', 'wuu', - 'nrm', 'kw', 'rm', 'szl', 'so', 'koi', 'as', 'lad', 'fur', 'mt', - 'dv', 'gn', 'dsb', 'ie', 'pcd', 'sd', 'lij', 'cbk-zam', 'cdo', - 'ksh', 'ext', 'mwl', 'gag', 'ang', 'ug', 'ace', 'pi', 'pag', 'nv', - 'lez', 'frp', 'sn', 'kab', 'ln', 'myv', 'pfl', 'xal', 'krc', 'haw', - 'rw', 'pdc', 'kaa', 'to', 'kl', 'arc', 'nov', 'kbd', 'av', 'bxr', - 'lo', 'bjn', 'ha', 'tet', 'tpi', 'na', 'pap', 'lbe', 'jbo', 'ty', - 'mdf', 'roa-rup', 'wo', 'tyv', 'ig', 'srn', 'nso', 'kg', 'ab', - 'ltg', 'zu', 'om', 'za', 'chy', 'cu', 'rmy', 'tw', 'tn', 'chr', - 'mai', 'pih', 'got', 'xh', 'bi', 'sm', 'ss', 'rn', 'ki', 'pnt', - 'bm', 'iu', 'ee', 'lg', 'ts', 'fj', 'ak', 'ik', 'st', 'sg', 'ff', - 'dz', 'ny', 'ch', 'ti', 've', 'ks', 'tum', 'cr', 'gom', 'lrc', - 'azb', 'or' - ] -cjk = ( - r'\u4E00-\u62FF' + # Unified Ideographs - r'\u6300-\u77FF' + - r'\u7800-\u8CFF' + - r'\u8D00-\u9FCC' + - r'\u3400-\u4DFF' + # Unified Ideographs Ext A - r'\U00020000-\U000215FF' + # Unified Ideographs Ext. B - r'\U00021600-\U000230FF' + - r'\U00023100-\U000245FF' + - r'\U00024600-\U000260FF' + - r'\U00026100-\U000275FF' + - r'\U00027600-\U000290FF' + - r'\U00029100-\U0002A6DF' + - r'\uF900-\uFAFF' + # Compatibility Ideographs - r'\U0002F800-\U0002FA1F' # Compatibility Ideographs Suppl. 
-) - -chars = { - 'az': u'A-Za-zÇçƏəĞğıİÖöŞşÜü', - 'ar': u'غظضذخثتشرقصفعسنملكيطحزوهدجبا', - 'et': u'A-Za-zŠšŽžÕõÄäÖöÜü', - 'af': u'A-Za-züûöôïîëêè', - 'en': u'A-Za-z', - 'id': u'A-Za-z', - 'ko': cjk, - 'zh': cjk, - 'ja': cjk, - 'pt': u'A-Za-záàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ', - 'tr': u'A-Za-zÇĞİÖŞÜçğıöşüâîûÂÎÛ', - 'fa': u'ابپتثجچحخدذرزژسشصآضطظعغفقکگلمنوهی‌يك', - 'fr': u'A-Za-zÀàÂâÆæÄäÇçÉéÈèÊêËëÎîÏïÔôŒœÖöÙùÛûÜüŸÿ', - 'de': u'A-Za-zÄäÖöÜüß', - 'es': u'A-Za-zÑñéÉüÜóÓ', - 'uk': u'АаБбВвГ㥴ДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЬ' - u'ьЮюЯя', - 'pl': u'AaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż', - 'he': u'למנסעפצקרשתםןףץאבגדהוזחטיכך', - 'hy': u'ԱաԲբԳգԴդԵեԶզԷէԸըԹթԺժԻիԼլԽխԾծԿկՀհՁձՂղՃճՄմՅյՆնՇշՈոՉչՊպՋջՌռՍսՎվՏտՐր' - u'ՑցՈՒՈւուՒւՓփՔքևևՕօՖֆ', - 'vi': u'AaĂăÂâBbCcDdĐđEeÊêGgHhIiKkLlMmNnOoÔôƠơPpQqRrSsTtUuƯưVvXxYy', - 'ur': u'ابپتٹثجچحخدڈذرڑزژسشصضطظعغفقکگلمنوهھءیےٹڈڑ‌آّْیٰوَُِٗ', - 'uz': 'A-Za-zʻ', - 'sv': u'A-Za-zÅÄÖåäö', - 'hu': u'A-Za-zËëÉéÓóÖöŐőÚúÜüŰűÁá', - 'cs': u'A-Za-zÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž', - 'hi': u'कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहळक्षज्ञ:अपआपाइपिईपीउपुऊपूऋपृॠप' - u'ॄऌपॢॡपॣएपेऐपैओपोऔपौअंपंअःपः', - 'no': u'A-Za-zÆØÅæøåéèêóòâôüáàé', - 'ta': u'௰௱௲௳௴௵௶௷௸௹௺ௗௐொோௌ்ெேைீுூாிரறலளழவஶஷஸஹணதநனபம' - u'யஐஒஓஔகஙசஜஞடஂஃஅஆஇஈஉஊஎஏ', -} - - -def lower(a, lang): - if lang == 'tr': - return a.replace('I', u'ı').replace(u'İ', 'i').lower() - return a.lower() - - -def page_info(dump, lang): - global tokenizer - c = 1 - di_old = [] - di = [] - nombre = '3,' if lang not in ['ja', 'zh'] else '1' - for entry in dump.parse(): - if entry.ns != '0': - continue - if c != entry.id: - if c != 1: - di_old = di[:] - di = [] - if entry.id and int(entry.id[-1]) == 0: - print('new page', entry.id) - di.append(entry) - else: - di.append(entry) - continue - c = entry.id - firstRev = True - history = {} - detector = reverts.Detector(radius=3) - for revision in di_old: - revision.text = re.sub( - r'\[\[(%s)\:' % '|'.join(languages_by_size), - '', - revision.text) - words = set() - if lang in chars: - token_pattern = r'[%s]{%s}' % (chars[lang], nombre) - tokenizer = RegexpTokenizer(token_pattern) - tokens = tokenizer.tokenize(revision.text) - else: - token_pattern = r'\p{alpha}+' - tokens = regex.findall(token_pattern, revision.text) - for w in tokens: - words.add(lower(w, lang)) - if firstRev: - prevIntersection = words - firstRev = False - added = words - prevIntersection - prevIntersection = words - history[revision.revisionid] = Edit( - revision.revisionid, added, False) - rev = detector.process(revision.text, - {'rev_id': revision.revisionid}) - if rev: - for reverted in rev.reverteds: - history[reverted['rev_id']].reverted = True - - yield history - - -def run(dumps): - number = 500000 - counter = 0 - start_time = time.time() - for casee in dumps: - lang = casee.split('/')[-1].split('wiki')[0] - dump = xmlreader.XmlDump(casee, True) - bot = Bot() - for case in page_info(dump, lang): - counter += 1 - if number and counter > number: - break - bot.parse_edits(case.values()) - bot.parse_bad_edits(250) - bot.dump() - print(time.time() - start_time) - site = pywikibot.Site('meta', fam='meta') - page = pywikibot.Page( - site, 'Research:Revision scoring as a service/Word lists/' + lang) - try: - text = page.get() - except pywikibot.NoPage: - text = ("{{Research:Revision scoring as a service/template/word list " - "data\n |lang=%s\n |gen=250\n |badwords=-\n |informal=-" - "\n |stopwords=-\n |dictionary=-\n |stemmer=-\n |contact=" - "\n |features=no\n |labels=requested\n |campaign=no\n " - "|needs=-\n 
|list-generated=\n |list-stop=\n}}\n" % lang) - except: - return False - new_text = text - if re.search(r'\|\s*?list\-generated\s*?\=\s*?', text): - if re.search(r'\|\s*?list\-generated\s*?\=\s*?(\||\}\})', text): - new_text = re.sub( - r'(\|\s*?list\-generated\s*?\=\s*?)(\||\}\})', - r'\1%s\2' % bot.bad_words_res_text, - new_text) - else: - new_text = re.sub( - r'\}\}', - r'|list-generated=%s\n}}' % bot.bad_words_res_text, - new_text) - if re.search(r'\|\s*?list\-stop\s*?\=\s*?', text): - if re.search(r'\|\s*?list\-stop\s*?\=\s*?(\||\}\})', text): - new_text = re.sub( - r'(\|\s*?list\-stop\s*?\=\s*?)(\||\}\})', - r'\1%s\2' % bot.stop_words_res_text, - new_text) - else: - new_text = re.sub( - r'\}\}', - r'|list-stop=%s\n}}' % bot.stop_words_res_text, - new_text) - if new_text != text: - page.text = new_text - page.save('Bot: update results') -if __name__ == "__main__": - dumps = sys.argv[1:] - run(dumps) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..2d90e26 --- /dev/null +++ b/setup.py @@ -0,0 +1,44 @@ +import os + +from setuptools import find_packages, setup + +about_path = os.path.join(os.path.dirname(__file__), "bwds/about.py") +exec(compile(open(about_path).read(), about_path, "exec")) + + +def read(fname): + return open(os.path.join(os.path.dirname(__file__), fname)).read() + + +def requirements(fname): + for line in open(os.path.join(os.path.dirname(__file__), fname)): + yield line.strip() + +setup( + name=__name__, # noqa + version=__version__, # noqa + author=__author__, # noqa + author_email=__author_email__, # noqa + description=__description__, # noqa + url=__url__, # noqa + license=__license__, # noqa + packages=find_packages(), + entry_points={ + 'console_scripts': [ + 'bwds=bwds.bwds:main' + ], + }, + long_description=read('README.md'), + install_requires=requirements('requirements.txt'), + classifiers=[ + "Development Status :: 4 - Beta", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Topic :: Utilities", + "Topic :: Scientific/Engineering" + ], +) diff --git a/utility b/utility new file mode 100755 index 0000000..1038754 --- /dev/null +++ b/utility @@ -0,0 +1,4 @@ +#!/usr/bin/env python +from bwds import bwds + +bwds.main()
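
For readers skimming the patch, the heart of process_api.run() is the smoothed ratio it uses to rank candidate bad words: each gram's count of additions in reverted-for-damage edits gets +1, its overall addition count gets +10, and the quotients are sorted in descending order. Below is a minimal standalone sketch of that ranking step; the gram tuples, their counts, and the word_limit value are invented here purely for illustration and are not data from this patch.

from collections import defaultdict
from itertools import islice

# Invented example counts (not from the patch): how often each gram was added
# in reverted-for-damage edits (bad_grams) versus in all processed edits (grams).
bad_grams = defaultdict(int, {('idiot',): 7, ('poop',): 4, ('the',): 9})
grams = defaultdict(int, {('idiot',): 8, ('poop',): 5, ('the',): 420})
word_limit = 2

# Same scoring as run(): add-one smoothing on the reverted count and add-ten on
# the total count, so a gram needs several reverted additions to rank highly.
gram_badness = (((freq + 1) / (grams[gram] + 10), gram)
                for gram, freq in bad_grams.items())

for badness, gram in islice(sorted(gram_badness, reverse=True), word_limit):
    print(gram, round(badness, 3))
# ('idiot',) scores (7+1)/(8+10) = 0.444 and ('poop',) scores (4+1)/(5+10) = 0.333,
# while the very common ('the',) drops to (9+1)/(420+10) = 0.023 despite nine
# reverted additions, so it would surface as a stopword candidate instead.

Stopword candidates are ranked in run() by the same sort-and-slice pattern, but on the raw frequency from the grams table rather than on this smoothed ratio.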