wiki-ai · halfak · Mar 18, 2017 · Mar 18, 2017 · Mar 19, 2017
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,47 @@
+# datasets and models
+datasets/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+ipython/.ipynb_checkpoints/
+
+# Temporary text editor files
+*~
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+bin/
+build/
+develop-eggs/
+dist/
+eggs/
+#lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Sphinx documentation
+doc/_build/
+doc/.buildfile
+*.toctree
diff --git a/README.md b/README.md
diff --git a/bwds/__init__.py b/bwds/__init__.py
@@ -0,0 +1,5 @@
+# Package metadata, re-exported from ``bwds/about.py``.
+from .about import (__name__, __version__, __author__, __author_email__,
+                    __description__, __license__, __url__)
+
+# ``__all__`` has to contain attribute *names*.  Listing the variables
+# themselves would put their values ("0.0.1", the author name, ...) in the
+# list and make ``from bwds import *`` fail with AttributeError.
+__all__ = ["__name__", "__version__", "__author__", "__author_email__",
+           "__description__", "__license__", "__url__"]
diff --git a/bwds/about.py b/bwds/about.py
@@ -0,0 +1,8 @@
+# Package metadata; ``bwds/__init__.py`` imports every name defined here.
+__name__ = "bwds"
+__version__ = "0.0.1"
+__author__ = "Amir Sarabadani"
+__author_email__ = "ladsgroup@gmail.com"
+__description__ = ("A library for performing automatic detection of the "
+                   "badwords added to Wikipedia articles")
+__url__ = "https://github.com/wiki-ai/bwds"
+__license__ = "MIT"
diff --git a/bwds/bot.py b/bwds/bot.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+# Copyright © 2014 He7d3r
+# License: http://he7d3r.mit-license.org/
+"""
+Extremely under construction.
+Some parts are copied from
+https://gist.github.com/he7d3r/f99482f4f54f97895ccb/9205f3271fe8daa2f694f4ce3ba9b29213dbad6c
+"""
+
+import re
+import sys
+import time
+
+import regex
+from mw.lib import reverts
+from nltk.tokenize import RegexpTokenizer
+
+import pywikibot
+from bad_words_detection_system import Bot, Edit
+from pywikibot import xmlreader
+
+
+def _page_history(revisions, lang, interwiki_pattern, tokenize):
+    """Build the revision history of a single page.
+
+    :param revisions: dump entries of one page, in dump order.  Each
+        entry's ``text`` is rewritten in place with the matches of
+        ``interwiki_pattern`` removed.
+    :param lang: language code handed to ``lower`` for every token.
+    :param interwiki_pattern: compiled pattern stripped from the text
+        before it is tokenised.
+    :param tokenize: callable returning the list of tokens of a text.
+    :return: dict mapping each revision id to the ``Edit`` built for it.
+    """
+    history = {}
+    detector = reverts.Detector(radius=3)
+    previous_words = None
+    for revision in revisions:
+        revision.text = interwiki_pattern.sub('', revision.text)
+        words = {lower(token, lang) for token in tokenize(revision.text)}
+        # The first revision is compared with itself, so it adds no words.
+        if previous_words is None:
+            previous_words = words
+        added = words - previous_words
+        previous_words = words
+        history[revision.revisionid] = Edit(
+            revision.revisionid, added, False)
+        rev = detector.process(revision.text,
+                               {'rev_id': revision.revisionid})
+        if rev:
+            # Flag every earlier edit of this page that was reverted.
+            for reverted in rev.reverteds:
+                history[reverted['rev_id']].reverted = True
+    return history
+
+
+def page_info(dump, lang):
+    """Yield the revision history of every namespace-0 page in ``dump``.
+
+    Entries returned by ``dump.parse()`` whose ``ns`` is not ``'0'`` are
+    skipped.  Consecutive remaining entries that share the same ``id`` are
+    treated as the revisions of one page, so the entries of a page are
+    assumed to be contiguous in the dump.
+
+    Every page yields exactly one dict, including the last page of the
+    dump; no empty placeholder dict is produced before the first page.
+
+    :param dump: object whose ``parse()`` iterates over revision entries
+        with ``ns``, ``id``, ``revisionid`` and ``text`` attributes.
+    :param lang: language code; selects the tokenizer and is passed to
+        ``lower``.
+    :return: generator of dicts mapping revision id to ``Edit``.
+    """
+    # The tokenizer is still published as a module global, as before.
+    global tokenizer
+    # Minimum token length quantifier: single characters for ja/zh,
+    # three or more characters for every other language.
+    nombre = '1' if lang in ['ja', 'zh'] else '3,'
+    # NOTE(review): ``languages_by_size``, ``chars`` and ``lower`` are not
+    # defined or imported in the part of this module shown here -- confirm
+    # where they are expected to come from.
+    interwiki_pattern = re.compile(
+        r'\[\[(%s)\:' % '|'.join(languages_by_size))
+    # Build the tokenizer once instead of once per revision.
+    if lang in chars:
+        tokenizer = RegexpTokenizer(r'[%s]{%s}' % (chars[lang], nombre))
+        tokenize = tokenizer.tokenize
+    else:
+        tokenize = regex.compile(r'\p{alpha}+').findall
+
+    page_id = None
+    revisions = []
+    for entry in dump.parse():
+        if entry.ns != '0':
+            continue
+        if revisions and entry.id != page_id:
+            # A new page starts: emit the page collected so far.
+            yield _page_history(revisions, lang, interwiki_pattern, tokenize)
+            revisions = []
+        if not revisions:
+            page_id = entry.id
+            # Progress output for page ids ending in 0.
+            if entry.id and int(entry.id[-1]) == 0:
+                print('new page', entry.id)
+        revisions.append(entry)
+    # Flush the final page; it has no successor to trigger the yield above.
+    if revisions:
+        yield _page_history(revisions, lang, interwiki_pattern, tokenize)
+
+
+def _fill_template_param(original, current, name, value):
+    """Return ``current`` with template parameter ``name`` set to ``value``.
+
+    ``original`` is the page text as fetched and decides which case applies:
+
+    * the parameter is present and empty (directly followed by ``|`` or
+      ``}}``): ``value`` is inserted in front of that delimiter;
+    * the parameter is present with a value: ``current`` is returned
+      unchanged;
+    * the parameter is absent: ``|name=value`` and a newline are inserted
+      in front of every ``}}``.
+
+    ``value`` is inserted literally.  A ``re.sub`` template built from it
+    would misread backslashes, and a value starting with a digit would
+    extend the ``\\1`` group reference.
+    """
+    param = r'\|\s*?%s\s*?\=\s*?' % re.escape(name)
+    if not re.search(param, original):
+        # NOTE(review): this touches every ``}}`` in the page, not only the
+        # one closing the word list template -- confirm that is intended.
+        return current.replace('}}', '|%s=%s\n}}' % (name, value))
+    if not re.search(param + r'(\||\}\})', original):
+        return current
+
+    def insert_value(match):
+        return '%s%s%s' % (match.group(1), value, match.group(2))
+
+    return re.sub(r'(%s)(\||\}\})' % param, insert_value, current)
+
+
+def run(dumps):
+    """Process XML dumps and publish the resulting word lists on Meta.
+
+    All dumps are fed into a single ``Bot`` so that every dump contributes
+    to the published lists.  Processing stops for good once ``number``
+    pages have been parsed.
+
+    :param dumps: paths of XML dump files.  The language code is taken
+        from the file name, i.e. the part before ``wiki``.
+    :return: ``False`` when no dump was given or the wiki page could not
+        be read, otherwise ``None``.
+    """
+    number = 500000
+    counter = 0
+    start_time = time.time()
+    bot = Bot()
+    lang = None
+    limit_reached = False
+    for casee in dumps:
+        lang = casee.split('/')[-1].split('wiki')[0]
+        dump = xmlreader.XmlDump(casee, True)
+        for case in page_info(dump, lang):
+            counter += 1
+            if number and counter > number:
+                limit_reached = True
+                break
+            bot.parse_edits(case.values())
+        if limit_reached:
+            break
+    if lang is None:
+        # Without a dump there is neither data nor a target page.
+        sys.stderr.write('No dump files given.\n')
+        return False
+    bot.parse_bad_edits(250)
+    bot.dump()
+    print(time.time() - start_time)
+    site = pywikibot.Site('meta', fam='meta')
+    title = 'Research:Revision scoring as a service/Word lists/' + lang
+    page = pywikibot.Page(site, title)
+    try:
+        text = page.get()
+    except pywikibot.NoPage:
+        # Start from an empty word list template for a missing page.
+        text = ("{{Research:Revision scoring as a service/template/word list "
+                "data\n  |lang=%s\n  |gen=250\n  |badwords=-\n  |informal=-"
+                "\n  |stopwords=-\n  |dictionary=-\n  |stemmer=-\n  |contact="
+                "\n  |features=no\n  |labels=requested\n  |campaign=no\n  "
+                "|needs=-\n  |list-generated=\n  |list-stop=\n}}\n" % lang)
+    except Exception as error:
+        # Still best effort, but Ctrl-C and SystemExit are no longer caught.
+        sys.stderr.write('Could not read %s: %s\n' % (title, error))
+        return False
+    new_text = _fill_template_param(
+        text, text, 'list-generated', bot.bad_words_res_text)
+    new_text = _fill_template_param(
+        text, new_text, 'list-stop', bot.stop_words_res_text)
+    if new_text != text:
+        page.text = new_text
+        page.save('Bot: update results')
+
+
+if __name__ == "__main__":
+    run(sys.argv[1:])
diff --git a/bwds/bwds.py b/bwds/bwds.py
@@ -0,0 +1,47 @@
+"""
+This script provides access to a set of utilities for extracting features and
+building edit quality predictors.
+
+* process_api -- Processes a sample of revisions using the API
+* process_dump -- Processes an XML dump
+
+Usage:
+    bwds (-h | --help)
+    bwds <utility> [-h | --help]
+
+Options:
+    -h | --help  Prints this documentation
+    <utility>    The name of the utility to run
+"""
+import sys
+import traceback
+from importlib import import_module
+
+
+# Short usage text written to stderr when the command line is invalid.
+USAGE = ("Usage:\n"
+         "    bwds (-h | --help)\n"
+         "    bwds <utility> [-h | --help]\n")
+
+
+def main():
+    """Run the utility named by the first command-line argument.
+
+    The utility is imported as ``bwds.utilities.<name>`` and its ``main``
+    is called with the remaining arguments.  The process exits with
+    status 1 after writing to stderr when:
+
+    * no argument is given, or the first argument is an unknown option
+      (``USAGE`` is written);
+    * the first argument is ``-h`` or ``--help`` (the module docstring is
+      written);
+    * the utility module cannot be imported (the traceback and an error
+      message are written).
+    """
+    arguments = sys.argv[1:]
+    if not arguments:
+        sys.stderr.write(USAGE)
+        sys.exit(1)
+
+    module_name = arguments[0]
+    if module_name in ("-h", "--help"):
+        sys.stderr.write(__doc__ + "\n")
+        sys.exit(1)
+    if module_name.startswith("-"):
+        sys.stderr.write(USAGE)
+        sys.exit(1)
+
+    try:
+        module = import_module(".utilities." + module_name, package="bwds")
+    except ImportError:
+        sys.stderr.write(traceback.format_exc())
+        sys.stderr.write("Could not load utility {0}.\n".format(module_name))
+        sys.exit(1)
+
+    module.main(arguments[1:])