Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# datasets and models
datasets/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
ipython/.ipynb_checkpoints/

# Temporary text editor files
*~

# C extensions
*.so

# Distribution / packaging
.Python
env/
bin/
build/
develop-eggs/
dist/
eggs/
#lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Sphinx documentation
doc/_build/
doc/.buildfile
*.toctree
Empty file added README.md
Empty file.
5 changes: 5 additions & 0 deletions bwds/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Re-export the package metadata defined in ``about.py`` at package level.
from .about import (__name__, __version__, __author__, __author_email__,
                    __description__, __license__, __url__)

# ``__all__`` must hold attribute *names* (strings naming module attributes).
# Listing the metadata values themselves ("bwds", "0.0.1", ...) makes
# ``from bwds import *`` fail with AttributeError, because the package has no
# attribute called e.g. "0.0.1".
__all__ = ["__name__", "__version__", "__author__", "__author_email__",
           "__description__", "__license__", "__url__"]
8 changes: 8 additions & 0 deletions bwds/about.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Package metadata; re-exported by the package ``__init__``.
__name__ = "bwds"
__version__ = "0.0.1"
__author__ = "Amir Sarabadani"
__author_email__ = "ladsgroup@gmail.com"
__description__ = (
    "A library for performing automatic detection of the "
    "badwords added to Wikipedia articles"
)
__url__ = "https://github.com/wiki-ai/bwds"
__license__ = "MIT"
135 changes: 135 additions & 0 deletions bwds/bot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Copyright © 2014 He7d3r
# License: http://he7d3r.mit-license.org/
"""
Extremely under construction.
Some parts are copied from
https://gist.github.com/he7d3r/f99482f4f54f97895ccb/9205f3271fe8daa2f694f4ce3ba9b29213dbad6c
"""

import re
import sys
import time

import regex
from mw.lib import reverts
from nltk.tokenize import RegexpTokenizer

import pywikibot
from bad_words_detection_system import Bot, Edit
from pywikibot import xmlreader


def _build_history(revisions, lang):
    """Build the edit history of one page from its revisions.

    :param revisions: iterable of dump entries belonging to a single page,
        in dump order; each entry's ``text`` is rewritten in place with the
        interwiki link prefixes removed.
    :param lang: language code used to pick the tokenizer and passed to
        ``lower``.
    :returns: dict mapping revision id to an ``Edit`` holding the words the
        revision introduced compared with the previous revision.
    """
    # NOTE(review): `languages_by_size`, `chars` and `lower` are neither
    # defined nor imported in this module -- confirm where they should come
    # from; as written they raise NameError on first use.
    interwiki_prefix = re.compile(
        r'\[\[(%s)\:' % '|'.join(languages_by_size))

    # Build the tokenizer once per page instead of once per revision.
    if lang in chars:
        # '{3,}' keeps runs of at least three characters; 'ja' and 'zh' use
        # '{1}', i.e. single-character tokens.
        nombre = '3,' if lang not in ['ja', 'zh'] else '1'
        tokenize = RegexpTokenizer(
            r'[%s]{%s}' % (chars[lang], nombre)).tokenize
    else:
        tokenize = regex.compile(r'\p{alpha}+').findall

    history = {}
    detector = reverts.Detector(radius=3)
    previous_words = None
    for revision in revisions:
        revision.text = interwiki_prefix.sub('', revision.text)
        words = {lower(token, lang) for token in tokenize(revision.text)}
        # The first revision of a page has nothing to be compared with, so
        # it is recorded as adding no words.
        if previous_words is None:
            added = set()
        else:
            added = words - previous_words
        previous_words = words
        # Third argument presumably is the initial `reverted` flag (it is
        # the attribute flipped below) -- TODO confirm against `Edit`.
        history[revision.revisionid] = Edit(
            revision.revisionid, added, False)
        revert = detector.process(revision.text,
                                  {'rev_id': revision.revisionid})
        if revert:
            for reverted in revert.reverteds:
                history[reverted['rev_id']].reverted = True
    return history


def page_info(dump, lang):
    """Yield one edit history per main-namespace page of an XML dump.

    Entries whose ``ns`` is not ``'0'`` are skipped.  Consecutive entries
    sharing the same ``id`` are treated as the revisions of one page.

    Every page is yielded, including the last one in the dump; no empty
    history is emitted before the first page.

    :param dump: object whose ``parse()`` iterates over revision entries
        exposing ``ns``, ``id``, ``revisionid`` and ``text``.
    :param lang: language code of the wiki the dump belongs to.
    :returns: generator of dicts mapping revision id to ``Edit``.
    """
    from itertools import groupby

    articles = (entry for entry in dump.parse() if entry.ns == '0')
    for page_id, revisions in groupby(articles, key=lambda entry: entry.id):
        # Progress output for page ids ending in 0.
        if page_id and int(page_id[-1]) == 0:
            print('new page', page_id)
        # `revisions` is consumed entirely inside the helper, before groupby
        # advances to the next page.
        yield _build_history(revisions, lang)


def _fill_template_param(text, name, value):
    """Return ``text`` with template parameter ``name`` filled with ``value``.

    If ``|name=`` is present and empty (directly followed by ``|`` or ``}}``)
    the value is inserted; an existing non-empty value is left untouched.
    If the parameter is absent, ``|name=value`` is inserted before every
    ``}}`` in ``text``.

    The replacement is produced by a function rather than a template string,
    so backslashes in ``value`` are inserted literally instead of being
    interpreted by ``re.sub`` as escapes or group references.
    """
    param = re.escape(name)
    if re.search(r'\|\s*?%s\s*?\=\s*?' % param, text):
        return re.sub(
            r'(\|\s*?%s\s*?\=\s*?)(\||\}\})' % param,
            lambda match: '%s%s%s' % (match.group(1), value, match.group(2)),
            text)
    return re.sub(
        r'\}\}',
        lambda match: '|%s=%s\n}}' % (name, value),
        text)


def run(dumps):
    """Process XML dumps and publish the resulting word lists on Meta.

    For each dump path the language code is taken from the file name (the
    part before ``wiki``), the page histories are fed to a ``Bot``, and the
    bot's ``bad_words_res_text`` / ``stop_words_res_text`` are written into
    the ``list-generated`` / ``list-stop`` parameters of the language's
    "Word lists" page.

    :param dumps: iterable of dump file paths.
    :returns: ``False`` if the wiki page could not be fetched for a reason
        other than it not existing; ``None`` otherwise.
    """
    # NOTE(review): the page counter is shared by all dumps, so the 500000
    # cap applies to the whole run rather than to each dump -- confirm that
    # this is intended.
    number = 500000
    counter = 0
    start_time = time.time()
    # NOTE(review): the publish step is assumed to run once per dump (it
    # depends on the per-dump `lang` and `bot`) -- confirm against the
    # original layout.
    for dump_path in dumps:
        lang = dump_path.split('/')[-1].split('wiki')[0]
        dump = xmlreader.XmlDump(dump_path, True)
        bot = Bot()
        for history in page_info(dump, lang):
            counter += 1
            if number and counter > number:
                break
            bot.parse_edits(history.values())
        bot.parse_bad_edits(250)
        bot.dump()
        print(time.time() - start_time)

        site = pywikibot.Site('meta', fam='meta')
        page = pywikibot.Page(
            site, 'Research:Revision scoring as a service/Word lists/' + lang)
        try:
            text = page.get()
        except pywikibot.NoPage:
            # Page does not exist yet: start from an empty template.
            text = ("{{Research:Revision scoring as a service/template/word list "
                    "data\n  |lang=%s\n  |gen=250\n  |badwords=-\n  |informal=-"
                    "\n  |stopwords=-\n  |dictionary=-\n  |stemmer=-\n  |contact="
                    "\n  |features=no\n  |labels=requested\n  |campaign=no\n  "
                    "|needs=-\n  |list-generated=\n  |list-stop=\n}}\n" % lang)
        except Exception as error:
            # Keep the best-effort "give up and return False" contract, but
            # report the cause and let KeyboardInterrupt/SystemExit through.
            sys.stderr.write(
                'Could not fetch the word list page for %s: %r\n'
                % (lang, error))
            return False

        new_text = _fill_template_param(
            text, 'list-generated', bot.bad_words_res_text)
        new_text = _fill_template_param(
            new_text, 'list-stop', bot.stop_words_res_text)
        if new_text != text:
            page.text = new_text
            page.save('Bot: update results')
# Script entry point: every command-line argument is the path of a dump.
if __name__ == "__main__":
    run(sys.argv[1:])
47 changes: 47 additions & 0 deletions bwds/bwds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
This script provides access to a set of utilities for extracting features and
building edit quality predictors.

* process_api -- Processes a sample of revisions using the API
* process_dump -- Processes an XML dump

Usage:
bwds (-h | --help)
bwds <utility> [-h | --help]

Options:
-h | --help Prints this documentation
<utility> The name of the utility to run
"""
import sys
import traceback
from importlib import import_module


# Short usage summary that main() writes to stderr when no utility is given
# or when the first argument is an unrecognised option.
USAGE = """Usage:
bwds (-h | --help)
bwds <utility> [-h | --help]\n"""


def main():
    """Run the utility named by the first command-line argument.

    With no argument, or an unknown option, the usage summary is written to
    stderr; with ``-h``/``--help`` the module documentation is written to
    stderr.  Both cases exit with status 1.  Otherwise the module
    ``bwds.utilities.<utility>`` is imported and its ``main`` is called with
    the remaining arguments; if the import fails, the traceback and an error
    message are written to stderr and the process exits with status 1.
    """
    arguments = sys.argv[1:]

    if not arguments:
        sys.stderr.write(USAGE)
        sys.exit(1)

    utility = arguments[0]
    if utility in ("-h", "--help"):
        sys.stderr.write(__doc__ + "\n")
        sys.exit(1)
    if utility.startswith("-"):
        sys.stderr.write(USAGE)
        sys.exit(1)

    try:
        loaded = import_module(".utilities." + utility, package="bwds")
    except ImportError:
        sys.stderr.write(traceback.format_exc())
        sys.stderr.write("Could not load utility {0}.\n".format(utility))
        sys.exit(1)

    loaded.main(arguments[1:])
Loading