From a2e1548ee666c4f4b0d625ca788935f32fcc2dc9 Mon Sep 17 00:00:00 2001
From: Lyudmila Vaseva <vaseva@mi.fu-berlin.de>
Date: Sun, 24 Feb 2019 12:53:51 +0100
Subject: [PATCH] Deepen potential harassment investigation

---
 src/explore.ipynb | 159 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 140 insertions(+), 19 deletions(-)

diff --git a/src/explore.ipynb b/src/explore.ipynb
index d1e1e2f..58fd1af 100644
--- a/src/explore.ipynb
+++ b/src/explore.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -20,7 +20,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -715,8 +715,144 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Potential harassment\n",
+    "## Potential harassment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "     af_id  af_hidden                                 af_public_comments  \\\n",
+      "14     189          0                             BLP vandalism or libel   \n",
+      "16     380          1                               Multiple obscenities   \n",
+      "23     686          0    IP adding possibly unreferenced material to BLP   \n",
+      "42     247          1                          Adding emails in articles   \n",
+      "45      11          0                                You/He/She/It sucks   \n",
+      "53     339          0  Claims of homosexuality, bisexuality, or trans...   \n",
+      "72       9          0       Personal attacks by unregistered or new user   \n",
+      "74     466          1                     Userspace & talk page spamming   \n",
+      "93     460          0                               Feedback: Foul words   \n",
+      "136    478          1                                    Talk page abuse   \n",
+      "148     97          0                       Personal attacks by new user   \n",
+      "150    294          1                                   Personal attacks   \n",
+      "154    463          1                   Feedback: Adding email addresses   \n",
+      "156    874          1  Long term abuse username / impersonation creat...   \n",
+      "168    497          0                       Feedback: Common Vandalism 5   \n",
+      "173    494          0                       Feedback: Common Vandalism 2   \n",
+      "187    495          0                       Feedback: Common Vandalism 3   \n",
+      "190    496          0                       Feedback: Common Vandalism 4   \n",
+      "199    233          1                       Targeted user talk vandalism   \n",
+      "202    472          0                    Feedback: Addition of bad words   \n",
+      "209    273          1                               Repeated obscenities   \n",
+      "212    667          1        Wikipedia:Long-term abuse/Best known for IP   \n",
+      "215    179          1                                    Ongoing attacks   \n",
+      "226    921          0                        Suspicious claims of nazism   \n",
+      "227    475          0                       Feedback: Vandalism or libel   \n",
+      "236    474          0            Feedback: Comment with only obscenities   \n",
+      "245    755          1                                       LTA trolling   \n",
+      "247    834          1                    Possible attacks on other users   \n",
+      "250    341          1          Persistent talk page abuse from IP ranges   \n",
+      "255    859          1                                           LTA #859   \n",
+      "258    739          1                                   User talk vandal   \n",
+      "286    445          1              Disruptive user threatening self harm   \n",
+      "309    742          1                            BT UserTalkPage stalker   \n",
+      "349    461          0                    Feedback: Vandalism in all caps   \n",
+      "360    389          1                         Attacks on user talk pages   \n",
+      "445    513          1       Persistent talk page abuse from IP ranges II   \n",
+      "457    769          1                          AfD nomination harassment   \n",
+      "465    288          1           IP reverting/wikistalking by banned user   \n",
+      "485    140          1                          Long-term harassment case   \n",
+      "506    567          1                         Disruptive CSD nominations   \n",
+      "512    660          1                     Possibly JarlaxleArtemis (LTA)   \n",
+      "522    330          1                                 Attacks on editors   \n",
+      "524    293          1                                  J.delanoy attacks   \n",
+      "527    263          1                          Serafin - talk page abuse   \n",
+      "544    480          1                                          Swamilive   \n",
+      "554    945          1                              proxy talk page abuse   \n",
+      "571    792          1                                         Harassment   \n",
+      "594    938          1                                     Targeted abuse   \n",
+      "651    319          1                      Attacks on User:Rodhullandemu   \n",
+      "664    884          1                                            Nsmutte   \n",
+      "668    462          1                      Serial harassment of MuZemike   \n",
+      "671    556          1                            Impersonation usernames   \n",
+      "752    318          1                                     School attacks   \n",
+      "833    537          0                  Transient harassment sites filter   \n",
+      "947    754          1                            Harassment by IP hopper   \n",
+      "\n",
+      "                                           manual_tags  \n",
+      "14                               vandalism, harassment  \n",
+      "16                               vandalism, harassment  \n",
+      "23                   vandalism, harassment, biased_pov  \n",
+      "42                                  misc?, harassment?  \n",
+      "45                               vandalism, harassment  \n",
+      "53                               vandalism, harassment  \n",
+      "72                                          harassment  \n",
+      "74                             vandalism?, harassment?  \n",
+      "93                               vandalism, harassment  \n",
+      "136                            vandalism?, harassment?  \n",
+      "148                                         harassment  \n",
+      "150                                        harassment?  \n",
+      "154                                         harassment  \n",
+      "156                                         harassment  \n",
+      "168                             vandalism, harassment?  \n",
+      "173                            vandalism?, harassment?  \n",
+      "187                             vandalism, harassment?  \n",
+      "190                             vandalism, harassment?  \n",
+      "199                            vandalism?, harassment?  \n",
+      "202                             vandalism, harassment?  \n",
+      "209                            vandalism?, harassment?  \n",
+      "212                            vandalism?, harassment?  \n",
+      "215                            vandalism?, harassment?  \n",
+      "226                            vandalism?, harassment?  \n",
+      "227                              vandalism, harassment  \n",
+      "236                                         harassment  \n",
+      "245                             vandalism, harassment?  \n",
+      "247                            vandalism?, harassment?  \n",
+      "250                            harassment?, vandalism?  \n",
+      "255                            vandalism?, harassment?  \n",
+      "258                            vandalism?, harassment?  \n",
+      "286                            vandalism?, harassment?  \n",
+      "309                                        harassment?  \n",
+      "349                             vandalism, harassment?  \n",
+      "360                              vandalism, harassment  \n",
+      "445                    harassment?, vandalism?, abuse?  \n",
+      "457                                         harassment  \n",
+      "465                                         harassment  \n",
+      "485                        harassment, long_term_abuse  \n",
+      "506                            vandalism?, harassment?  \n",
+      "512  long_term_abuse, vandalism, religious_vandalis...  \n",
+      "522                                         harassment  \n",
+      "524                                        harassment?  \n",
+      "527                                abuse?, harassment?  \n",
+      "544                vandalism, sockpuppetry, harassment  \n",
+      "554                                abuse?, harassment?  \n",
+      "571                                         harassment  \n",
+      "594                                abuse?, harassment?  \n",
+      "651                                         harassment  \n",
+      "664  sockpuppetry, long_term_abuse, personal_attack...  \n",
+      "668                                         harassment  \n",
+      "671                             vandalism, harassment?  \n",
+      "752                                        harassment?  \n",
+      "833                                        harassment?  \n",
+      "947                                         harassment  \n"
+     ]
+    }
+   ],
+   "source": [
+    "df_harassment_tagged = df[df['manual_tags'].fillna('').str.contains('harassment')]\n",
     "\n",
+    "print(df_harassment_tagged[['af_id', 'af_hidden', 'af_public_comments', 'manual_tags']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "Another idea would be to classify filters according to the namespaces they cover. A filter targeting the talk/user name spaces may be indicative of dealing with personal attacks or harassment."
    ]
   },
@@ -807,44 +943,29 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Following filters seem to be potentially targeting harassment:\n",
+    "The following filters seem to be potentially targeting harassment (after manually removing all that obviously have nothing to do with harassment):\n",
     "\n",
     "    af_id                                 af_public_comments\n",
-    "    37     320                               \"Your mom\" Vandalism\n",
     "    67     803  Prevent new users from editing other's user pages\n",
     "    101    602         Arbitration discretionary sanctions alerts\n",
     "    109    733  New user creating a page in someone else's use...\n",
-    "    134    878                     New user removing COI template\n",
     "    160      5  User self-renaming or moving user talk pages i...\n",
-    "    177    850              New user moving page to project space\n",
     "    193    840                                Indexing user pages\n",
     "    239    930         Prevent indexing userspaces by newer users\n",
     "    244    134  Template deletion notifications transcluded in...\n",
     "    274     99                 Edits to an other user's userspace\n",
     "    285    123                New users moving other users' pages\n",
-    "    296    369                   New user editing AFD discussions\n",
     "    302    828                              Redirecting talk page\n",
     "    329    212  New user placing comments without a header on ...\n",
-    "    353    192          Direct use of stub categories in articles\n",
     "    391    928                          Transclusion of userpages\n",
     "    424    168          Non-admins responding to unblock requests\n",
-    "    427    185                               Cross-namespace move\n",
     "    441    848                    Large contributions test filter\n",
     "    448    144                            Hiding content of pages\n",
-    "    499    694              Moves to or from the Module namespace\n",
     "    516    910                         Maureen Wroblewitz spammer\n",
-    "    520    635             OTRS template added by non-OTRS member\n",
     "    578    437   Title blacklist for TITLES with more than 9 caps\n",
     "    619      6           Users editing editnotices of other users\n",
     "    643     15                          Discussion page vandalism\n",
-    "    653    897                                      Weird spambot\n",
-    "    657    616                       disruption of music articles\n",
-    "    669    219                     Arbcom requested filter (FOFF)\n",
     "    730    207              Non-admins reviewing unblock requests\n",
-    "    746    658    Disallow moving articles to the Topic namespace\n",
-    "    754    127                    Helpme tag in content namespace\n",
-    "    784    118  The final selection of TFA's is made by Raul65...\n",
-    "    785    379              username registrations by trademarkia\n",
     "    863     67                    Sockpuppetry at AfD discussions\n",
     "    866    329                                     SPI disruption\n",
     "    921    427                  Possible Emergency Reponse Needed"
-- 
GitLab