From 40b09af76b74611095e9779da32b1a510b204303 Mon Sep 17 00:00:00 2001
From: Lyudmila Vaseva <vaseva@mi.fu-berlin.de>
Date: Sun, 10 Feb 2019 13:17:11 +0100
Subject: [PATCH] Explore filter actions

---
 src/explore.ipynb | 178 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 170 insertions(+), 8 deletions(-)

diff --git a/src/explore.ipynb b/src/explore.ipynb
index 4e1fc5d..20476db 100644
--- a/src/explore.ipynb
+++ b/src/explore.ipynb
@@ -4,14 +4,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# An explorative study into EN Wikipedia's edit filter system\n",
+    "# An explorative inquiry into EN Wikipedia's edit filter system\n",
     "\n",
     "This notebook serves to explore EN Wikipedia's edit filters"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 66,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -29,7 +29,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -66,7 +66,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 102,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
@@ -75,7 +75,8 @@
      "text": [
       "201\n",
       "753\n",
-      "600\n"
+      "600\n",
+      "110\n"
      ]
     }
    ],
@@ -87,7 +88,10 @@
     "print (len(df.query('af_enabled==0')))\n",
     "\n",
     "# Deleted filters\n",
-    "print (len(df.query('af_deleted==1')))"
+    "print (len(df.query('af_deleted==1')))\n",
+    "\n",
+    "# Active public filters\n",
+    "print (len(df.query('af_hidden==0 and af_enabled==1')))"
    ]
   },
   {
@@ -130,6 +134,165 @@
     "print (len(df.query('af_global==0')))"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "948\n",
+      "6\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Number of filters with af_throttled==0, then with af_throttled==1\n",
+    "print (len(df.query('af_throttled==0')))\n",
+    "\n",
+    "print (len(df.query('af_throttled==1')))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "947\n",
+      "     Unnamed: 0  af_id  af_hidden  af_global  af_enabled  af_deleted  \\\n",
+      "168         168    497          0          0           0           1   \n",
+      "173         173    494          0          0           0           1   \n",
+      "174         174    502          0          0           0           1   \n",
+      "187         187    495          0          0           0           1   \n",
+      "190         190    496          0          0           0           1   \n",
+      "227         227    475          0          0           0           1   \n",
+      "349         349    461          0          0           0           1   \n",
+      "\n",
+      "     af_throttled  af_group    af_timestamp af_actions  af_hit_count  \\\n",
+      "168             0  feedback  20130108151106   disallow          3660   \n",
+      "173             0  feedback  20130108151035   disallow          3325   \n",
+      "174             0  feedback  20130424011002   disallow          3280   \n",
+      "187             0  feedback  20130108151045   disallow          2697   \n",
+      "190             0  feedback  20130108151054   disallow          2658   \n",
+      "227             0  feedback  20131003210159        NaN          1390   \n",
+      "349             0  feedback  20130411173111   disallow           283   \n",
+      "\n",
+      "                  af_public_comments                          manual_tags  \\\n",
+      "168     Feedback: Common Vandalism 5               vandalism, harassment?   \n",
+      "173     Feedback: Common Vandalism 2              vandalism?, harassment?   \n",
+      "174   Feedback: Extremely long words  vandalism?, good_faith?, bad_style?   \n",
+      "187     Feedback: Common Vandalism 3               vandalism, harassment?   \n",
+      "190     Feedback: Common Vandalism 4               vandalism, harassment?   \n",
+      "227     Feedback: Vandalism or libel                vandalism, harassment   \n",
+      "349  Feedback: Vandalism in all caps               vandalism, harassment?   \n",
+      "\n",
+      "                                        notes  \n",
+      "168  deleted; “Merged back into 460. --mlitn”  \n",
+      "173  deleted; “Merged back into 460. --mlitn”  \n",
+      "174                                   deleted  \n",
+      "187  deleted; “Merged back into 460. --mlitn”  \n",
+      "190  deleted; “Merged back into 460. --mlitn”  \n",
+      "227                                   deleted  \n",
+      "349                                       NaN  \n"
+     ]
+    }
+   ],
+   "source": [
+    "# af_group: number of filters in the \"default\" group, then the rows belonging to any other group\n",
+    "print (len(df.query('af_group==\"default\"')))\n",
+    "print (df.query('af_group!=\"default\"'))\n",
+    "\n",
+    "# --> so available groups are \"default\" and \"feedback\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Helper functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "flatten = lambda x: list(itertools.chain.from_iterable(x))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Edit filter actions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('', 413), ('disallow', 406), ('warn', 122), ('tag', 70), ('throttle', 52), ('blockautopromote', 4)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "actions = df['af_actions'].fillna('')\n",
+    "actions_list = [x.split(\",\") for x in list(actions)]\n",
+    "all_actions = flatten(actions_list)\n",
+    "\n",
+    "print(collections.Counter(all_actions).most_common())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('disallow', 51), ('', 19), ('throttle,disallow', 7), ('throttle', 4), ('tag', 3), ('warn,tag', 2), ('throttle,warn', 2), ('warn', 1), ('disallow,tag', 1), ('warn,disallow', 1)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# What are the actions of active hidden filters?\n",
+    "active_hidden = df.query('af_hidden==1 and af_enabled==1')\n",
+    "print(collections.Counter(list(active_hidden['af_actions'].fillna(''))).most_common())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('tag', 25), ('warn,tag', 25), ('disallow', 22), ('', 20), ('warn', 12), ('throttle,tag', 2), ('warn,disallow', 2), ('throttle,warn,tag', 1), ('throttle,disallow', 1)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# What are the actions of active public filters?\n",
+    "active_public = df.query('af_hidden==0 and af_enabled==1')\n",
+    "print(collections.Counter(list(active_public['af_actions'].fillna(''))).most_common())"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -139,7 +302,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 68,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
@@ -153,7 +316,6 @@
    "source": [
     "manual_tags = df['manual_tags']\n",
     "manual_tags_list = [x.split(\", \") for x in list(manual_tags)]\n",
-    "flatten = lambda x: list(itertools.chain.from_iterable(x))\n",
     "all_tags = flatten(manual_tags_list)\n",
     "\n",
     "print(collections.Counter(all_tags).most_common())"
-- 
GitLab