From 40b09af76b74611095e9779da32b1a510b204303 Mon Sep 17 00:00:00 2001 From: Lyudmila Vaseva <vaseva@mi.fu-berlin.de> Date: Sun, 10 Feb 2019 13:17:11 +0100 Subject: [PATCH] Explore filter actions --- src/explore.ipynb | 178 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 170 insertions(+), 8 deletions(-) diff --git a/src/explore.ipynb b/src/explore.ipynb index 4e1fc5d..20476db 100644 --- a/src/explore.ipynb +++ b/src/explore.ipynb @@ -4,14 +4,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# An explorative study into EN Wikipedia's edit filter system\n", + "# An explorative inquiry into EN Wikipedia's edit filter system\n", "\n", "This notebook serves to explore EN Wikipedia's edit filters" ] }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -75,7 +75,8 @@ "text": [ "201\n", "753\n", - "600\n" + "600\n", + "110\n" ] } ], @@ -87,7 +88,10 @@ "print (len(df.query('af_enabled==0')))\n", "\n", "# Deleted filters\n", - "print (len(df.query('af_deleted==1')))" + "print (len(df.query('af_deleted==1')))\n", + "\n", + "# Active public filters\n", + "print (len(df.query('af_hidden==0 and af_enabled==1')))" ] }, { @@ -130,6 +134,165 @@ "print (len(df.query('af_global==0')))" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "948\n", + "6\n" + ] + } + ], + "source": [ + "# throttled\n", + "print (len(df.query('af_throttled==0')))\n", + "\n", + "print (len(df.query('af_throttled==1')))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "947\n", + " Unnamed: 0 af_id af_hidden af_global af_enabled af_deleted \\\n", + "168 168 497 0 0 0 1 \n", + "173 173 494 0 0 0 1 \n", + "174 174 502 0 0 0 1 \n", + "187 187 495 0 0 0 1 \n", + "190 190 496 0 0 0 1 \n", + "227 227 475 0 0 0 1 \n", + "349 349 461 0 0 0 1 \n", + "\n", + " af_throttled af_group af_timestamp af_actions af_hit_count \\\n", + "168 0 feedback 20130108151106 disallow 3660 \n", + "173 0 feedback 20130108151035 disallow 3325 \n", + "174 0 feedback 20130424011002 disallow 3280 \n", + "187 0 feedback 20130108151045 disallow 2697 \n", + "190 0 feedback 20130108151054 disallow 2658 \n", + "227 0 feedback 20131003210159 NaN 1390 \n", + "349 0 feedback 20130411173111 disallow 283 \n", + "\n", + " af_public_comments manual_tags \\\n", + "168 Feedback: Common Vandalism 5 vandalism, harassment? \n", + "173 Feedback: Common Vandalism 2 vandalism?, harassment? \n", + "174 Feedback: Extremely long words vandalism?, good_faith?, bad_style? \n", + "187 Feedback: Common Vandalism 3 vandalism, harassment? \n", + "190 Feedback: Common Vandalism 4 vandalism, harassment? \n", + "227 Feedback: Vandalism or libel vandalism, harassment \n", + "349 Feedback: Vandalism in all caps vandalism, harassment? \n", + "\n", + " notes \n", + "168 deleted; “Merged back into 460. --mlitn†\n", + "173 deleted; “Merged back into 460. --mlitn†\n", + "174 deleted \n", + "187 deleted; “Merged back into 460. --mlitn†\n", + "190 deleted; “Merged back into 460. --mlitn†\n", + "227 deleted \n", + "349 NaN \n" + ] + } + ], + "source": [ + "# group\n", + "print (len(df.query('af_group==\"default\"')))\n", + "print (df.query('af_group!=\"default\"'))\n", + "\n", + "# --> so available groups are \"default\" and \"feedback\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Helper functions" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "flatten = lambda x: list(itertools.chain.from_iterable(x))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Edit filter actions" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('', 413), ('disallow', 406), ('warn', 122), ('tag', 70), ('throttle', 52), ('blockautopromote', 4)]\n" + ] + } + ], + "source": [ + "actions = df['af_actions'].fillna('')\n", + "actions_list = [x.split(\",\") for x in list(actions)]\n", + "all_actions = flatten(actions_list)\n", + "\n", + "print(collections.Counter(all_actions).most_common())" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('disallow', 51), ('', 19), ('throttle,disallow', 7), ('throttle', 4), ('tag', 3), ('warn,tag', 2), ('throttle,warn', 2), ('warn', 1), ('disallow,tag', 1), ('warn,disallow', 1)]\n" + ] + } + ], + "source": [ + "# What are the actions of active hidden filters\n", + "active_hidden = df.query('af_hidden==1 and af_enabled==1')\n", + "print(collections.Counter(list(active_hidden['af_actions'].fillna(''))).most_common())" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('tag', 25), ('warn,tag', 25), ('disallow', 22), ('', 20), ('warn', 12), ('throttle,tag', 2), ('warn,disallow', 2), ('throttle,warn,tag', 1), ('throttle,disallow', 1)]\n" + ] + } + ], + "source": [ + "# What are the actions of active public filters\n", + "active_public = df.query('af_hidden==0 and af_enabled==1')\n", + "print(collections.Counter(list(active_public['af_actions'].fillna(''))).most_common())" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -139,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -153,7 +316,6 @@ "source": [ "manual_tags = df['manual_tags']\n", "manual_tags_list = [x.split(\", \") for x in list(manual_tags)]\n", - "flatten = lambda x: list(itertools.chain.from_iterable(x))\n", "all_tags = flatten(manual_tags_list)\n", "\n", "print(collections.Counter(all_tags).most_common())" -- GitLab