From 1e61d1e6faebefc6779e163f530cdd5dd082411c Mon Sep 17 00:00:00 2001
From: Lyudmila Vaseva <vaseva@mi.fu-berlin.de>
Date: Wed, 6 Mar 2019 20:04:40 +0100
Subject: [PATCH] Explore namespaces, editors actions and tags per year

---
 src/explore.ipynb | 1708 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 1589 insertions(+), 119 deletions(-)

diff --git a/src/explore.ipynb b/src/explore.ipynb
index 091282a..e85452a 100644
--- a/src/explore.ipynb
+++ b/src/explore.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -20,7 +20,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -636,7 +636,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1809,126 +1809,1596 @@
     "df_actions.fillna('log')"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Explore Manual Tags"
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": 39,
-   "metadata": {},
+   "execution_count": 2,
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[('vandalism', 263), ('vandalism?', 162), ('unknown', 71), ('good_faith?', 63), ('misc', 59), ('sockpuppetry', 59), ('good_faith', 48), ('test', 43), ('spam?', 41), ('long_term_abuse', 35), ('sockpuppetry?', 35), ('harassment?', 31), ('harassment', 24), ('abuse?', 21), ('biased_pov', 17), ('spam', 17), ('biased_pov?', 15), ('unclear', 14), ('bad_style', 13), ('bad_style?', 12), ('bug?', 10), ('wiki_policy?', 9), ('long_term_abuse?', 9), ('misc?', 8), ('seo', 8), ('politically_motivated?', 8), ('maintenance', 7), ('trolling?', 7), ('maintenance?', 6), ('personal_attacks', 6), ('bug', 5), ('vandalbot', 5), ('page_move_vandalism', 5), ('silly_vandalism', 5), ('lazyness', 4), ('seo?', 4), ('test?', 4), ('hoaxing?', 4), ('personal_attacks?', 4), ('edit_warring?', 3), ('copyright', 3), ('image_vandalism', 3), ('talk_page_vandalism', 3), ('page_move_vandalism?', 3), ('conflict_of_interest', 3), ('stockbrocker_vandalism', 3), ('copyright?', 2), ('vandalbot?', 2), ('religious_vandalism?', 2), ('politically_motivated', 2), ('self_promotion?', 2), ('template_spam', 2), ('hoaxing', 2), ('silly_vandalism?', 2), ('doxxing?', 2), ('not_polite', 1), ('template_vandalism', 1), ('religious_vandalism', 1), ('self_promotion', 1), ('abuse', 1), ('template_vandalism?', 1), ('link_vandalism?', 1), ('abuse_of_tags_vandalism?', 1), ('avoidant_vandalism', 1), ('guideline_vio?', 1), ('username_vandalism?', 1), ('phishing?', 1), ('avoidant_vandalism?', 1), ('malware?', 1), ('malware', 1), ('conflict_of_interest?', 1), ('impersonation', 1), ('prank', 1)]\n"
-     ]
-    }
-   ],
-   "source": [
-    "manual_tags = df['manual_tags']\n",
-    "manual_tags_list = [x.split(\", \") for x in list(manual_tags)]\n",
-    "all_tags = flatten(manual_tags_list)\n",
-    "\n",
-    "print(collections.Counter(all_tags).most_common())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "('vandalism', 263),\n",
-    "('vandalism?', 162),\n",
-    "  ('spam?', 41),\n",
-    "  ('spam', 17),\n",
-    "  ('vandalbot', 5),\n",
-    "  ('vandalbot?', 2),\n",
-    "  ('page_move_vandalism', 5),\n",
-    "  ('page_move_vandalism?', 3),\n",
-    "  ('silly_vandalism', 5),\n",
-    "  ('silly_vandalism?', 2),\n",
-    "  ('trolling?', 7),\n",
-    "  ('hoaxing?', 4),\n",
-    "  ('hoaxing', 2),\n",
-    "  ('copyright', 3),\n",
-    "  ('copyright?', 2),\n",
-    "  ('image_vandalism', 3),\n",
-    "  ('talk_page_vandalism', 3),\n",
-    "  ('template_vandalism?', 1),\n",
-    "  ('template_vandalism', 1),\n",
-    "  ('template_spam', 2),\n",
-    "  ('link_vandalism?', 1),\n",
-    "  ('abuse_of_tags_vandalism?', 1),\n",
-    "  ('avoidant_vandalism', 1),\n",
-    "  ('avoidant_vandalism?', 1),\n",
-    "  ('username_vandalism?', 1),\n",
-    "\n",
-    "('prank', 1)\n",
-    "\n",
-    "('phishing?', 1),\n",
-    "('malware?', 1),\n",
-    "('malware', 1),\n",
-    "\n",
-    "('guideline_vio?', 1),\n",
-    "\n",
-    "('religious_vandalism?', 3),\n",
-    "('politically_motivated?', 8),\n",
-    "('politically_motivated', 2),\n",
-    "\n",
-    "('sockpuppetry', 59),\n",
-    "('sockpuppetry?', 35),\n",
-    "('long_term_abuse', 35),\n",
-    "('long_term_abuse?', 9),\n",
-    "('abuse', 1),\n",
-    "('abuse?', 21),\n",
-    "('harassment?', 31),\n",
-    "('harassment', 24),\n",
-    "('doxxing?', 2),\n",
-    "('personal_attacks', 6),\n",
-    "('personal_attacks?', 4),\n",
-    "('impersonation', 1),\n",
-    "('not_polite', 1),\n",
-    "\n",
-    "('biased_pov', 17),\n",
-    "('biased_pov?', 15),\n",
-    "\n",
-    "('conflict_of_interest', 3),\n",
-    "('stockbrocker_vandalism', 3),\n",
-    "('self_promotion?', 2),\n",
-    "('conflict_of_interest?', 1),\n",
-    "('self_promotion', 1),\n",
-    "\n",
-    "('seo', 8),\n",
-    "('seo?', 4),\n",
-    "\n",
-    "('bad_style', 13),\n",
-    "('bad_style?', 12),\n",
-    "('edit_warring?', 3),\n",
-    "\n",
-    "('good_faith?', 63),\n",
-    "('good_faith', 48),\n",
-    "\n",
-    "('lazyness', 4),\n",
-    "\n",
-    "('maintenance', 7),\n",
-    "('maintenance?', 5),\n",
-    "('maintenance? ', 1),\n",
-    "\n",
-    "('bug', 5),\n",
-    "('bug?', 10),\n",
-    "('wiki_policy?', 9),\n",
-    "\n",
-    "('test', 43),\n",
-    "('test?', 4),\n",
-    "\n",
-    "('unknown', 71),\n",
-    "('misc', 59),\n",
-    "('misc?', 8),\n",
-    "('unclear', 14),"
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>LogMonth</th>\n",
+       "      <th>EditorActions</th>\n",
+       "      <th>Freq</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>autocreateaccount</td>\n",
+       "      <td>47</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>createaccount</td>\n",
+       "      <td>4780</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>delete</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>edit</td>\n",
+       "      <td>37950</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>move</td>\n",
+       "      <td>84</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>upload</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>autocreateaccount</td>\n",
+       "      <td>454</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>createaccount</td>\n",
+       "      <td>25204</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>delete</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>edit</td>\n",
+       "      <td>210488</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>move</td>\n",
+       "      <td>445</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>upload</td>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>201901</td>\n",
+       "      <td>autocreateaccount</td>\n",
+       "      <td>281</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>201901</td>\n",
+       "      <td>createaccount</td>\n",
+       "      <td>27924</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>201901</td>\n",
+       "      <td>delete</td>\n",
+       "      <td>21</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>201901</td>\n",
+       "      <td>edit</td>\n",
+       "      <td>223870</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>201901</td>\n",
+       "      <td>move</td>\n",
+       "      <td>568</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>201901</td>\n",
+       "      <td>upload</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>201812</td>\n",
+       "      <td>autocreateaccount</td>\n",
+       "      <td>102</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>201812</td>\n",
+       "      <td>createaccount</td>\n",
+       "      <td>35405</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>201812</td>\n",
+       "      <td>delete</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>201812</td>\n",
+       "      <td>edit</td>\n",
+       "      <td>189795</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>201812</td>\n",
+       "      <td>move</td>\n",
+       "      <td>959</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>201812</td>\n",
+       "      <td>upload</td>\n",
+       "      <td>21</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>201811</td>\n",
+       "      <td>autocreateaccount</td>\n",
+       "      <td>486</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>201811</td>\n",
+       "      <td>createaccount</td>\n",
+       "      <td>35421</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26</th>\n",
+       "      <td>201811</td>\n",
+       "      <td>delete</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>201811</td>\n",
+       "      <td>edit</td>\n",
+       "      <td>216220</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>201811</td>\n",
+       "      <td>move</td>\n",
+       "      <td>1048</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>201811</td>\n",
+       "      <td>upload</td>\n",
+       "      <td>56</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>535</th>\n",
+       "      <td>200911</td>\n",
+       "      <td>move</td>\n",
+       "      <td>128</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>536</th>\n",
+       "      <td>200910</td>\n",
+       "      <td>createaccount</td>\n",
+       "      <td>145</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>537</th>\n",
+       "      <td>200910</td>\n",
+       "      <td>edit</td>\n",
+       "      <td>226571</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>538</th>\n",
+       "      <td>200910</td>\n",
+       "      <td>move</td>\n",
+       "      <td>149</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>539</th>\n",
+       "      <td>200909</td>\n",
+       "      <td>createaccount</td>\n",
+       "      <td>35</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>540</th>\n",
+       "      <td>200909</td>\n",
+       "      <td>edit</td>\n",
+       "      <td>213070</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>541</th>\n",
+       "      <td>200909</td>\n",
+       "      <td>move</td>\n",
+       "      <td>103</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>542</th>\n",
+       "      <td>200908</td>\n",
+       "      <td>createaccount</td>\n",
+       "      <td>95</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>543</th>\n",
+       "      <td>200908</td>\n",
+       "      <td>delete</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>544</th>\n",
+       "      <td>200908</td>\n",
+       "      <td>edit</td>\n",
+       "      <td>162038</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>545</th>\n",
+       "      <td>200908</td>\n",
+       "      <td>move</td>\n",
+       "      <td>146</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>546</th>\n",
+       "      <td>200907</td>\n",
+       "      <td>createaccount</td>\n",
+       "      <td>124</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>547</th>\n",
+       "      <td>200907</td>\n",
+       "      <td>delete</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>548</th>\n",
+       "      <td>200907</td>\n",
+       "      <td>edit</td>\n",
+       "      <td>160740</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>549</th>\n",
+       "      <td>200907</td>\n",
+       "      <td>move</td>\n",
+       "      <td>128</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>550</th>\n",
+       "      <td>200906</td>\n",
+       "      <td>createaccount</td>\n",
+       "      <td>141</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>551</th>\n",
+       "      <td>200906</td>\n",
+       "      <td>edit</td>\n",
+       "      <td>178879</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>552</th>\n",
+       "      <td>200906</td>\n",
+       "      <td>move</td>\n",
+       "      <td>156</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>553</th>\n",
+       "      <td>200905</td>\n",
+       "      <td>createaccount</td>\n",
+       "      <td>156</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>554</th>\n",
+       "      <td>200905</td>\n",
+       "      <td>delete</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>555</th>\n",
+       "      <td>200905</td>\n",
+       "      <td>edit</td>\n",
+       "      <td>211506</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>556</th>\n",
+       "      <td>200905</td>\n",
+       "      <td>move</td>\n",
+       "      <td>215</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>557</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>createaccount</td>\n",
+       "      <td>24</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>558</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>delete</td>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>559</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>edit</td>\n",
+       "      <td>178865</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>560</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>move</td>\n",
+       "      <td>256</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>561</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>createaccount</td>\n",
+       "      <td>418</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>562</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>delete</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>563</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>edit</td>\n",
+       "      <td>98346</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>564</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>move</td>\n",
+       "      <td>241</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>565 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     LogMonth      EditorActions    Freq\n",
+       "0      201903  autocreateaccount      47\n",
+       "1      201903      createaccount    4780\n",
+       "2      201903             delete       5\n",
+       "3      201903               edit   37950\n",
+       "4      201903               move      84\n",
+       "5      201903             upload       3\n",
+       "6      201902  autocreateaccount     454\n",
+       "7      201902      createaccount   25204\n",
+       "8      201902             delete       5\n",
+       "9      201902               edit  210488\n",
+       "10     201902               move     445\n",
+       "11     201902             upload      10\n",
+       "12     201901  autocreateaccount     281\n",
+       "13     201901      createaccount   27924\n",
+       "14     201901             delete      21\n",
+       "15     201901               edit  223870\n",
+       "16     201901               move     568\n",
+       "17     201901             upload       4\n",
+       "18     201812  autocreateaccount     102\n",
+       "19     201812      createaccount   35405\n",
+       "20     201812             delete       5\n",
+       "21     201812               edit  189795\n",
+       "22     201812               move     959\n",
+       "23     201812             upload      21\n",
+       "24     201811  autocreateaccount     486\n",
+       "25     201811      createaccount   35421\n",
+       "26     201811             delete       2\n",
+       "27     201811               edit  216220\n",
+       "28     201811               move    1048\n",
+       "29     201811             upload      56\n",
+       "..        ...                ...     ...\n",
+       "535    200911               move     128\n",
+       "536    200910      createaccount     145\n",
+       "537    200910               edit  226571\n",
+       "538    200910               move     149\n",
+       "539    200909      createaccount      35\n",
+       "540    200909               edit  213070\n",
+       "541    200909               move     103\n",
+       "542    200908      createaccount      95\n",
+       "543    200908             delete       2\n",
+       "544    200908               edit  162038\n",
+       "545    200908               move     146\n",
+       "546    200907      createaccount     124\n",
+       "547    200907             delete       1\n",
+       "548    200907               edit  160740\n",
+       "549    200907               move     128\n",
+       "550    200906      createaccount     141\n",
+       "551    200906               edit  178879\n",
+       "552    200906               move     156\n",
+       "553    200905      createaccount     156\n",
+       "554    200905             delete       2\n",
+       "555    200905               edit  211506\n",
+       "556    200905               move     215\n",
+       "557    200904      createaccount      24\n",
+       "558    200904             delete       7\n",
+       "559    200904               edit  178865\n",
+       "560    200904               move     256\n",
+       "561    200903      createaccount     418\n",
+       "562    200903             delete       3\n",
+       "563    200903               edit   98346\n",
+       "564    200903               move     241\n",
+       "\n",
+       "[565 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Which editor actions (edit, move, createaccount, ...) triggered an abuse filter, per month\n",
+    "df_ed_actions = pd.read_csv(\"quarry-34050-which-actions-triggered-an-abuse-filter-en-wiki-run346498.csv\", sep=',')\n",
+    "df_ed_actions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>LogMonth</th>\n",
+       "      <th>Namespace</th>\n",
+       "      <th>Freq</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>5177</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>0</td>\n",
+       "      <td>37653</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>1</td>\n",
+       "      <td>200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1636</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>3</td>\n",
+       "      <td>604</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>4</td>\n",
+       "      <td>159</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>5</td>\n",
+       "      <td>13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>6</td>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>10</td>\n",
+       "      <td>98</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>11</td>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>14</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>15</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>100</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>118</td>\n",
+       "      <td>313</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>119</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>828</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>25658</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>0</td>\n",
+       "      <td>197552</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>1</td>\n",
+       "      <td>727</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>2</td>\n",
+       "      <td>8281</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>3</td>\n",
+       "      <td>2086</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>4</td>\n",
+       "      <td>679</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>5</td>\n",
+       "      <td>63</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>6</td>\n",
+       "      <td>65</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>9</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>10</td>\n",
+       "      <td>503</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>11</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>13</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>14</td>\n",
+       "      <td>34</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>15</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2233</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>4</td>\n",
+       "      <td>568</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2234</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>5</td>\n",
+       "      <td>183</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2235</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>6</td>\n",
+       "      <td>2186</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2236</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>7</td>\n",
+       "      <td>86</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2237</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>9</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2238</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>10</td>\n",
+       "      <td>636</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2239</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>11</td>\n",
+       "      <td>25</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2240</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>12</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2241</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>13</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2242</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>14</td>\n",
+       "      <td>61</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2243</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>15</td>\n",
+       "      <td>16</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2244</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>100</td>\n",
+       "      <td>22</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2245</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>101</td>\n",
+       "      <td>30</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2246</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>418</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2247</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>0</td>\n",
+       "      <td>91967</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2248</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1599</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2249</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1986</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2250</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1510</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2251</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>4</td>\n",
+       "      <td>326</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2252</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>5</td>\n",
+       "      <td>107</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2253</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>6</td>\n",
+       "      <td>834</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2254</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>7</td>\n",
+       "      <td>47</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2255</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>10</td>\n",
+       "      <td>110</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2256</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>11</td>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2257</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>12</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2258</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>13</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2259</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>14</td>\n",
+       "      <td>37</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2260</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>15</td>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2261</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>100</td>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2262</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>101</td>\n",
+       "      <td>16</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2263 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      LogMonth  Namespace    Freq\n",
+       "0       201903         -1    5177\n",
+       "1       201903          0   37653\n",
+       "2       201903          1     200\n",
+       "3       201903          2    1636\n",
+       "4       201903          3     604\n",
+       "5       201903          4     159\n",
+       "6       201903          5      13\n",
+       "7       201903          6       7\n",
+       "8       201903         10      98\n",
+       "9       201903         11       8\n",
+       "10      201903         14       5\n",
+       "11      201903         15       2\n",
+       "12      201903        100       4\n",
+       "13      201903        118     313\n",
+       "14      201903        119      11\n",
+       "15      201903        828       1\n",
+       "16      201902         -1   25658\n",
+       "17      201902          0  197552\n",
+       "18      201902          1     727\n",
+       "19      201902          2    8281\n",
+       "20      201902          3    2086\n",
+       "21      201902          4     679\n",
+       "22      201902          5      63\n",
+       "23      201902          6      65\n",
+       "24      201902          9       2\n",
+       "25      201902         10     503\n",
+       "26      201902         11       1\n",
+       "27      201902         13       1\n",
+       "28      201902         14      34\n",
+       "29      201902         15       6\n",
+       "...        ...        ...     ...\n",
+       "2233    200904          4     568\n",
+       "2234    200904          5     183\n",
+       "2235    200904          6    2186\n",
+       "2236    200904          7      86\n",
+       "2237    200904          9       5\n",
+       "2238    200904         10     636\n",
+       "2239    200904         11      25\n",
+       "2240    200904         12       6\n",
+       "2241    200904         13      11\n",
+       "2242    200904         14      61\n",
+       "2243    200904         15      16\n",
+       "2244    200904        100      22\n",
+       "2245    200904        101      30\n",
+       "2246    200903         -1     418\n",
+       "2247    200903          0   91967\n",
+       "2248    200903          1    1599\n",
+       "2249    200903          2    1986\n",
+       "2250    200903          3    1510\n",
+       "2251    200903          4     326\n",
+       "2252    200903          5     107\n",
+       "2253    200903          6     834\n",
+       "2254    200903          7      47\n",
+       "2255    200903         10     110\n",
+       "2256    200903         11      10\n",
+       "2257    200903         12      11\n",
+       "2258    200903         13       5\n",
+       "2259    200903         14      37\n",
+       "2260    200903         15      10\n",
+       "2261    200903        100      15\n",
+       "2262    200903        101      16\n",
+       "\n",
+       "[2263 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Per-month frequency of abuse filter triggers by namespace (Quarry CSV export, see file name)\n",
+    "df_namespaces = pd.read_csv(\"quarry-34072-edits-in-which-namespaces-actions-triggered-an-abuse-filter-en-wiki-run346852.csv\")\n",
+    "df_namespaces"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Manual tags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('vandalism', 263), ('vandalism?', 162), ('unknown', 71), ('good_faith?', 63), ('misc', 59), ('sockpuppetry', 59), ('good_faith', 48), ('test', 43), ('spam?', 41), ('long_term_abuse', 35), ('sockpuppetry?', 35), ('harassment?', 31), ('harassment', 24), ('abuse?', 21), ('biased_pov', 17), ('spam', 17), ('biased_pov?', 15), ('unclear', 14), ('bad_style', 13), ('bad_style?', 12), ('bug?', 10), ('wiki_policy?', 9), ('long_term_abuse?', 9), ('misc?', 8), ('seo', 8), ('politically_motivated?', 8), ('maintenance', 7), ('trolling?', 7), ('maintenance?', 6), ('personal_attacks', 6), ('bug', 5), ('vandalbot', 5), ('page_move_vandalism', 5), ('silly_vandalism', 5), ('lazyness', 4), ('seo?', 4), ('test?', 4), ('hoaxing?', 4), ('personal_attacks?', 4), ('edit_warring?', 3), ('copyright', 3), ('image_vandalism', 3), ('talk_page_vandalism', 3), ('page_move_vandalism?', 3), ('conflict_of_interest', 3), ('stockbrocker_vandalism', 3), ('copyright?', 2), ('vandalbot?', 2), ('religious_vandalism?', 2), ('politically_motivated', 2), ('self_promotion?', 2), ('template_spam', 2), ('hoaxing', 2), ('silly_vandalism?', 2), ('doxxing?', 2), ('not_polite', 1), ('template_vandalism', 1), ('religious_vandalism', 1), ('self_promotion', 1), ('abuse', 1), ('template_vandalism?', 1), ('link_vandalism?', 1), ('abuse_of_tags_vandalism?', 1), ('avoidant_vandalism', 1), ('guideline_vio?', 1), ('username_vandalism?', 1), ('phishing?', 1), ('avoidant_vandalism?', 1), ('malware?', 1), ('malware', 1), ('conflict_of_interest?', 1), ('impersonation', 1), ('prank', 1)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Count how often each manual tag occurs; strip whitespace so that e.g. 'maintenance? ' and 'maintenance?' are one tag\n",
+    "manual_tags = df['manual_tags']\n",
+    "manual_tags_list = [[tag.strip() for tag in x.split(\",\")] for x in manual_tags]\n",
+    "all_tags = flatten(manual_tags_list)\n",
+    "print(collections.Counter(all_tags).most_common())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "('vandalism', 263),\n",
+    "('vandalism?', 162),\n",
+    "  ('spam?', 41),\n",
+    "  ('spam', 17),\n",
+    "  ('vandalbot', 5),\n",
+    "  ('vandalbot?', 2),\n",
+    "  ('page_move_vandalism', 5),\n",
+    "  ('page_move_vandalism?', 3),\n",
+    "  ('silly_vandalism', 5),\n",
+    "  ('silly_vandalism?', 2),\n",
+    "  ('trolling?', 7),\n",
+    "  ('hoaxing?', 4),\n",
+    "  ('hoaxing', 2),\n",
+    "  ('copyright', 3),\n",
+    "  ('copyright?', 2),\n",
+    "  ('image_vandalism', 3),\n",
+    "  ('talk_page_vandalism', 3),\n",
+    "  ('template_vandalism?', 1),\n",
+    "  ('template_vandalism', 1),\n",
+    "  ('template_spam', 2),\n",
+    "  ('link_vandalism?', 1),\n",
+    "  ('abuse_of_tags_vandalism?', 1),\n",
+    "  ('avoidant_vandalism', 1),\n",
+    "  ('avoidant_vandalism?', 1),\n",
+    "  ('username_vandalism?', 1),\n",
+    "\n",
+    "('prank', 1)\n",
+    "\n",
+    "('phishing?', 1),\n",
+    "('malware?', 1),\n",
+    "('malware', 1),\n",
+    "\n",
+    "('guideline_vio?', 1),\n",
+    "\n",
+    "('religious_vandalism?', 3),\n",
+    "('politically_motivated?', 8),\n",
+    "('politically_motivated', 2),\n",
+    "\n",
+    "('sockpuppetry', 59),\n",
+    "('sockpuppetry?', 35),\n",
+    "('long_term_abuse', 35),\n",
+    "('long_term_abuse?', 9),\n",
+    "('abuse', 1),\n",
+    "('abuse?', 21),\n",
+    "('harassment?', 31),\n",
+    "('harassment', 24),\n",
+    "('doxxing?', 2),\n",
+    "('personal_attacks', 6),\n",
+    "('personal_attacks?', 4),\n",
+    "('impersonation', 1),\n",
+    "('not_polite', 1),\n",
+    "\n",
+    "('biased_pov', 17),\n",
+    "('biased_pov?', 15),\n",
+    "\n",
+    "('conflict_of_interest', 3),\n",
+    "('stockbrocker_vandalism', 3),\n",
+    "('self_promotion?', 2),\n",
+    "('conflict_of_interest?', 1),\n",
+    "('self_promotion', 1),\n",
+    "\n",
+    "('seo', 8),\n",
+    "('seo?', 4),\n",
+    "\n",
+    "('bad_style', 13),\n",
+    "('bad_style?', 12),\n",
+    "('edit_warring?', 3),\n",
+    "\n",
+    "('good_faith?', 63),\n",
+    "('good_faith', 48),\n",
+    "\n",
+    "('lazyness', 4),\n",
+    "\n",
+    "('maintenance', 7),\n",
+    "('maintenance?', 5),\n",
+    "('maintenance? ', 1),\n",
+    "\n",
+    "('bug', 5),\n",
+    "('bug?', 10),\n",
+    "('wiki_policy?', 9),\n",
+    "\n",
+    "('test', 43),\n",
+    "('test?', 4),\n",
+    "\n",
+    "('unknown', 71),\n",
+    "('misc', 59),\n",
+    "('misc?', 8),\n",
+    "('unclear', 14),"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>afl_filter</th>\n",
+       "      <th>count(*)</th>\n",
+       "      <th>manual_tags</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>135</td>\n",
+       "      <td>175455</td>\n",
+       "      <td>vandalism</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>30</td>\n",
+       "      <td>160302</td>\n",
+       "      <td>good_faith, vandalism</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>61</td>\n",
+       "      <td>147377</td>\n",
+       "      <td>good_faith</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>18</td>\n",
+       "      <td>133640</td>\n",
+       "      <td>lazyness</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>3</td>\n",
+       "      <td>95916</td>\n",
+       "      <td>good_faith</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>172</td>\n",
+       "      <td>89710</td>\n",
+       "      <td>good_faith</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>50</td>\n",
+       "      <td>88827</td>\n",
+       "      <td>vandalism, good_faith</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>98</td>\n",
+       "      <td>80434</td>\n",
+       "      <td>good_faith</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>65</td>\n",
+       "      <td>74098</td>\n",
+       "      <td>vandalism, good_faith?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>132</td>\n",
+       "      <td>68607</td>\n",
+       "      <td>vandalism, good_faith</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>46</td>\n",
+       "      <td>47280</td>\n",
+       "      <td>vandalism</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>225</td>\n",
+       "      <td>45462</td>\n",
+       "      <td>vandalism</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>180</td>\n",
+       "      <td>37713</td>\n",
+       "      <td>good_faith</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>79</td>\n",
+       "      <td>36645</td>\n",
+       "      <td>good_faith</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>189</td>\n",
+       "      <td>35683</td>\n",
+       "      <td>vandalism, harassment</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>149</td>\n",
+       "      <td>32336</td>\n",
+       "      <td>misc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>9</td>\n",
+       "      <td>28972</td>\n",
+       "      <td>harassment</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>129</td>\n",
+       "      <td>27780</td>\n",
+       "      <td>vandalism, sockpuppetry</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>29</td>\n",
+       "      <td>27130</td>\n",
+       "      <td>good_faith</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>148</td>\n",
+       "      <td>24914</td>\n",
+       "      <td>biased_pov</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>231</td>\n",
+       "      <td>21507</td>\n",
+       "      <td>vandalism</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>220</td>\n",
+       "      <td>19946</td>\n",
+       "      <td>misc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>39</td>\n",
+       "      <td>18456</td>\n",
+       "      <td>vandalism</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>80</td>\n",
+       "      <td>18189</td>\n",
+       "      <td>vandalism, biased_pov, seo</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>12</td>\n",
+       "      <td>18159</td>\n",
+       "      <td>vandalism</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>155</td>\n",
+       "      <td>17517</td>\n",
+       "      <td>misc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26</th>\n",
+       "      <td>106</td>\n",
+       "      <td>14513</td>\n",
+       "      <td>misc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>96</td>\n",
+       "      <td>14399</td>\n",
+       "      <td>good_faith?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>11</td>\n",
+       "      <td>14368</td>\n",
+       "      <td>vandalism, harassment</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>28</td>\n",
+       "      <td>12264</td>\n",
+       "      <td>good_faith?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>189</th>\n",
+       "      <td>245</td>\n",
+       "      <td>7</td>\n",
+       "      <td>abuse?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>190</th>\n",
+       "      <td>257</td>\n",
+       "      <td>7</td>\n",
+       "      <td>vandalism</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>191</th>\n",
+       "      <td>275</td>\n",
+       "      <td>6</td>\n",
+       "      <td>page_move_vandalism?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>192</th>\n",
+       "      <td>70</td>\n",
+       "      <td>6</td>\n",
+       "      <td>page_move_vandalism?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>193</th>\n",
+       "      <td>214</td>\n",
+       "      <td>6</td>\n",
+       "      <td>self_promotion?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>194</th>\n",
+       "      <td>207</td>\n",
+       "      <td>6</td>\n",
+       "      <td>avoidant_vandalism?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>195</th>\n",
+       "      <td>38</td>\n",
+       "      <td>6</td>\n",
+       "      <td>unknown, abuse?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>196</th>\n",
+       "      <td>57</td>\n",
+       "      <td>6</td>\n",
+       "      <td>personal_attacks, doxxing?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>197</th>\n",
+       "      <td>49</td>\n",
+       "      <td>5</td>\n",
+       "      <td>spam</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>198</th>\n",
+       "      <td>69</td>\n",
+       "      <td>5</td>\n",
+       "      <td>page_move_vandalism?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>199</th>\n",
+       "      <td>109</td>\n",
+       "      <td>5</td>\n",
+       "      <td>unknown</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>200</th>\n",
+       "      <td>20</td>\n",
+       "      <td>5</td>\n",
+       "      <td>good_faith?, vandalism?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>201</th>\n",
+       "      <td>2</td>\n",
+       "      <td>4</td>\n",
+       "      <td>test</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>202</th>\n",
+       "      <td>127</td>\n",
+       "      <td>4</td>\n",
+       "      <td>good_faith?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>203</th>\n",
+       "      <td>173</td>\n",
+       "      <td>4</td>\n",
+       "      <td>unknown</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>204</th>\n",
+       "      <td>40</td>\n",
+       "      <td>4</td>\n",
+       "      <td>vandalism, hoaxing?, personal_attacks?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>205</th>\n",
+       "      <td>244</td>\n",
+       "      <td>3</td>\n",
+       "      <td>bug?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>206</th>\n",
+       "      <td>184</td>\n",
+       "      <td>3</td>\n",
+       "      <td>vandalism</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>207</th>\n",
+       "      <td>251</td>\n",
+       "      <td>3</td>\n",
+       "      <td>vandalism</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>208</th>\n",
+       "      <td>243</td>\n",
+       "      <td>3</td>\n",
+       "      <td>malware</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>209</th>\n",
+       "      <td>73</td>\n",
+       "      <td>2</td>\n",
+       "      <td>test</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>210</th>\n",
+       "      <td>118</td>\n",
+       "      <td>2</td>\n",
+       "      <td>vandalism?, wiki_policy?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>211</th>\n",
+       "      <td>162</td>\n",
+       "      <td>2</td>\n",
+       "      <td>unknown</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>212</th>\n",
+       "      <td>142</td>\n",
+       "      <td>1</td>\n",
+       "      <td>unknown</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>213</th>\n",
+       "      <td>120</td>\n",
+       "      <td>1</td>\n",
+       "      <td>doxxing?</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>214</th>\n",
+       "      <td>196</td>\n",
+       "      <td>1</td>\n",
+       "      <td>vandalism</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>215</th>\n",
+       "      <td>121</td>\n",
+       "      <td>1</td>\n",
+       "      <td>test</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>216</th>\n",
+       "      <td>198</td>\n",
+       "      <td>1</td>\n",
+       "      <td>test</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>217</th>\n",
+       "      <td>145</td>\n",
+       "      <td>1</td>\n",
+       "      <td>sockpuppetry</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>218</th>\n",
+       "      <td>230</td>\n",
+       "      <td>1</td>\n",
+       "      <td>unknown</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>219 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     afl_filter  count(*)                             manual_tags\n",
+       "0           135    175455                               vandalism\n",
+       "1            30    160302                   good_faith, vandalism\n",
+       "2            61    147377                              good_faith\n",
+       "3            18    133640                                lazyness\n",
+       "4             3     95916                              good_faith\n",
+       "5           172     89710                              good_faith\n",
+       "6            50     88827                   vandalism, good_faith\n",
+       "7            98     80434                              good_faith\n",
+       "8            65     74098                  vandalism, good_faith?\n",
+       "9           132     68607                   vandalism, good_faith\n",
+       "10           46     47280                               vandalism\n",
+       "11          225     45462                               vandalism\n",
+       "12          180     37713                              good_faith\n",
+       "13           79     36645                              good_faith\n",
+       "14          189     35683                   vandalism, harassment\n",
+       "15          149     32336                                    misc\n",
+       "16            9     28972                              harassment\n",
+       "17          129     27780                 vandalism, sockpuppetry\n",
+       "18           29     27130                              good_faith\n",
+       "19          148     24914                              biased_pov\n",
+       "20          231     21507                               vandalism\n",
+       "21          220     19946                                    misc\n",
+       "22           39     18456                               vandalism\n",
+       "23           80     18189              vandalism, biased_pov, seo\n",
+       "24           12     18159                               vandalism\n",
+       "25          155     17517                                    misc\n",
+       "26          106     14513                                    misc\n",
+       "27           96     14399                             good_faith?\n",
+       "28           11     14368                   vandalism, harassment\n",
+       "29           28     12264                             good_faith?\n",
+       "..          ...       ...                                     ...\n",
+       "189         245         7                                  abuse?\n",
+       "190         257         7                               vandalism\n",
+       "191         275         6                    page_move_vandalism?\n",
+       "192          70         6                    page_move_vandalism?\n",
+       "193         214         6                         self_promotion?\n",
+       "194         207         6                     avoidant_vandalism?\n",
+       "195          38         6                         unknown, abuse?\n",
+       "196          57         6              personal_attacks, doxxing?\n",
+       "197          49         5                                    spam\n",
+       "198          69         5                    page_move_vandalism?\n",
+       "199         109         5                                 unknown\n",
+       "200          20         5                 good_faith?, vandalism?\n",
+       "201           2         4                                    test\n",
+       "202         127         4                             good_faith?\n",
+       "203         173         4                                 unknown\n",
+       "204          40         4  vandalism, hoaxing?, personal_attacks?\n",
+       "205         244         3                                    bug?\n",
+       "206         184         3                               vandalism\n",
+       "207         251         3                               vandalism\n",
+       "208         243         3                                 malware\n",
+       "209          73         2                                    test\n",
+       "210         118         2                vandalism?, wiki_policy?\n",
+       "211         162         2                                 unknown\n",
+       "212         142         1                                 unknown\n",
+       "213         120         1                                doxxing?\n",
+       "214         196         1                               vandalism\n",
+       "215         121         1                                    test\n",
+       "216         198         1                                    test\n",
+       "217         145         1                            sockpuppetry\n",
+       "218         230         1                                 unknown\n",
+       "\n",
+       "[219 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Attach each filter's manual tags to the 2009 log data (inner join on the filter ID)\n",
+    "df_logs_2009 = pd.read_csv(\"data/log-entries-yearly/2009.csv\")\n",
+    "df_ids_manual_tags = df.loc[:, ['af_id', 'manual_tags']]\n",
+    "df_2009_tags = df_logs_2009.join(df_ids_manual_tags.set_index('af_id'), how='inner', on='afl_filter')\n",
+    "df_2009_tags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('vandalism', 66), ('vandalism?', 37), ('good_faith?', 24), ('good_faith', 21), ('misc', 18), ('sockpuppetry', 10), ('unknown', 10), ('test', 9), ('abuse?', 7), ('spam?', 6), ('harassment', 5), ('harassment?', 5), ('sockpuppetry?', 5), ('biased_pov?', 4), ('bad_style', 4), ('politically_motivated?', 3), ('maintenance', 3), ('personal_attacks?', 3), ('page_move_vandalism?', 3), ('biased_pov', 2), ('seo', 2), ('misc?', 2), ('copyright?', 2), ('long_term_abuse', 2), ('unclear', 2), ('wiki_policy?', 2), ('hoaxing?', 2), ('silly_vandalism', 2), ('spam', 2), ('bug?', 2), ('doxxing?', 2), ('lazyness', 1), ('bad_style?', 1), ('seo?', 1), ('personal_attack', 1), ('maintenance?', 1), ('trolling?', 1), ('religious_vandalism?', 1), ('template_vandalism?', 1), ('link_vandalism?', 1), ('politically_motivated', 1), ('guideline_vio?', 1), ('silly_vandalism?', 1), ('template_spam', 1), ('self_promotion?', 1), ('avoidant_vandalism?', 1), ('personal_attacks', 1), ('malware', 1)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Tag frequencies over the rows of df_2009_tags: every row counts once, regardless of its hit count\n",
+    "tags_list_2009 = flatten([[tag.strip() for tag in x.split(\",\")] for x in df_2009_tags['manual_tags']])\n",
+    "# TODO: weight each tag by the row's hit count (the 'count(*)' column) instead of counting rows\n",
+    "\n",
+    "print(collections.Counter(tags_list_2009).most_common())"
    ]
   },
   {
-- 
GitLab