From 1e61d1e6faebefc6779e163f530cdd5dd082411c Mon Sep 17 00:00:00 2001 From: Lyudmila Vaseva <vaseva@mi.fu-berlin.de> Date: Wed, 6 Mar 2019 20:04:40 +0100 Subject: [PATCH] Explore namespaces, editors actions and tags per year --- src/explore.ipynb | 1708 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 1589 insertions(+), 119 deletions(-) diff --git a/src/explore.ipynb b/src/explore.ipynb index 091282a..e85452a 100644 --- a/src/explore.ipynb +++ b/src/explore.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -636,7 +636,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -1809,126 +1809,1596 @@ "df_actions.fillna('log')" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Explore Manual Tags" - ] - }, { "cell_type": "code", - "execution_count": 39, - "metadata": {}, + "execution_count": 2, + "metadata": { + "scrolled": true + }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('vandalism', 263), ('vandalism?', 162), ('unknown', 71), ('good_faith?', 63), ('misc', 59), ('sockpuppetry', 59), ('good_faith', 48), ('test', 43), ('spam?', 41), ('long_term_abuse', 35), ('sockpuppetry?', 35), ('harassment?', 31), ('harassment', 24), ('abuse?', 21), ('biased_pov', 17), ('spam', 17), ('biased_pov?', 15), ('unclear', 14), ('bad_style', 13), ('bad_style?', 12), ('bug?', 10), ('wiki_policy?', 9), ('long_term_abuse?', 9), ('misc?', 8), ('seo', 8), ('politically_motivated?', 8), ('maintenance', 7), ('trolling?', 7), ('maintenance?', 6), ('personal_attacks', 6), ('bug', 5), ('vandalbot', 5), ('page_move_vandalism', 5), ('silly_vandalism', 5), ('lazyness', 4), ('seo?', 4), ('test?', 4), ('hoaxing?', 4), ('personal_attacks?', 4), ('edit_warring?', 3), ('copyright', 3), ('image_vandalism', 3), ('talk_page_vandalism', 3), ('page_move_vandalism?', 3), ('conflict_of_interest', 3), ('stockbrocker_vandalism', 3), ('copyright?', 2), ('vandalbot?', 2), ('religious_vandalism?', 2), ('politically_motivated', 2), ('self_promotion?', 2), ('template_spam', 2), ('hoaxing', 2), ('silly_vandalism?', 2), ('doxxing?', 2), ('not_polite', 1), ('template_vandalism', 1), ('religious_vandalism', 1), ('self_promotion', 1), ('abuse', 1), ('template_vandalism?', 1), ('link_vandalism?', 1), ('abuse_of_tags_vandalism?', 1), ('avoidant_vandalism', 1), ('guideline_vio?', 1), ('username_vandalism?', 1), ('phishing?', 1), ('avoidant_vandalism?', 1), ('malware?', 1), ('malware', 1), ('conflict_of_interest?', 1), ('impersonation', 1), ('prank', 1)]\n" - ] - } - ], - "source": [ - "manual_tags = df['manual_tags']\n", - "manual_tags_list = [x.split(\", \") for x in list(manual_tags)]\n", - "all_tags = flatten(manual_tags_list)\n", - "\n", - "print(collections.Counter(all_tags).most_common())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "('vandalism', 263),\n", - "('vandalism?', 162),\n", - " ('spam?', 41),\n", - " ('spam', 17),\n", - " ('vandalbot', 5),\n", - " ('vandalbot?', 2),\n", - " ('page_move_vandalism', 5),\n", - " ('page_move_vandalism?', 3),\n", - " ('silly_vandalism', 5),\n", - " ('silly_vandalism?', 2),\n", - " ('trolling?', 7),\n", - " ('hoaxing?', 4),\n", - " ('hoaxing', 2),\n", - " ('copyright', 3),\n", - " ('copyright?', 2),\n", - " ('image_vandalism', 3),\n", - " ('talk_page_vandalism', 3),\n", - " ('template_vandalism?', 1),\n", - " ('template_vandalism', 1),\n", - " ('template_spam', 2),\n", - " ('link_vandalism?', 1),\n", - " ('abuse_of_tags_vandalism?', 1),\n", - " ('avoidant_vandalism', 1),\n", - " ('avoidant_vandalism?', 1),\n", - " ('username_vandalism?', 1),\n", - "\n", - "('prank', 1)\n", - "\n", - "('phishing?', 1),\n", - "('malware?', 1),\n", - "('malware', 1),\n", - "\n", - "('guideline_vio?', 1),\n", - "\n", - "('religious_vandalism?', 3),\n", - "('politically_motivated?', 8),\n", - "('politically_motivated', 2),\n", - "\n", - "('sockpuppetry', 59),\n", - "('sockpuppetry?', 35),\n", - "('long_term_abuse', 35),\n", - "('long_term_abuse?', 9),\n", - "('abuse', 1),\n", - "('abuse?', 21),\n", - "('harassment?', 31),\n", - "('harassment', 24),\n", - "('doxxing?', 2),\n", - "('personal_attacks', 6),\n", - "('personal_attacks?', 4),\n", - "('impersonation', 1),\n", - "('not_polite', 1),\n", - "\n", - "('biased_pov', 17),\n", - "('biased_pov?', 15),\n", - "\n", - "('conflict_of_interest', 3),\n", - "('stockbrocker_vandalism', 3),\n", - "('self_promotion?', 2),\n", - "('conflict_of_interest?', 1),\n", - "('self_promotion', 1),\n", - "\n", - "('seo', 8),\n", - "('seo?', 4),\n", - "\n", - "('bad_style', 13),\n", - "('bad_style?', 12),\n", - "('edit_warring?', 3),\n", - "\n", - "('good_faith?', 63),\n", - "('good_faith', 48),\n", - "\n", - "('lazyness', 4),\n", - "\n", - "('maintenance', 7),\n", - "('maintenance?', 5),\n", - "('maintenance? ', 1),\n", - "\n", - "('bug', 5),\n", - "('bug?', 10),\n", - "('wiki_policy?', 9),\n", - "\n", - "('test', 43),\n", - "('test?', 4),\n", - "\n", - "('unknown', 71),\n", - "('misc', 59),\n", - "('misc?', 8),\n", - "('unclear', 14)," + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>LogMonth</th>\n", + " <th>EditorActions</th>\n", + " <th>Freq</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>201903</td>\n", + " <td>autocreateaccount</td>\n", + " <td>47</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>201903</td>\n", + " <td>createaccount</td>\n", + " <td>4780</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>201903</td>\n", + " <td>delete</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>201903</td>\n", + " <td>edit</td>\n", + " <td>37950</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>201903</td>\n", + " <td>move</td>\n", + " <td>84</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>201903</td>\n", + " <td>upload</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>201902</td>\n", + " <td>autocreateaccount</td>\n", + " <td>454</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>201902</td>\n", + " <td>createaccount</td>\n", + " <td>25204</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>201902</td>\n", + " <td>delete</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>201902</td>\n", + " <td>edit</td>\n", + " <td>210488</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>201902</td>\n", + " <td>move</td>\n", + " <td>445</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>201902</td>\n", + " <td>upload</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>201901</td>\n", + " <td>autocreateaccount</td>\n", + " <td>281</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>201901</td>\n", + " <td>createaccount</td>\n", + " <td>27924</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>201901</td>\n", + " <td>delete</td>\n", + " <td>21</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>201901</td>\n", + " <td>edit</td>\n", + " <td>223870</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>201901</td>\n", + " <td>move</td>\n", + " <td>568</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>201901</td>\n", + " <td>upload</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>201812</td>\n", + " <td>autocreateaccount</td>\n", + " <td>102</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>201812</td>\n", + " <td>createaccount</td>\n", + " <td>35405</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>201812</td>\n", + " <td>delete</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>201812</td>\n", + " <td>edit</td>\n", + " <td>189795</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>201812</td>\n", + " <td>move</td>\n", + " <td>959</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>201812</td>\n", + " <td>upload</td>\n", + " <td>21</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>201811</td>\n", + " <td>autocreateaccount</td>\n", + " <td>486</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>201811</td>\n", + " <td>createaccount</td>\n", + " <td>35421</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>201811</td>\n", + " <td>delete</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>201811</td>\n", + " <td>edit</td>\n", + " <td>216220</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>201811</td>\n", + " <td>move</td>\n", + " <td>1048</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>201811</td>\n", + " <td>upload</td>\n", + " <td>56</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>535</th>\n", + " <td>200911</td>\n", + " <td>move</td>\n", + " <td>128</td>\n", + " </tr>\n", + " <tr>\n", + " <th>536</th>\n", + " <td>200910</td>\n", + " <td>createaccount</td>\n", + " <td>145</td>\n", + " </tr>\n", + " <tr>\n", + " <th>537</th>\n", + " <td>200910</td>\n", + " <td>edit</td>\n", + " <td>226571</td>\n", + " </tr>\n", + " <tr>\n", + " <th>538</th>\n", + " <td>200910</td>\n", + " <td>move</td>\n", + " <td>149</td>\n", + " </tr>\n", + " <tr>\n", + " <th>539</th>\n", + " <td>200909</td>\n", + " <td>createaccount</td>\n", + " <td>35</td>\n", + " </tr>\n", + " <tr>\n", + " <th>540</th>\n", + " <td>200909</td>\n", + " <td>edit</td>\n", + " <td>213070</td>\n", + " </tr>\n", + " <tr>\n", + " <th>541</th>\n", + " <td>200909</td>\n", + " <td>move</td>\n", + " <td>103</td>\n", + " </tr>\n", + " <tr>\n", + " <th>542</th>\n", + " <td>200908</td>\n", + " <td>createaccount</td>\n", + " <td>95</td>\n", + " </tr>\n", + " <tr>\n", + " <th>543</th>\n", + " <td>200908</td>\n", + " <td>delete</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>544</th>\n", + " <td>200908</td>\n", + " <td>edit</td>\n", + " <td>162038</td>\n", + " </tr>\n", + " <tr>\n", + " <th>545</th>\n", + " <td>200908</td>\n", + " <td>move</td>\n", + " <td>146</td>\n", + " </tr>\n", + " <tr>\n", + " <th>546</th>\n", + " <td>200907</td>\n", + " <td>createaccount</td>\n", + " <td>124</td>\n", + " </tr>\n", + " <tr>\n", + " <th>547</th>\n", + " <td>200907</td>\n", + " <td>delete</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>548</th>\n", + " <td>200907</td>\n", + " <td>edit</td>\n", + " <td>160740</td>\n", + " </tr>\n", + " <tr>\n", + " <th>549</th>\n", + " <td>200907</td>\n", + " <td>move</td>\n", + " <td>128</td>\n", + " </tr>\n", + " <tr>\n", + " <th>550</th>\n", + " <td>200906</td>\n", + " <td>createaccount</td>\n", + " <td>141</td>\n", + " </tr>\n", + " <tr>\n", + " <th>551</th>\n", + " <td>200906</td>\n", + " <td>edit</td>\n", + " <td>178879</td>\n", + " </tr>\n", + " <tr>\n", + " <th>552</th>\n", + " <td>200906</td>\n", + " <td>move</td>\n", + " <td>156</td>\n", + " </tr>\n", + " <tr>\n", + " <th>553</th>\n", + " <td>200905</td>\n", + " <td>createaccount</td>\n", + " <td>156</td>\n", + " </tr>\n", + " <tr>\n", + " <th>554</th>\n", + " <td>200905</td>\n", + " <td>delete</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>555</th>\n", + " <td>200905</td>\n", + " <td>edit</td>\n", + " <td>211506</td>\n", + " </tr>\n", + " <tr>\n", + " <th>556</th>\n", + " <td>200905</td>\n", + " <td>move</td>\n", + " <td>215</td>\n", + " </tr>\n", + " <tr>\n", + " <th>557</th>\n", + " <td>200904</td>\n", + " <td>createaccount</td>\n", + " <td>24</td>\n", + " </tr>\n", + " <tr>\n", + " <th>558</th>\n", + " <td>200904</td>\n", + " <td>delete</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>559</th>\n", + " <td>200904</td>\n", + " <td>edit</td>\n", + " <td>178865</td>\n", + " </tr>\n", + " <tr>\n", + " <th>560</th>\n", + " <td>200904</td>\n", + " <td>move</td>\n", + " <td>256</td>\n", + " </tr>\n", + " <tr>\n", + " <th>561</th>\n", + " <td>200903</td>\n", + " <td>createaccount</td>\n", + " <td>418</td>\n", + " </tr>\n", + " <tr>\n", + " <th>562</th>\n", + " <td>200903</td>\n", + " <td>delete</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>563</th>\n", + " <td>200903</td>\n", + " <td>edit</td>\n", + " <td>98346</td>\n", + " </tr>\n", + " <tr>\n", + " <th>564</th>\n", + " <td>200903</td>\n", + " <td>move</td>\n", + " <td>241</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>565 rows × 3 columns</p>\n", + "</div>" + ], + "text/plain": [ + " LogMonth EditorActions Freq\n", + "0 201903 autocreateaccount 47\n", + "1 201903 createaccount 4780\n", + "2 201903 delete 5\n", + "3 201903 edit 37950\n", + "4 201903 move 84\n", + "5 201903 upload 3\n", + "6 201902 autocreateaccount 454\n", + "7 201902 createaccount 25204\n", + "8 201902 delete 5\n", + "9 201902 edit 210488\n", + "10 201902 move 445\n", + "11 201902 upload 10\n", + "12 201901 autocreateaccount 281\n", + "13 201901 createaccount 27924\n", + "14 201901 delete 21\n", + "15 201901 edit 223870\n", + "16 201901 move 568\n", + "17 201901 upload 4\n", + "18 201812 autocreateaccount 102\n", + "19 201812 createaccount 35405\n", + "20 201812 delete 5\n", + "21 201812 edit 189795\n", + "22 201812 move 959\n", + "23 201812 upload 21\n", + "24 201811 autocreateaccount 486\n", + "25 201811 createaccount 35421\n", + "26 201811 delete 2\n", + "27 201811 edit 216220\n", + "28 201811 move 1048\n", + "29 201811 upload 56\n", + ".. ... ... ...\n", + "535 200911 move 128\n", + "536 200910 createaccount 145\n", + "537 200910 edit 226571\n", + "538 200910 move 149\n", + "539 200909 createaccount 35\n", + "540 200909 edit 213070\n", + "541 200909 move 103\n", + "542 200908 createaccount 95\n", + "543 200908 delete 2\n", + "544 200908 edit 162038\n", + "545 200908 move 146\n", + "546 200907 createaccount 124\n", + "547 200907 delete 1\n", + "548 200907 edit 160740\n", + "549 200907 move 128\n", + "550 200906 createaccount 141\n", + "551 200906 edit 178879\n", + "552 200906 move 156\n", + "553 200905 createaccount 156\n", + "554 200905 delete 2\n", + "555 200905 edit 211506\n", + "556 200905 move 215\n", + "557 200904 createaccount 24\n", + "558 200904 delete 7\n", + "559 200904 edit 178865\n", + "560 200904 move 256\n", + "561 200903 createaccount 418\n", + "562 200903 delete 3\n", + "563 200903 edit 98346\n", + "564 200903 move 241\n", + "\n", + "[565 rows x 3 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Which editors' actions triggered a filter\n", + "df_ed_actions = pd.read_csv(\"quarry-34050-which-actions-triggered-an-abuse-filter-en-wiki-run346498.csv\", sep=',')\n", + "df_ed_actions" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>LogMonth</th>\n", + " <th>Namespace</th>\n", + " <th>Freq</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>201903</td>\n", + " <td>-1</td>\n", + " <td>5177</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>201903</td>\n", + " <td>0</td>\n", + " <td>37653</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>201903</td>\n", + " <td>1</td>\n", + " <td>200</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>201903</td>\n", + " <td>2</td>\n", + " <td>1636</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>201903</td>\n", + " <td>3</td>\n", + " <td>604</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>201903</td>\n", + " <td>4</td>\n", + " <td>159</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>201903</td>\n", + " <td>5</td>\n", + " <td>13</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>201903</td>\n", + " <td>6</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>201903</td>\n", + " <td>10</td>\n", + " <td>98</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>201903</td>\n", + " <td>11</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>201903</td>\n", + " <td>14</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>201903</td>\n", + " <td>15</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>201903</td>\n", + " <td>100</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>201903</td>\n", + " <td>118</td>\n", + " <td>313</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>201903</td>\n", + " <td>119</td>\n", + " <td>11</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>201903</td>\n", + " <td>828</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>201902</td>\n", + " <td>-1</td>\n", + " <td>25658</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>201902</td>\n", + " <td>0</td>\n", + " <td>197552</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>201902</td>\n", + " <td>1</td>\n", + " <td>727</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>201902</td>\n", + " <td>2</td>\n", + " <td>8281</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>201902</td>\n", + " <td>3</td>\n", + " <td>2086</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>201902</td>\n", + " <td>4</td>\n", + " <td>679</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>201902</td>\n", + " <td>5</td>\n", + " <td>63</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>201902</td>\n", + " <td>6</td>\n", + " <td>65</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>201902</td>\n", + " <td>9</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>201902</td>\n", + " <td>10</td>\n", + " <td>503</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>201902</td>\n", + " <td>11</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>201902</td>\n", + " <td>13</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>201902</td>\n", + " <td>14</td>\n", + " <td>34</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>201902</td>\n", + " <td>15</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2233</th>\n", + " <td>200904</td>\n", + " <td>4</td>\n", + " <td>568</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2234</th>\n", + " <td>200904</td>\n", + " <td>5</td>\n", + " <td>183</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2235</th>\n", + " <td>200904</td>\n", + " <td>6</td>\n", + " <td>2186</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2236</th>\n", + " <td>200904</td>\n", + " <td>7</td>\n", + " <td>86</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2237</th>\n", + " <td>200904</td>\n", + " <td>9</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2238</th>\n", + " <td>200904</td>\n", + " <td>10</td>\n", + " <td>636</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2239</th>\n", + " <td>200904</td>\n", + " <td>11</td>\n", + " <td>25</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2240</th>\n", + " <td>200904</td>\n", + " <td>12</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2241</th>\n", + " <td>200904</td>\n", + " <td>13</td>\n", + " <td>11</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2242</th>\n", + " <td>200904</td>\n", + " <td>14</td>\n", + " <td>61</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2243</th>\n", + " <td>200904</td>\n", + " <td>15</td>\n", + " <td>16</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2244</th>\n", + " <td>200904</td>\n", + " <td>100</td>\n", + " <td>22</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2245</th>\n", + " <td>200904</td>\n", + " <td>101</td>\n", + " <td>30</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2246</th>\n", + " <td>200903</td>\n", + " <td>-1</td>\n", + " <td>418</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2247</th>\n", + " <td>200903</td>\n", + " <td>0</td>\n", + " <td>91967</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2248</th>\n", + " <td>200903</td>\n", + " <td>1</td>\n", + " <td>1599</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2249</th>\n", + " <td>200903</td>\n", + " <td>2</td>\n", + " <td>1986</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2250</th>\n", + " <td>200903</td>\n", + " <td>3</td>\n", + " <td>1510</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2251</th>\n", + " <td>200903</td>\n", + " <td>4</td>\n", + " <td>326</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2252</th>\n", + " <td>200903</td>\n", + " <td>5</td>\n", + " <td>107</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2253</th>\n", + " <td>200903</td>\n", + " <td>6</td>\n", + " <td>834</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2254</th>\n", + " <td>200903</td>\n", + " <td>7</td>\n", + " <td>47</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2255</th>\n", + " <td>200903</td>\n", + " <td>10</td>\n", + " <td>110</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2256</th>\n", + " <td>200903</td>\n", + " <td>11</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2257</th>\n", + " <td>200903</td>\n", + " <td>12</td>\n", + " <td>11</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2258</th>\n", + " <td>200903</td>\n", + " <td>13</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2259</th>\n", + " <td>200903</td>\n", + " <td>14</td>\n", + " <td>37</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2260</th>\n", + " <td>200903</td>\n", + " <td>15</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2261</th>\n", + " <td>200903</td>\n", + " <td>100</td>\n", + " <td>15</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2262</th>\n", + " <td>200903</td>\n", + " <td>101</td>\n", + " <td>16</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>2263 rows × 3 columns</p>\n", + "</div>" + ], + "text/plain": [ + " LogMonth Namespace Freq\n", + "0 201903 -1 5177\n", + "1 201903 0 37653\n", + "2 201903 1 200\n", + "3 201903 2 1636\n", + "4 201903 3 604\n", + "5 201903 4 159\n", + "6 201903 5 13\n", + "7 201903 6 7\n", + "8 201903 10 98\n", + "9 201903 11 8\n", + "10 201903 14 5\n", + "11 201903 15 2\n", + "12 201903 100 4\n", + "13 201903 118 313\n", + "14 201903 119 11\n", + "15 201903 828 1\n", + "16 201902 -1 25658\n", + "17 201902 0 197552\n", + "18 201902 1 727\n", + "19 201902 2 8281\n", + "20 201902 3 2086\n", + "21 201902 4 679\n", + "22 201902 5 63\n", + "23 201902 6 65\n", + "24 201902 9 2\n", + "25 201902 10 503\n", + "26 201902 11 1\n", + "27 201902 13 1\n", + "28 201902 14 34\n", + "29 201902 15 6\n", + "... ... ... ...\n", + "2233 200904 4 568\n", + "2234 200904 5 183\n", + "2235 200904 6 2186\n", + "2236 200904 7 86\n", + "2237 200904 9 5\n", + "2238 200904 10 636\n", + "2239 200904 11 25\n", + "2240 200904 12 6\n", + "2241 200904 13 11\n", + "2242 200904 14 61\n", + "2243 200904 15 16\n", + "2244 200904 100 22\n", + "2245 200904 101 30\n", + "2246 200903 -1 418\n", + "2247 200903 0 91967\n", + "2248 200903 1 1599\n", + "2249 200903 2 1986\n", + "2250 200903 3 1510\n", + "2251 200903 4 326\n", + "2252 200903 5 107\n", + "2253 200903 6 834\n", + "2254 200903 7 47\n", + "2255 200903 10 110\n", + "2256 200903 11 10\n", + "2257 200903 12 11\n", + "2258 200903 13 5\n", + "2259 200903 14 37\n", + "2260 200903 15 10\n", + "2261 200903 100 15\n", + "2262 200903 101 16\n", + "\n", + "[2263 rows x 3 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# In which namespaces did a filter trigger occur?\n", + "df_namespaces = pd.read_csv(\"quarry-34072-edits-in-which-namespaces-actions-triggered-an-abuse-filter-en-wiki-run346852.csv\", sep=',')\n", + "df_namespaces" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Manual tags" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('vandalism', 263), ('vandalism?', 162), ('unknown', 71), ('good_faith?', 63), ('misc', 59), ('sockpuppetry', 59), ('good_faith', 48), ('test', 43), ('spam?', 41), ('long_term_abuse', 35), ('sockpuppetry?', 35), ('harassment?', 31), ('harassment', 24), ('abuse?', 21), ('biased_pov', 17), ('spam', 17), ('biased_pov?', 15), ('unclear', 14), ('bad_style', 13), ('bad_style?', 12), ('bug?', 10), ('wiki_policy?', 9), ('long_term_abuse?', 9), ('misc?', 8), ('seo', 8), ('politically_motivated?', 8), ('maintenance', 7), ('trolling?', 7), ('maintenance?', 6), ('personal_attacks', 6), ('bug', 5), ('vandalbot', 5), ('page_move_vandalism', 5), ('silly_vandalism', 5), ('lazyness', 4), ('seo?', 4), ('test?', 4), ('hoaxing?', 4), ('personal_attacks?', 4), ('edit_warring?', 3), ('copyright', 3), ('image_vandalism', 3), ('talk_page_vandalism', 3), ('page_move_vandalism?', 3), ('conflict_of_interest', 3), ('stockbrocker_vandalism', 3), ('copyright?', 2), ('vandalbot?', 2), ('religious_vandalism?', 2), ('politically_motivated', 2), ('self_promotion?', 2), ('template_spam', 2), ('hoaxing', 2), ('silly_vandalism?', 2), ('doxxing?', 2), ('not_polite', 1), ('template_vandalism', 1), ('religious_vandalism', 1), ('self_promotion', 1), ('abuse', 1), ('template_vandalism?', 1), ('link_vandalism?', 1), ('abuse_of_tags_vandalism?', 1), ('avoidant_vandalism', 1), ('guideline_vio?', 1), ('username_vandalism?', 1), ('phishing?', 1), ('avoidant_vandalism?', 1), ('malware?', 1), ('malware', 1), ('conflict_of_interest?', 1), ('impersonation', 1), ('prank', 1)]\n" + ] + } + ], + "source": [ + "manual_tags = df['manual_tags']\n", + "manual_tags_list = [x.split(\", \") for x in list(manual_tags)]\n", + "all_tags = flatten(manual_tags_list)\n", + "\n", + "print(collections.Counter(all_tags).most_common())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "('vandalism', 263),\n", + "('vandalism?', 162),\n", + " ('spam?', 41),\n", + " ('spam', 17),\n", + " ('vandalbot', 5),\n", + " ('vandalbot?', 2),\n", + " ('page_move_vandalism', 5),\n", + " ('page_move_vandalism?', 3),\n", + " ('silly_vandalism', 5),\n", + " ('silly_vandalism?', 2),\n", + " ('trolling?', 7),\n", + " ('hoaxing?', 4),\n", + " ('hoaxing', 2),\n", + " ('copyright', 3),\n", + " ('copyright?', 2),\n", + " ('image_vandalism', 3),\n", + " ('talk_page_vandalism', 3),\n", + " ('template_vandalism?', 1),\n", + " ('template_vandalism', 1),\n", + " ('template_spam', 2),\n", + " ('link_vandalism?', 1),\n", + " ('abuse_of_tags_vandalism?', 1),\n", + " ('avoidant_vandalism', 1),\n", + " ('avoidant_vandalism?', 1),\n", + " ('username_vandalism?', 1),\n", + "\n", + "('prank', 1)\n", + "\n", + "('phishing?', 1),\n", + "('malware?', 1),\n", + "('malware', 1),\n", + "\n", + "('guideline_vio?', 1),\n", + "\n", + "('religious_vandalism?', 3),\n", + "('politically_motivated?', 8),\n", + "('politically_motivated', 2),\n", + "\n", + "('sockpuppetry', 59),\n", + "('sockpuppetry?', 35),\n", + "('long_term_abuse', 35),\n", + "('long_term_abuse?', 9),\n", + "('abuse', 1),\n", + "('abuse?', 21),\n", + "('harassment?', 31),\n", + "('harassment', 24),\n", + "('doxxing?', 2),\n", + "('personal_attacks', 6),\n", + "('personal_attacks?', 4),\n", + "('impersonation', 1),\n", + "('not_polite', 1),\n", + "\n", + "('biased_pov', 17),\n", + "('biased_pov?', 15),\n", + "\n", + "('conflict_of_interest', 3),\n", + "('stockbrocker_vandalism', 3),\n", + "('self_promotion?', 2),\n", + "('conflict_of_interest?', 1),\n", + "('self_promotion', 1),\n", + "\n", + "('seo', 8),\n", + "('seo?', 4),\n", + "\n", + "('bad_style', 13),\n", + "('bad_style?', 12),\n", + "('edit_warring?', 3),\n", + "\n", + "('good_faith?', 63),\n", + "('good_faith', 48),\n", + "\n", + "('lazyness', 4),\n", + "\n", + "('maintenance', 7),\n", + "('maintenance?', 5),\n", + "('maintenance? ', 1),\n", + "\n", + "('bug', 5),\n", + "('bug?', 10),\n", + "('wiki_policy?', 9),\n", + "\n", + "('test', 43),\n", + "('test?', 4),\n", + "\n", + "('unknown', 71),\n", + "('misc', 59),\n", + "('misc?', 8),\n", + "('unclear', 14)," + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>afl_filter</th>\n", + " <th>count(*)</th>\n", + " <th>manual_tags</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>135</td>\n", + " <td>175455</td>\n", + " <td>vandalism</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>30</td>\n", + " <td>160302</td>\n", + " <td>good_faith, vandalism</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>61</td>\n", + " <td>147377</td>\n", + " <td>good_faith</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>18</td>\n", + " <td>133640</td>\n", + " <td>lazyness</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>3</td>\n", + " <td>95916</td>\n", + " <td>good_faith</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>172</td>\n", + " <td>89710</td>\n", + " <td>good_faith</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>50</td>\n", + " <td>88827</td>\n", + " <td>vandalism, good_faith</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>98</td>\n", + " <td>80434</td>\n", + " <td>good_faith</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>65</td>\n", + " <td>74098</td>\n", + " <td>vandalism, good_faith?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>132</td>\n", + " <td>68607</td>\n", + " <td>vandalism, good_faith</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>46</td>\n", + " <td>47280</td>\n", + " <td>vandalism</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>225</td>\n", + " <td>45462</td>\n", + " <td>vandalism</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>180</td>\n", + " <td>37713</td>\n", + " <td>good_faith</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>79</td>\n", + " <td>36645</td>\n", + " <td>good_faith</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>189</td>\n", + " <td>35683</td>\n", + " <td>vandalism, harassment</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>149</td>\n", + " <td>32336</td>\n", + " <td>misc</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>9</td>\n", + " <td>28972</td>\n", + " <td>harassment</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>129</td>\n", + " <td>27780</td>\n", + " <td>vandalism, sockpuppetry</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>29</td>\n", + " <td>27130</td>\n", + " <td>good_faith</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>148</td>\n", + " <td>24914</td>\n", + " <td>biased_pov</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>231</td>\n", + " <td>21507</td>\n", + " <td>vandalism</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>220</td>\n", + " <td>19946</td>\n", + " <td>misc</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>39</td>\n", + " <td>18456</td>\n", + " <td>vandalism</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>80</td>\n", + " <td>18189</td>\n", + " <td>vandalism, biased_pov, seo</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>12</td>\n", + " <td>18159</td>\n", + " <td>vandalism</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>155</td>\n", + " <td>17517</td>\n", + " <td>misc</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>106</td>\n", + " <td>14513</td>\n", + " <td>misc</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>96</td>\n", + " <td>14399</td>\n", + " <td>good_faith?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>11</td>\n", + " <td>14368</td>\n", + " <td>vandalism, harassment</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>28</td>\n", + " <td>12264</td>\n", + " <td>good_faith?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>189</th>\n", + " <td>245</td>\n", + " <td>7</td>\n", + " <td>abuse?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>190</th>\n", + " <td>257</td>\n", + " <td>7</td>\n", + " <td>vandalism</td>\n", + " </tr>\n", + " <tr>\n", + " <th>191</th>\n", + " <td>275</td>\n", + " <td>6</td>\n", + " <td>page_move_vandalism?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>192</th>\n", + " <td>70</td>\n", + " <td>6</td>\n", + " <td>page_move_vandalism?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>193</th>\n", + " <td>214</td>\n", + " <td>6</td>\n", + " <td>self_promotion?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>194</th>\n", + " <td>207</td>\n", + " <td>6</td>\n", + " <td>avoidant_vandalism?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>195</th>\n", + " <td>38</td>\n", + " <td>6</td>\n", + " <td>unknown, abuse?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>196</th>\n", + " <td>57</td>\n", + " <td>6</td>\n", + " <td>personal_attacks, doxxing?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>197</th>\n", + " <td>49</td>\n", + " <td>5</td>\n", + " <td>spam</td>\n", + " </tr>\n", + " <tr>\n", + " <th>198</th>\n", + " <td>69</td>\n", + " <td>5</td>\n", + " <td>page_move_vandalism?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>199</th>\n", + " <td>109</td>\n", + " <td>5</td>\n", + " <td>unknown</td>\n", + " </tr>\n", + " <tr>\n", + " <th>200</th>\n", + " <td>20</td>\n", + " <td>5</td>\n", + " <td>good_faith?, vandalism?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>201</th>\n", + " <td>2</td>\n", + " <td>4</td>\n", + " <td>test</td>\n", + " </tr>\n", + " <tr>\n", + " <th>202</th>\n", + " <td>127</td>\n", + " <td>4</td>\n", + " <td>good_faith?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>203</th>\n", + " <td>173</td>\n", + " <td>4</td>\n", + " <td>unknown</td>\n", + " </tr>\n", + " <tr>\n", + " <th>204</th>\n", + " <td>40</td>\n", + " <td>4</td>\n", + " <td>vandalism, hoaxing?, personal_attacks?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>205</th>\n", + " <td>244</td>\n", + " <td>3</td>\n", + " <td>bug?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>206</th>\n", + " <td>184</td>\n", + " <td>3</td>\n", + " <td>vandalism</td>\n", + " </tr>\n", + " <tr>\n", + " <th>207</th>\n", + " <td>251</td>\n", + " <td>3</td>\n", + " <td>vandalism</td>\n", + " </tr>\n", + " <tr>\n", + " <th>208</th>\n", + " <td>243</td>\n", + " <td>3</td>\n", + " <td>malware</td>\n", + " </tr>\n", + " <tr>\n", + " <th>209</th>\n", + " <td>73</td>\n", + " <td>2</td>\n", + " <td>test</td>\n", + " </tr>\n", + " <tr>\n", + " <th>210</th>\n", + " <td>118</td>\n", + " <td>2</td>\n", + " <td>vandalism?, wiki_policy?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>211</th>\n", + " <td>162</td>\n", + " <td>2</td>\n", + " <td>unknown</td>\n", + " </tr>\n", + " <tr>\n", + " <th>212</th>\n", + " <td>142</td>\n", + " <td>1</td>\n", + " <td>unknown</td>\n", + " </tr>\n", + " <tr>\n", + " <th>213</th>\n", + " <td>120</td>\n", + " <td>1</td>\n", + " <td>doxxing?</td>\n", + " </tr>\n", + " <tr>\n", + " <th>214</th>\n", + " <td>196</td>\n", + " <td>1</td>\n", + " <td>vandalism</td>\n", + " </tr>\n", + " <tr>\n", + " <th>215</th>\n", + " <td>121</td>\n", + " <td>1</td>\n", + " <td>test</td>\n", + " </tr>\n", + " <tr>\n", + " <th>216</th>\n", + " <td>198</td>\n", + " <td>1</td>\n", + " <td>test</td>\n", + " </tr>\n", + " <tr>\n", + " <th>217</th>\n", + " <td>145</td>\n", + " <td>1</td>\n", + " <td>sockpuppetry</td>\n", + " </tr>\n", + " <tr>\n", + " <th>218</th>\n", + " <td>230</td>\n", + " <td>1</td>\n", + " <td>unknown</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>219 rows × 3 columns</p>\n", + "</div>" + ], + "text/plain": [ + " afl_filter count(*) manual_tags\n", + "0 135 175455 vandalism\n", + "1 30 160302 good_faith, vandalism\n", + "2 61 147377 good_faith\n", + "3 18 133640 lazyness\n", + "4 3 95916 good_faith\n", + "5 172 89710 good_faith\n", + "6 50 88827 vandalism, good_faith\n", + "7 98 80434 good_faith\n", + "8 65 74098 vandalism, good_faith?\n", + "9 132 68607 vandalism, good_faith\n", + "10 46 47280 vandalism\n", + "11 225 45462 vandalism\n", + "12 180 37713 good_faith\n", + "13 79 36645 good_faith\n", + "14 189 35683 vandalism, harassment\n", + "15 149 32336 misc\n", + "16 9 28972 harassment\n", + "17 129 27780 vandalism, sockpuppetry\n", + "18 29 27130 good_faith\n", + "19 148 24914 biased_pov\n", + "20 231 21507 vandalism\n", + "21 220 19946 misc\n", + "22 39 18456 vandalism\n", + "23 80 18189 vandalism, biased_pov, seo\n", + "24 12 18159 vandalism\n", + "25 155 17517 misc\n", + "26 106 14513 misc\n", + "27 96 14399 good_faith?\n", + "28 11 14368 vandalism, harassment\n", + "29 28 12264 good_faith?\n", + ".. ... ... ...\n", + "189 245 7 abuse?\n", + "190 257 7 vandalism\n", + "191 275 6 page_move_vandalism?\n", + "192 70 6 page_move_vandalism?\n", + "193 214 6 self_promotion?\n", + "194 207 6 avoidant_vandalism?\n", + "195 38 6 unknown, abuse?\n", + "196 57 6 personal_attacks, doxxing?\n", + "197 49 5 spam\n", + "198 69 5 page_move_vandalism?\n", + "199 109 5 unknown\n", + "200 20 5 good_faith?, vandalism?\n", + "201 2 4 test\n", + "202 127 4 good_faith?\n", + "203 173 4 unknown\n", + "204 40 4 vandalism, hoaxing?, personal_attacks?\n", + "205 244 3 bug?\n", + "206 184 3 vandalism\n", + "207 251 3 vandalism\n", + "208 243 3 malware\n", + "209 73 2 test\n", + "210 118 2 vandalism?, wiki_policy?\n", + "211 162 2 unknown\n", + "212 142 1 unknown\n", + "213 120 1 doxxing?\n", + "214 196 1 vandalism\n", + "215 121 1 test\n", + "216 198 1 test\n", + "217 145 1 sockpuppetry\n", + "218 230 1 unknown\n", + "\n", + "[219 rows x 3 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# manual tags per year\n", + "df_logs_2009 = pd.read_csv(\"data/log-entries-yearly/2009.csv\", sep=',')\n", + "df_ids_manual_tags = df[['af_id', 'manual_tags']]\n", + "df_2009_tags = df_logs_2009.join(df_ids_manual_tags.set_index('af_id'), on='afl_filter', how='inner')\n", + "df_2009_tags" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('vandalism', 66), ('vandalism?', 37), ('good_faith?', 24), ('good_faith', 21), ('misc', 18), ('sockpuppetry', 10), ('unknown', 10), ('test', 9), ('abuse?', 7), ('spam?', 6), ('harassment', 5), ('harassment?', 5), ('sockpuppetry?', 5), ('biased_pov?', 4), ('bad_style', 4), ('politically_motivated?', 3), ('maintenance', 3), ('personal_attacks?', 3), ('page_move_vandalism?', 3), ('biased_pov', 2), ('seo', 2), ('misc?', 2), ('copyright?', 2), ('long_term_abuse', 2), ('unclear', 2), ('wiki_policy?', 2), ('hoaxing?', 2), ('silly_vandalism', 2), ('spam', 2), ('bug?', 2), ('doxxing?', 2), ('lazyness', 1), ('bad_style?', 1), ('seo?', 1), ('personal_attack', 1), ('maintenance?', 1), ('trolling?', 1), ('religious_vandalism?', 1), ('template_vandalism?', 1), ('link_vandalism?', 1), ('politically_motivated', 1), ('guideline_vio?', 1), ('silly_vandalism?', 1), ('template_spam', 1), ('self_promotion?', 1), ('avoidant_vandalism?', 1), ('personal_attacks', 1), ('malware', 1)]\n" + ] + } + ], + "source": [ + "df_2009_tags['manual_tags']\n", + "tags_list_2009 = flatten([x.split(\", \") for x in list(df_2009_tags['manual_tags'])])\n", + "# would be interesting to multiply with hitcount\n", + "\n", + "print(collections.Counter(tags_list_2009).most_common())" ] }, { -- GitLab