diff --git a/quarries/quarry-37500 b/quarries/quarry-37500 new file mode 100644 index 0000000000000000000000000000000000000000..93d0706374ae31208b0491b4d42edc9238c44b7f --- /dev/null +++ b/quarries/quarry-37500 @@ -0,0 +1,5 @@ +use enwiki_p; +select left(afl_timestamp, 4) LogYear, afl_actions FilterActions, count(*) Freq +from abuse_filter_log +group by left(afl_timestamp, 4), afl_actions +order by 1 desc; diff --git a/src/explore.ipynb b/src/explore.ipynb index 398709c68d486d23ec6a9ce91210f04b9f56a498..fc100bb7bfcf56c70035b05f05079ff0106d157d 100644 --- a/src/explore.ipynb +++ b/src/explore.ipynb @@ -1458,8 +1458,6 @@ } ], "source": [ - "# filter hits per month (all filters) (data quarry from 05.03.2019, that's why hitcount Mar 2019 is so small)\n", - "# Mar 2009 is also lower, since the first filter was introduced on 17.3.2009 and not at the beginning of the month\n", "df_hits = pd.read_csv(\"quarry-34014-how-many-abuse-filter-hits-have-been-there-per-month-en-wiki-run346197.csv\", sep=',')\n", "df_hits['LogMonth'] = pd.to_datetime(df_hits['LogMonth'], format=\"%Y%m\")\n", "\n", @@ -1494,6 +1492,14 @@ "#TODO: why is the peak there? there's an upward tendency" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data comes from a Quarry query run on 05.03.2019, which is why the hit count for Mar 2019 is so small.\n", + "The hit count for Mar 2009 is also lower, since the first filter was introduced on 17.3.2009 and not at the beginning of the month." + ] + }, { "cell_type": "code", "execution_count": 40, @@ -6292,6 +6298,8 @@ "source": [ "To be fair, I don't see any particularly interesting or conspicious pattern, beside the 71920 attempts at account creations. 
Neither the pages edited are extraordinary, nor are there particular pages with extra-orbitant hits; users are not particularly interesting either and it's mostly the most active filters that got triggered anyway.\n", "\n", + "The 3 IP editors with more than 1000 filter hits triggered mostly (or maybe exclusively) filters dealing with link spam.\n", + "\n", "But maybe it's exactly them that make the 71920 hits difference to all the \"standard\" numbers. I'm comparing this with September 2016 (238406 hits) for reference." ] }, @@ -6423,7 +6431,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The comparison of number of edits per user reveals that in January there were a couple of particularly active IPs. It would be interesting to know what they were trying to do." + "The comparison of number of edits per user reveals that in January there were a couple of particularly active IPs. It would be interesting to know what they were trying to do. (Answer: they were spamming.)" ] }, { @@ -6433,6 +6441,13 @@ "### Actions over the years" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The purpose of the next snippet is to understand whether there is or was a trend towards more liberal or stricter actions." + ] + }, { "cell_type": "code", "execution_count": 6, @@ -7032,6 +7047,138 @@ " print(df_actions.fillna('log only'))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A per-month breakdown is probably difficult to plot on DIN A4 paper, so I'm aggregating the hits per year to start with." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " LogYear FilterActions Freq\n", + "0 2019-01-01 log only 498557\n", + "1 2019-01-01 disallow 312772\n", + "2 2019-01-01 disallow,tag 18\n", + "3 2019-01-01 tag 473854\n", + "4 2019-01-01 warn 205715\n", + "5 2018-01-01 log only 1019817\n", + "6 2018-01-01 disallow 366209\n", + "7 2018-01-01 disallow,tag 71\n", + "8 2018-01-01 tag 944445\n", + "9 2018-01-01 warn 452407\n", + "10 2017-01-01 log only 727988\n", + "11 2017-01-01 disallow 395566\n", + "12 2017-01-01 disallow,tag 961\n", + "13 2017-01-01 tag 1057178\n", + "14 2017-01-01 warn 496777\n", + "15 2016-01-01 log only 1188795\n", + "16 2016-01-01 disallow 397976\n", + "17 2016-01-01 disallow,tag 27752\n", + "18 2016-01-01 tag 1005328\n", + "19 2016-01-01 warn 637696\n", + "20 2015-01-01 log only 884479\n", + "21 2015-01-01 disallow 369453\n", + "22 2015-01-01 disallow,tag 36971\n", + "23 2015-01-01 tag 884762\n", + "24 2015-01-01 warn 633934\n", + "25 2014-01-01 log only 128362\n", + "26 2014-01-01 disallow 322553\n", + "27 2014-01-01 disallow,tag 3970\n", + "28 2014-01-01 tag 555330\n", + "29 2014-01-01 warn 566693\n", + "30 2013-01-01 log only 85726\n", + "31 2013-01-01 disallow 410817\n", + "32 2013-01-01 tag 594869\n", + "33 2013-01-01 warn 653766\n", + "34 2012-01-01 log only 92226\n", + "35 2012-01-01 aftv5flagabuse 5755\n", + "36 2012-01-01 blockautopromote 37\n", + "37 2012-01-01 disallow 452072\n", + "38 2012-01-01 tag 698173\n", + "39 2012-01-01 warn 733247\n", + "40 2011-01-01 log only 170676\n", + "41 2011-01-01 blockautopromote 40\n", + "42 2011-01-01 blockautopromote,tag 3\n", + "43 2011-01-01 disallow 283517\n", + "44 2011-01-01 tag 1009932\n", + "45 2011-01-01 warn 595725\n", + "46 2010-01-01 log only 56999\n", + "47 2010-01-01 blockautopromote 1\n", + "48 2010-01-01 disallow 283117\n", + "49 2010-01-01 disallow,tag 24\n", + "50 
2010-01-01 tag 1222474\n", + "51 2010-01-01 warn 605113\n", + "52 2009-01-01 log only 192680\n", + "53 2009-01-01 blockautopromote 1056\n", + "54 2009-01-01 blockautopromote,tag 22\n", + "55 2009-01-01 disallow 98648\n", + "56 2009-01-01 disallow,tag 321\n", + "57 2009-01-01 tag 854153\n", + "58 2009-01-01 warn 662415\n" + ] + } + ], + "source": [ + "# Which actions were triggered how often over the years\n", + "df_actions_year = pd.read_csv(\"quarry-37500-number-of-abuse-filters-filter-actions-per-year-en-wiki-run389252.csv\", sep=',')\n", + "df_actions_year['LogYear'] = pd.to_datetime(df_actions_year['LogYear'], format=\"%Y\")\n", + "\n", + "with pd.option_context('display.max_rows', None, 'display.max_columns', None):\n", + " print(df_actions_year.fillna('log only'))" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "ufunc subtract cannot use operands with types dtype('<M8[ns]') and dtype('float64')", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-56-f24e07750944>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mxticks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrotation\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'20'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mylabel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Num filters'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_values\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0my_values\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/uni/ma-arbeit-filters/src/env3/lib/python3.6/site-packages/matplotlib/pyplot.py\u001b[0m in \u001b[0;36mbar\u001b[0;34m(x, height, width, bottom, align, data, **kwargs)\u001b[0m\n\u001b[1;32m 2455\u001b[0m return gca().bar(\n\u001b[1;32m 2456\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwidth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mwidth\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbottom\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbottom\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malign\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2457\u001b[0;31m **({\"data\": data} if data is not None else {}), **kwargs)\n\u001b[0m\u001b[1;32m 2458\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2459\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/uni/ma-arbeit-filters/src/env3/lib/python3.6/site-packages/matplotlib/__init__.py\u001b[0m in \u001b[0;36minner\u001b[0;34m(ax, data, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1808\u001b[0m \u001b[0;34m\"the Matplotlib list!)\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mlabel_namer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1809\u001b[0m RuntimeWarning, stacklevel=2)\n\u001b[0;32m-> 1810\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0max\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1811\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1812\u001b[0m inner.__doc__ = _add_data_doc(inner.__doc__,\n", + "\u001b[0;32m~/uni/ma-arbeit-filters/src/env3/lib/python3.6/site-packages/matplotlib/axes/_axes.py\u001b[0m in \u001b[0;36mbar\u001b[0;34m(self, x, height, width, bottom, align, **kwargs)\u001b[0m\n\u001b[1;32m 2275\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0malign\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'center'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2276\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0morientation\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'vertical'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2277\u001b[0;31m \u001b[0mleft\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mwidth\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2278\u001b[0m \u001b[0mbottom\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2279\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0morientation\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'horizontal'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: ufunc subtract cannot use operands with types dtype('<M8[ns]') and dtype('float64')" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#ap_actions_labels = [x[0] for x in active_public_actions]\n", + "x_values = df_actions_year['LogYear']\n", + "y_values = df_actions_year['Freq']\n", + "\n", + 
"plt.xlabel('Year')\n", + "plt.xticks(rotation=20)\n", + "plt.ylabel('Num filter hits')\n", + "plt.bar(x_values.dt.year, y_values)  # int years: this matplotlib's bar() cannot subtract a float width from datetime64 x\n", + "plt.show()" + ] + }, { "cell_type": "code", "execution_count": 8,