diff --git a/src/explore.ipynb b/src/explore.ipynb index 3578be77528fe349a6cf0aee634a6f9cbb66e290..ef63de256bece4089b3a6a22b1d2593050a7a1b5 100644 --- a/src/explore.ipynb +++ b/src/explore.ipynb @@ -49,6 +49,22 @@ "df_origin = pd.read_csv(\"quarry-32518-all-filters-sorted-num-hits.csv\", sep=',')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And the dataset with the 2nd round of labeling:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df_2nd = pd.read_csv(\"20190106115600_filters-sorted-by-hits-manual-tags-2nd-round.csv\", sep='\\t')" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -308,7 +324,12 @@ "# --> so available groups are \"default\" and \"feedback\"\n", "# TODO: question: what do they mean?\n", "# From https://www.mediawiki.org/wiki/Extension:AbuseFilter/abuse_filter_table :\n", - "# \"The group this filter belongs to, as defined in $wgAbuseFilterValidGroups.\" still don't get it" + "# \"The group this filter belongs to, as defined in $wgAbuseFilterValidGroups.\" still don't get it\n", + "'''\n", + "Feedback seems to be a legacy thing from a plugin which was supposed to collect readers feedback on articles.\n", + "It wasn't deemed useful enough though and was eventually turned off.\n", + "All the filters that referred to it are in the meantime marked as deleted.\n", + "'''" ] }, { @@ -490,6 +511,15 @@ "\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is noticeable that quite a few filters (93) were last modified in June 2016. Potentially, many more filters were modified at that time and then modified again later.\n", + "An inquiry into the filter history (https://en.wikipedia.org/wiki/Special:AbuseFilter/history?user=&filter=669) shows that an edit filter manager did a big cleanup and (mostly) deleted stale filters.\n", + "All of this doesn't seem to be particularly exciting or worth discussing though." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -499,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -633,6 +663,7 @@ ], "source": [ "# filter hits per month (all filters) (data quarry from 05.03.2019, that's why hitcount Mar 2019 is so small)\n", + "# Mar 2009 is also lower, since the first filter was introduced on 17.3.2009 and not at the beginning of the month\n", "df_hits = pd.read_csv(\"quarry-34014-how-many-abuse-filter-hits-have-been-there-per-month-en-wiki-run346197.csv\", sep=',')\n", "df_hits['LogMonth'] = pd.to_datetime(df_hits['LogMonth'], format=\"%Y%m\")\n", "\n",