From ec9a1bcea074972d40e1206ea7d3bcbefa270fd5 Mon Sep 17 00:00:00 2001
From: Lyudmila Vaseva <vaseva@mi.fu-berlin.de>
Date: Wed, 6 Mar 2019 02:12:27 +0100
Subject: [PATCH] Explore hitcounts in more detail

---
 quarries/quarry-34014                         |   5 +
 ...been-there-per-month-en-wiki-run346197.csv | 122 +++++
 src/explore.ipynb                             | 423 +++++++++++++++++-
 3 files changed, 548 insertions(+), 2 deletions(-)
 create mode 100644 quarries/quarry-34014
 create mode 100644 quarries/quarry-34014-how-many-abuse-filter-hits-have-been-there-per-month-en-wiki-run346197.csv

diff --git a/quarries/quarry-34014 b/quarries/quarry-34014
new file mode 100644
index 0000000..576faf5
--- /dev/null
+++ b/quarries/quarry-34014
@@ -0,0 +1,5 @@
+use enwiki_p; -- abuse filter hits per month, newest month first
+select left(afl_timestamp, 6) as LogMonth, count(*) as Freq -- first 6 chars of the timestamp form the YYYYMM bucket
+from abuse_filter_log
+group by left(afl_timestamp, 6)
+order by LogMonth desc;
diff --git a/quarries/quarry-34014-how-many-abuse-filter-hits-have-been-there-per-month-en-wiki-run346197.csv b/quarries/quarry-34014-how-many-abuse-filter-hits-have-been-there-per-month-en-wiki-run346197.csv
new file mode 100644
index 0000000..aaab81d
--- /dev/null
+++ b/quarries/quarry-34014-how-many-abuse-filter-hits-have-been-there-per-month-en-wiki-run346197.csv
@@ -0,0 +1,122 @@
+LogMonth,Freq
+201903,34309
+201902,236606
+201901,252668
+201812,226287
+201811,253233
+201810,256438
+201809,230354
+201808,216045
+201807,205477
+201806,209374
+201805,238235
+201804,242857
+201803,255431
+201802,213548
+201801,235705
+201712,213359
+201711,254671
+201710,224244
+201709,212790
+201708,195593
+201707,176778
+201706,184406
+201705,231250
+201704,225176
+201703,257081
+201702,246199
+201701,256925
+201612,226680
+201611,258655
+201610,254070
+201609,238406
+201608,192681
+201607,171567
+201606,201867
+201605,294276
+201604,319474
+201603,360327
+201602,366637
+201601,372907
+201512,307003
+201511,338129
+201510,264966
+201509,272765
+201508,215203
+201507,207626
+201506,236916
+201505,250012
+201504,201602
+201503,188296
+201502,163529
+201501,163552
+201412,137314
+201411,154933
+201410,165732
+201409,132150
+201408,93636
+201407,80837
+201406,94967
+201405,132788
+201404,129260
+201403,157123
+201402,146198
+201401,151970
+201312,131054
+201311,156195
+201310,163029
+201309,122300
+201308,117010
+201307,113163
+201306,109264
+201305,163644
+201304,146456
+201303,156797
+201302,196511
+201301,169755
+201212,147379
+201211,186991
+201210,204204
+201209,160734
+201208,122418
+201207,110906
+201206,123762
+201205,178005
+201204,173853
+201203,184071
+201202,197130
+201201,192057
+201112,175397
+201111,211174
+201110,224586
+201109,171945
+201108,133965
+201107,133186
+201106,143617
+201105,173431
+201104,160696
+201103,177539
+201102,173222
+201101,181135
+201012,157738
+201011,195329
+201010,206730
+201009,174706
+201008,138323
+201007,127687
+201006,146164
+201005,206822
+201004,197381
+201003,217092
+201002,206380
+201001,193376
+200912,166506
+200911,210227
+200910,226865
+200909,213208
+200908,162281
+200907,160993
+200906,179176
+200905,211879
+200904,179152
+200903,99008
diff --git a/src/explore.ipynb b/src/explore.ipynb
index f71a308..f1cb27e 100644
--- a/src/explore.ipynb
+++ b/src/explore.ipynb
@@ -24,8 +24,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    " df = pd.read_csv(\"20190106115600_filters-sorted-by-hits-manual-tags.csv\", sep='\\t')\n",
-    " df_origin = pd.read_csv(\"quarry-32518-all-filters-sorted-num-hits.csv\", sep=',')"
+    "df = pd.read_csv(\"20190106115600_filters-sorted-by-hits-manual-tags.csv\", sep='\\t')\n",
+    "df_origin = pd.read_csv(\"quarry-32518-all-filters-sorted-num-hits.csv\", sep=',')\n",
+    "df_hits = pd.read_csv(\"quarry-34014-how-many-abuse-filter-hits-have-been-there-per-month-en-wiki-run346197.csv\", sep=',')"
    ]
   },
   {
@@ -208,6 +209,424 @@
     "# TODO: question: what do they mean?"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>LogMonth</th>\n",
+       "      <th>Freq</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>201903</td>\n",
+       "      <td>34309</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>201902</td>\n",
+       "      <td>236606</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>201901</td>\n",
+       "      <td>252668</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>201812</td>\n",
+       "      <td>226287</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>201811</td>\n",
+       "      <td>253233</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>201810</td>\n",
+       "      <td>256438</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>201809</td>\n",
+       "      <td>230354</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>201808</td>\n",
+       "      <td>216045</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>201807</td>\n",
+       "      <td>205477</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>201806</td>\n",
+       "      <td>209374</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>201805</td>\n",
+       "      <td>238235</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>201804</td>\n",
+       "      <td>242857</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>201803</td>\n",
+       "      <td>255431</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>201802</td>\n",
+       "      <td>213548</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>201801</td>\n",
+       "      <td>235705</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>201712</td>\n",
+       "      <td>213359</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>201711</td>\n",
+       "      <td>254671</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>201710</td>\n",
+       "      <td>224244</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>201709</td>\n",
+       "      <td>212790</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>201708</td>\n",
+       "      <td>195593</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>201707</td>\n",
+       "      <td>176778</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>201706</td>\n",
+       "      <td>184406</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>201705</td>\n",
+       "      <td>231250</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>201704</td>\n",
+       "      <td>225176</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>201703</td>\n",
+       "      <td>257081</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>201702</td>\n",
+       "      <td>246199</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26</th>\n",
+       "      <td>201701</td>\n",
+       "      <td>256925</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>201612</td>\n",
+       "      <td>226680</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>201611</td>\n",
+       "      <td>258655</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>201610</td>\n",
+       "      <td>254070</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>91</th>\n",
+       "      <td>201108</td>\n",
+       "      <td>133965</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>92</th>\n",
+       "      <td>201107</td>\n",
+       "      <td>133186</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>93</th>\n",
+       "      <td>201106</td>\n",
+       "      <td>143617</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>94</th>\n",
+       "      <td>201105</td>\n",
+       "      <td>173431</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>95</th>\n",
+       "      <td>201104</td>\n",
+       "      <td>160696</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>96</th>\n",
+       "      <td>201103</td>\n",
+       "      <td>177539</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>97</th>\n",
+       "      <td>201102</td>\n",
+       "      <td>173222</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>98</th>\n",
+       "      <td>201101</td>\n",
+       "      <td>181135</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>99</th>\n",
+       "      <td>201012</td>\n",
+       "      <td>157738</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>100</th>\n",
+       "      <td>201011</td>\n",
+       "      <td>195329</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>101</th>\n",
+       "      <td>201010</td>\n",
+       "      <td>206730</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>102</th>\n",
+       "      <td>201009</td>\n",
+       "      <td>174706</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>103</th>\n",
+       "      <td>201008</td>\n",
+       "      <td>138323</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>104</th>\n",
+       "      <td>201007</td>\n",
+       "      <td>127687</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>105</th>\n",
+       "      <td>201006</td>\n",
+       "      <td>146164</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>106</th>\n",
+       "      <td>201005</td>\n",
+       "      <td>206822</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>107</th>\n",
+       "      <td>201004</td>\n",
+       "      <td>197381</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>108</th>\n",
+       "      <td>201003</td>\n",
+       "      <td>217092</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>109</th>\n",
+       "      <td>201002</td>\n",
+       "      <td>206380</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>110</th>\n",
+       "      <td>201001</td>\n",
+       "      <td>193376</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>111</th>\n",
+       "      <td>200912</td>\n",
+       "      <td>166506</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>112</th>\n",
+       "      <td>200911</td>\n",
+       "      <td>210227</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>113</th>\n",
+       "      <td>200910</td>\n",
+       "      <td>226865</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>114</th>\n",
+       "      <td>200909</td>\n",
+       "      <td>213208</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>115</th>\n",
+       "      <td>200908</td>\n",
+       "      <td>162281</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>116</th>\n",
+       "      <td>200907</td>\n",
+       "      <td>160993</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>117</th>\n",
+       "      <td>200906</td>\n",
+       "      <td>179176</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>118</th>\n",
+       "      <td>200905</td>\n",
+       "      <td>211879</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>119</th>\n",
+       "      <td>200904</td>\n",
+       "      <td>179152</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>120</th>\n",
+       "      <td>200903</td>\n",
+       "      <td>99008</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>121 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     LogMonth    Freq\n",
+       "0      201903   34309\n",
+       "1      201902  236606\n",
+       "2      201901  252668\n",
+       "3      201812  226287\n",
+       "4      201811  253233\n",
+       "5      201810  256438\n",
+       "6      201809  230354\n",
+       "7      201808  216045\n",
+       "8      201807  205477\n",
+       "9      201806  209374\n",
+       "10     201805  238235\n",
+       "11     201804  242857\n",
+       "12     201803  255431\n",
+       "13     201802  213548\n",
+       "14     201801  235705\n",
+       "15     201712  213359\n",
+       "16     201711  254671\n",
+       "17     201710  224244\n",
+       "18     201709  212790\n",
+       "19     201708  195593\n",
+       "20     201707  176778\n",
+       "21     201706  184406\n",
+       "22     201705  231250\n",
+       "23     201704  225176\n",
+       "24     201703  257081\n",
+       "25     201702  246199\n",
+       "26     201701  256925\n",
+       "27     201612  226680\n",
+       "28     201611  258655\n",
+       "29     201610  254070\n",
+       "..        ...     ...\n",
+       "91     201108  133965\n",
+       "92     201107  133186\n",
+       "93     201106  143617\n",
+       "94     201105  173431\n",
+       "95     201104  160696\n",
+       "96     201103  177539\n",
+       "97     201102  173222\n",
+       "98     201101  181135\n",
+       "99     201012  157738\n",
+       "100    201011  195329\n",
+       "101    201010  206730\n",
+       "102    201009  174706\n",
+       "103    201008  138323\n",
+       "104    201007  127687\n",
+       "105    201006  146164\n",
+       "106    201005  206822\n",
+       "107    201004  197381\n",
+       "108    201003  217092\n",
+       "109    201002  206380\n",
+       "110    201001  193376\n",
+       "111    200912  166506\n",
+       "112    200911  210227\n",
+       "113    200910  226865\n",
+       "114    200909  213208\n",
+       "115    200908  162281\n",
+       "116    200907  160993\n",
+       "117    200906  179176\n",
+       "118    200905  211879\n",
+       "119    200904  179152\n",
+       "120    200903   99008\n",
+       "\n",
+       "[121 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# filter hits per month (all filters); the data was pulled from Quarry on 05.03.2019, which is why the hit count for Mar 2019 is so small\n",
+    "df_hits"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
-- 
GitLab