From ec9a1bcea074972d40e1206ea7d3bcbefa270fd5 Mon Sep 17 00:00:00 2001 From: Lyudmila Vaseva <vaseva@mi.fu-berlin.de> Date: Wed, 6 Mar 2019 02:12:27 +0100 Subject: [PATCH] Explore hitcounts in more detail --- quarries/quarry-34014 | 5 + ...been-there-per-month-en-wiki-run346197.csv | 122 +++++ src/explore.ipynb | 423 +++++++++++++++++- 3 files changed, 548 insertions(+), 2 deletions(-) create mode 100644 quarries/quarry-34014 create mode 100644 quarries/quarry-34014-how-many-abuse-filter-hits-have-been-there-per-month-en-wiki-run346197.csv diff --git a/quarries/quarry-34014 b/quarries/quarry-34014 new file mode 100644 index 0000000..576faf5 --- /dev/null +++ b/quarries/quarry-34014 @@ -0,0 +1,5 @@ +use enwiki_p; +select left(afl_timestamp, 6) LogMonth, count(*) Freq +from abuse_filter_log +group by left(afl_timestamp, 6) +order by 1 desc; diff --git a/quarries/quarry-34014-how-many-abuse-filter-hits-have-been-there-per-month-en-wiki-run346197.csv b/quarries/quarry-34014-how-many-abuse-filter-hits-have-been-there-per-month-en-wiki-run346197.csv new file mode 100644 index 0000000..aaab81d --- /dev/null +++ b/quarries/quarry-34014-how-many-abuse-filter-hits-have-been-there-per-month-en-wiki-run346197.csv @@ -0,0 +1,122 @@ +LogMonth,Freq +201903,34309 +201902,236606 +201901,252668 +201812,226287 +201811,253233 +201810,256438 +201809,230354 +201808,216045 +201807,205477 +201806,209374 +201805,238235 +201804,242857 +201803,255431 +201802,213548 +201801,235705 +201712,213359 +201711,254671 +201710,224244 +201709,212790 +201708,195593 +201707,176778 +201706,184406 +201705,231250 +201704,225176 +201703,257081 +201702,246199 +201701,256925 +201612,226680 +201611,258655 +201610,254070 +201609,238406 +201608,192681 +201607,171567 +201606,201867 +201605,294276 +201604,319474 +201603,360327 +201602,366637 +201601,372907 +201512,307003 +201511,338129 +201510,264966 +201509,272765 +201508,215203 +201507,207626 +201506,236916 +201505,250012 +201504,201602 +201503,188296 +201502,163529 +201501,163552 +201412,137314 +201411,154933 +201410,165732 +201409,132150 +201408,93636 +201407,80837 +201406,94967 +201405,132788 +201404,129260 +201403,157123 +201402,146198 +201401,151970 +201312,131054 +201311,156195 +201310,163029 +201309,122300 +201308,117010 +201307,113163 +201306,109264 +201305,163644 +201304,146456 +201303,156797 +201302,196511 +201301,169755 +201212,147379 +201211,186991 +201210,204204 +201209,160734 +201208,122418 +201207,110906 +201206,123762 +201205,178005 +201204,173853 +201203,184071 +201202,197130 +201201,192057 +201112,175397 +201111,211174 +201110,224586 +201109,171945 +201108,133965 +201107,133186 +201106,143617 +201105,173431 +201104,160696 +201103,177539 +201102,173222 +201101,181135 +201012,157738 +201011,195329 +201010,206730 +201009,174706 +201008,138323 +201007,127687 +201006,146164 +201005,206822 +201004,197381 +201003,217092 +201002,206380 +201001,193376 +200912,166506 +200911,210227 +200910,226865 +200909,213208 +200908,162281 +200907,160993 +200906,179176 +200905,211879 +200904,179152 +200903,99008 diff --git a/src/explore.ipynb b/src/explore.ipynb index f71a308..f1cb27e 100644 --- a/src/explore.ipynb +++ b/src/explore.ipynb @@ -24,8 +24,9 @@ "metadata": {}, "outputs": [], "source": [ - " df = pd.read_csv(\"20190106115600_filters-sorted-by-hits-manual-tags.csv\", sep='\\t')\n", - " df_origin = pd.read_csv(\"quarry-32518-all-filters-sorted-num-hits.csv\", sep=',')" + "df = pd.read_csv(\"20190106115600_filters-sorted-by-hits-manual-tags.csv\", sep='\\t')\n", + "df_origin = pd.read_csv(\"quarry-32518-all-filters-sorted-num-hits.csv\", sep=',')\n", + "df_hits = pd.read_csv(\"quarry-34014-how-many-abuse-filter-hits-have-been-there-per-month-en-wiki-run346197.csv\", sep=',')" ] }, { @@ -208,6 +209,424 @@ "# TODO: question: what do they mean?" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>LogMonth</th>\n", + " <th>Freq</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>201903</td>\n", + " <td>34309</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>201902</td>\n", + " <td>236606</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>201901</td>\n", + " <td>252668</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>201812</td>\n", + " <td>226287</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>201811</td>\n", + " <td>253233</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>201810</td>\n", + " <td>256438</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>201809</td>\n", + " <td>230354</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>201808</td>\n", + " <td>216045</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>201807</td>\n", + " <td>205477</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>201806</td>\n", + " <td>209374</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>201805</td>\n", + " <td>238235</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>201804</td>\n", + " <td>242857</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>201803</td>\n", + " <td>255431</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>201802</td>\n", + " <td>213548</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>201801</td>\n", + " <td>235705</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>201712</td>\n", + " <td>213359</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>201711</td>\n", + " <td>254671</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>201710</td>\n", + " <td>224244</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>201709</td>\n", + " <td>212790</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>201708</td>\n", + " <td>195593</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>201707</td>\n", + " <td>176778</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>201706</td>\n", + " <td>184406</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>201705</td>\n", + " <td>231250</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>201704</td>\n", + " <td>225176</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>201703</td>\n", + " <td>257081</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>201702</td>\n", + " <td>246199</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>201701</td>\n", + " <td>256925</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>201612</td>\n", + " <td>226680</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>201611</td>\n", + " <td>258655</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>201610</td>\n", + " <td>254070</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>91</th>\n", + " <td>201108</td>\n", + " <td>133965</td>\n", + " </tr>\n", + " <tr>\n", + " <th>92</th>\n", + " <td>201107</td>\n", + " <td>133186</td>\n", + " </tr>\n", + " <tr>\n", + " <th>93</th>\n", + " <td>201106</td>\n", + " <td>143617</td>\n", + " </tr>\n", + " <tr>\n", + " <th>94</th>\n", + " <td>201105</td>\n", + " <td>173431</td>\n", + " </tr>\n", + " <tr>\n", + " <th>95</th>\n", + " <td>201104</td>\n", + " <td>160696</td>\n", + " </tr>\n", + " <tr>\n", + " <th>96</th>\n", + " <td>201103</td>\n", + " <td>177539</td>\n", + " </tr>\n", + " <tr>\n", + " <th>97</th>\n", + " <td>201102</td>\n", + " <td>173222</td>\n", + " </tr>\n", + " <tr>\n", + " <th>98</th>\n", + " <td>201101</td>\n", + " <td>181135</td>\n", + " </tr>\n", + " <tr>\n", + " <th>99</th>\n", + " <td>201012</td>\n", + " <td>157738</td>\n", + " </tr>\n", + " <tr>\n", + " <th>100</th>\n", + " <td>201011</td>\n", + " <td>195329</td>\n", + " </tr>\n", + " <tr>\n", + " <th>101</th>\n", + " <td>201010</td>\n", + " <td>206730</td>\n", + " </tr>\n", + " <tr>\n", + " <th>102</th>\n", + " <td>201009</td>\n", + " <td>174706</td>\n", + " </tr>\n", + " <tr>\n", + " <th>103</th>\n", + " <td>201008</td>\n", + " <td>138323</td>\n", + " </tr>\n", + " <tr>\n", + " <th>104</th>\n", + " <td>201007</td>\n", + " <td>127687</td>\n", + " </tr>\n", + " <tr>\n", + " <th>105</th>\n", + " <td>201006</td>\n", + " <td>146164</td>\n", + " </tr>\n", + " <tr>\n", + " <th>106</th>\n", + " <td>201005</td>\n", + " <td>206822</td>\n", + " </tr>\n", + " <tr>\n", + " <th>107</th>\n", + " <td>201004</td>\n", + " <td>197381</td>\n", + " </tr>\n", + " <tr>\n", + " <th>108</th>\n", + " <td>201003</td>\n", + " <td>217092</td>\n", + " </tr>\n", + " <tr>\n", + " <th>109</th>\n", + " <td>201002</td>\n", + " <td>206380</td>\n", + " </tr>\n", + " <tr>\n", + " <th>110</th>\n", + " <td>201001</td>\n", + " <td>193376</td>\n", + " </tr>\n", + " <tr>\n", + " <th>111</th>\n", + " <td>200912</td>\n", + " <td>166506</td>\n", + " </tr>\n", + " <tr>\n", + " <th>112</th>\n", + " <td>200911</td>\n", + " <td>210227</td>\n", + " </tr>\n", + " <tr>\n", + " <th>113</th>\n", + " <td>200910</td>\n", + " <td>226865</td>\n", + " </tr>\n", + " <tr>\n", + " <th>114</th>\n", + " <td>200909</td>\n", + " <td>213208</td>\n", + " </tr>\n", + " <tr>\n", + " <th>115</th>\n", + " <td>200908</td>\n", + " <td>162281</td>\n", + " </tr>\n", + " <tr>\n", + " <th>116</th>\n", + " <td>200907</td>\n", + " <td>160993</td>\n", + " </tr>\n", + " <tr>\n", + " <th>117</th>\n", + " <td>200906</td>\n", + " <td>179176</td>\n", + " </tr>\n", + " <tr>\n", + " <th>118</th>\n", + " <td>200905</td>\n", + " <td>211879</td>\n", + " </tr>\n", + " <tr>\n", + " <th>119</th>\n", + " <td>200904</td>\n", + " <td>179152</td>\n", + " </tr>\n", + " <tr>\n", + " <th>120</th>\n", + " <td>200903</td>\n", + " <td>99008</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>121 rows × 2 columns</p>\n", + "</div>" + ], + "text/plain": [ + " LogMonth Freq\n", + "0 201903 34309\n", + "1 201902 236606\n", + "2 201901 252668\n", + "3 201812 226287\n", + "4 201811 253233\n", + "5 201810 256438\n", + "6 201809 230354\n", + "7 201808 216045\n", + "8 201807 205477\n", + "9 201806 209374\n", + "10 201805 238235\n", + "11 201804 242857\n", + "12 201803 255431\n", + "13 201802 213548\n", + "14 201801 235705\n", + "15 201712 213359\n", + "16 201711 254671\n", + "17 201710 224244\n", + "18 201709 212790\n", + "19 201708 195593\n", + "20 201707 176778\n", + "21 201706 184406\n", + "22 201705 231250\n", + "23 201704 225176\n", + "24 201703 257081\n", + "25 201702 246199\n", + "26 201701 256925\n", + "27 201612 226680\n", + "28 201611 258655\n", + "29 201610 254070\n", + ".. ... ...\n", + "91 201108 133965\n", + "92 201107 133186\n", + "93 201106 143617\n", + "94 201105 173431\n", + "95 201104 160696\n", + "96 201103 177539\n", + "97 201102 173222\n", + "98 201101 181135\n", + "99 201012 157738\n", + "100 201011 195329\n", + "101 201010 206730\n", + "102 201009 174706\n", + "103 201008 138323\n", + "104 201007 127687\n", + "105 201006 146164\n", + "106 201005 206822\n", + "107 201004 197381\n", + "108 201003 217092\n", + "109 201002 206380\n", + "110 201001 193376\n", + "111 200912 166506\n", + "112 200911 210227\n", + "113 200910 226865\n", + "114 200909 213208\n", + "115 200908 162281\n", + "116 200907 160993\n", + "117 200906 179176\n", + "118 200905 211879\n", + "119 200904 179152\n", + "120 200903 99008\n", + "\n", + "[121 rows x 2 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# filter hits per month (all filters) (data quarry from 05.03.2019, that's why hitcount Mar 2019 is so small)\n", + "df_hits" + ] + }, { "cell_type": "markdown", "metadata": {}, -- GitLab