diff --git a/src/explore.ipynb b/src/explore.ipynb index ce598d4328250fa7a0640ec59a69d7e046e62238..56b8cdfaf4a61550fc311d2c86d67698a5dadf70 100644 --- a/src/explore.ipynb +++ b/src/explore.ipynb @@ -661,7 +661,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -1756,86 +1756,527 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 6, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "10.68.16.39 1689\n", - "37.113.52.15 1249\n", - "95.152.44.52 1133\n", - "5.165.178.194 715\n", - "94.181.143.10 697\n", - "95.152.42.158 674\n", - "5.166.250.109 559\n", - "5.166.224.152 556\n", - "Acheter cialis 533\n", - "37.113.51.96 473\n", - "93.124.7.25 430\n", - "5.167.114.39 317\n", - "93.124.34.23 307\n", - "36.250.176.0 291\n", - "Wwekik2222kdjdj 279\n", - "37.113.37.111 274\n", - "94.181.170.143 247\n", - "Theadityapratap 239\n", - "94.181.156.128 229\n", - "93.124.46.78 222\n", - "Achat cialis 222\n", - "93.124.28.116 213\n", - "176.97.116.140 202\n", - "5.165.186.39 192\n", - "Acquistare cialis 190\n", - "93.124.74.221 187\n", - "AbhiJahazi 186\n", - "37.113.34.32 186\n", - "37.113.28.187 185\n", - "64.62.219.98 180\n", - " ... \n", - "HortonTimsonOlvsL 1\n", - "75.108.207.104 1\n", - "Ambalamb 1\n", - "98.183.198.159 1\n", - "2600:1003:B460:10FF:3580:4CD7:FCA8:76A8 1\n", - "Arth21 1\n", - "LinaPom774069 1\n", - "50.133.191.168 1\n", - "37.44.110.64 1\n", - "76.9.63.87 1\n", - "2602:306:BCCB:3360:A9C9:3DCF:17AC:FE29 1\n", - "82.40.75.53 1\n", - "2601:646:8801:FB67:542A:8A08:C2C6:ED03 1\n", - "95.239.155.109 1\n", - "2600:1002:B001:834C:AB:517:31CF:FEB6 1\n", - "120.145.0.23 1\n", - "94.3.229.99 1\n", - "49.204.22.233 1\n", - "71.85.159.151 1\n", - "204.174.144.1 1\n", - "96.29.192.142 1\n", - "CGrater 1\n", - "Lotta21 1\n", - "JuwelRana 1\n", - "207.165.194.249 1\n", - "Sujithbdu 1\n", - "Arsyad44 1\n", - "108.89.84.24 1\n", - "41.174.157.12 1\n", - "PorthBohnVGUK 1\n", - "Name: afl_user_text, Length: 139586, dtype: int64" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "10.68.16.39 1689\n", + "37.113.52.15 1249\n", + "95.152.44.52 1133\n", + "5.165.178.194 715\n", + "94.181.143.10 697\n", + "95.152.42.158 674\n", + "5.166.250.109 559\n", + "5.166.224.152 556\n", + "Acheter cialis 533\n", + "37.113.51.96 473\n", + "93.124.7.25 430\n", + "5.167.114.39 317\n", + "93.124.34.23 307\n", + "36.250.176.0 291\n", + "Wwekik2222kdjdj 279\n", + "37.113.37.111 274\n", + "94.181.170.143 247\n", + "Theadityapratap 239\n", + "94.181.156.128 229\n", + "93.124.46.78 222\n", + "Achat cialis 222\n", + "93.124.28.116 213\n", + "176.97.116.140 202\n", + "5.165.186.39 192\n", + "Acquistare cialis 190\n", + "93.124.74.221 187\n", + "AbhiJahazi 186\n", + "37.113.34.32 186\n", + "37.113.28.187 185\n", + "64.62.219.98 180\n", + "222.77.212.168 173\n", + "64.62.219.148 173\n", + "37.113.22.12 171\n", + "188.120.134.68 169\n", + "37.113.28.212 157\n", + "5.166.255.126 156\n", + "205.213.52.13 153\n", + "68.84.101.104 150\n", + "37.113.22.212 149\n", + "AesterSl 146\n", + "5.166.231.32 146\n", + "93.124.29.148 144\n", + "96.37.204.168 142\n", + "5.167.126.42 142\n", + "66.102.230.110 141\n", + "178.125.162.54 140\n", + "Alecdaly 138\n", + "5.167.116.53 136\n", + "5.165.185.198 136\n", + "Kennethtima 135\n", + "86.58.36.235 131\n", + "178.125.193.125 126\n", + "Bilal Akbulut 122\n", + "Rajendra Ramtel 118\n", + "EugeneRix 114\n", + "Toulton19 114\n", + "178.125.185.25 110\n", + "BizzellCorp 109\n", + "84.90.219.128 108\n", + "63.231.140.34 106\n", + "93.124.126.23 105\n", + "178.137.212.49 104\n", + "176.109.183.238 101\n", + "64.62.219.70 100\n", + "107.0.17.138 99\n", + "Ginaginagina123 99\n", + "Abdul moksha de jabar 98\n", + "Rajukumar141011 97\n", + "Vimalvishwakar6 96\n", + "184.97.149.210 95\n", + "71.72.91.197 94\n", + "199.231.184.74 93\n", + "Avosic 92\n", + "RadioYousuf 92\n", + "178.207.224.217 89\n", + "AidInternationalFund 89\n", + "MichaelsMeanies 89\n", + "188.120.153.125 89\n", + "5.166.254.110 89\n", + "94.158.43.41 89\n", + "178.125.132.158 88\n", + "178.125.237.213 86\n", + "93.124.37.176 86\n", + "Epicgenius 85\n", + "Kimbuere 85\n", + "24.103.48.226 84\n", + "45.26.44.116 84\n", + "Materialscientist 84\n", + "184.97.185.44 83\n", + "5.165.176.37 83\n", + "37.115.220.115 83\n", + "70.90.150.225 81\n", + "64.134.64.190 79\n", + "Prix cialis 79\n", + "71.31.94.254 79\n", + "Tylerbooty 78\n", + "Megafreackingman 78\n", + "Govbharti 77\n", + "SMSSPriscilla 76\n", + "178.125.167.141 76\n", + "Harrogateafc 76\n", + "2601:244:4400:FE89:41EB:45B2:3F91:ECE4 76\n", + "Colourmoon 76\n", + "OMG420 76\n", + "81.88.116.27 76\n", + "68.46.222.11 76\n", + "Selkirk sucks 75\n", + "100.37.131.59 75\n", + "37.211.55.95 74\n", + "2601:204:100:4000:D0FB:FCEA:55B0:4E21 74\n", + "216.186.17.100 74\n", + "2607:FB90:1227:A5E4:0:1C:64E6:EE01 73\n", + "176.109.229.187 73\n", + "Hellenic Foundation for Culture 72\n", + "94.181.162.157 72\n", + "5.167.125.122 72\n", + "108.4.65.75 71\n", + "Otsh2014 71\n", + "GoldRingChip 71\n", + "IMS331 71\n", + "Bab boye12 69\n", + "172.99.17.45 69\n", + "178.125.170.44 69\n", + "Matt16081608 68\n", + "97.93.174.233 68\n", + "Lakikw 68\n", + "71.55.151.230 68\n", + "Thenationawakes1 68\n", + "78.146.161.91 67\n", + "120.37.205.132 67\n", + "Cantorwally 67\n", + "188.120.151.141 67\n", + "ChiShip Couriers 66\n", + "Salman solanki 66\n", + "Got Mold Home Warranty 66\n", + "Young grace 66\n", + "177.33.15.40 66\n", + "RenegadeDevelopment 66\n", + "Lowfloatindex 66\n", + "95.152.15.191 66\n", + "NishantBohra 66\n", + "96.226.126.242 66\n", + "178.125.204.125 65\n", + "Tj-b223 65\n", + "46.185.121.71 65\n", + "TUPAC SHAKUR1 64\n", + "207.237.79.152 64\n", + "Gaurav kishor sharma 63\n", + "24.11.33.156 63\n", + "199.249.227.127 63\n", + "Abi99098 63\n", + "Liukairen 63\n", + "FtLauderGuy 63\n", + "Snowman123 s 62\n", + "Dxasadakbar 62\n", + "Patrick FOULON 62\n", + "Notafly 61\n", + "216.145.67.8 61\n", + "168.221.157.196 61\n", + "Sportshub 61\n", + "216.56.8.89 61\n", + "Saumali choudhury 60\n", + "194.154.22.133 60\n", + "145.103.114.17 59\n", + "MélanieMartins 59\n", + "WymanMemorialProject 59\n", + "Yttube 58\n", + "Uttamvastra 58\n", + "Profdeo22 58\n", + "Flocea80 58\n", + "64.187.127.177 58\n", + "31.49.220.136 58\n", + "EscondidoHistory 58\n", + "173.191.55.126 57\n", + "166.177.248.45 57\n", + "Toby7232 57\n", + "Buttgrease 57\n", + "203.177.97.174 57\n", + "Surrey Cockfucker 57\n", + "73.32.62.197 57\n", + "Ehtesham888 56\n", + "City-Of-Derby-Academy 56\n", + "2602:306:3796:ABD0:B56E:DFDC:1599:F9BE 56\n", + "Flakkariice 56\n", + "162.246.175.91 55\n", + "81.131.77.174 55\n", + "Aeontv 55\n", + "49.146.204.250 55\n", + "178.125.225.205 55\n", + "Dickbuttchocalateeater 55\n", + "2.102.9.193 55\n", + "178.125.142.105 55\n", + "70.124.133.228 55\n", + "Stevennguyen123 55\n", + "Peenident 55\n", + "EveCrew82 54\n", + "Azeemtelecome 54\n", + "2601:188:0:ABE6:65F5:930C:B0B2:CD63 54\n", + "24.186.183.227 54\n", + "Ahmadrezabayani 54\n", + "AlkReadEditView history 54\n", + "183.83.244.172 53\n", + "GeeAichhBee 53\n", + "Muvarna 53\n", + "FartMCtart 53\n", + "2.108.3.204 53\n", + "BadWolfFilms 53\n", + "37.113.19.124 53\n", + "194.81.223.13 53\n", + "Damai.cn 52\n", + "117.68.211.255 52\n", + "Gazorpazorp723 52\n", + "Qikiterast 52\n", + "Sangbadkonika 52\n", + "49.191.131.43 52\n", + "199.120.252.122 52\n", + "125.86.30.16 52\n", + "205.237.30.148 52\n", + "Mysterymanboy 52\n", + "95.87.152.60 51\n", + "Delaviegas 51\n", + "Philafrenzy 51\n", + "125.86.3.193 51\n", + "14.100.132.7 51\n", + "Teamunite 51\n", + "87.252.67.241 51\n", + "206.82.18.126 51\n", + "CitizenZee 50\n", + "40.135.233.254 50\n", + "Wik781 50\n", + "1valkilmer 50\n", + "66.87.100.83 50\n", + "QuarterBones 50\n", + "Wiki manager 11111 50\n", + "Examideas 49\n", + "217.165.64.211 49\n", + "Thantsinaungmrit 49\n", + "24.149.10.170 49\n", + "82.6.186.144 49\n", + "Hihihihihihhihhihihihihi 49\n", + "126.236.49.58 49\n", + "115.133.117.249 49\n", + "Shrunk12 49\n", + "UltraCake 48\n", + "2602:304:CEBF:8590:15D4:6EB8:1C89:E9AE 48\n", + "Ishraqkhan 48\n", + "75.82.112.108 48\n", + "158.59.127.107 48\n", + "Abdousaied76 48\n", + "Sleepfuture 48\n", + " ... \n", + "Khan turbo 1\n", + "72.204.184.11 1\n", + "205.144.100.200 1\n", + "2601:742:201:6DF0:3C08:CD75:A38E:49B9 1\n", + "79.218.249.138 1\n", + "Joshidivya 1\n", + "90.229.246.44 1\n", + "68.117.48.231 1\n", + "84.84.158.168 1\n", + "2605:6001:E605:5700:7527:2E8C:D4A:EB12 1\n", + "Sheffieldbusandtruck 1\n", + "Half-Mad-Media 1\n", + "46.23.105.20 1\n", + "68.55.144.63 1\n", + "Sasclassics 1\n", + "Marymusicale 1\n", + "121.54.54.140 1\n", + "FombyBuzzelliEror 1\n", + "97.82.85.26 1\n", + "77.119.129.116 1\n", + "172.90.218.82 1\n", + "Mrs Wright18 1\n", + "162.205.13.36 1\n", + "69.115.190.107 1\n", + "KleinsorgeStrawhornKjTLI 1\n", + "ZapienIngmirecQJC 1\n", + "Usov3516 1\n", + "How To Hide A Body 1\n", + "Pruthvi nayak 1\n", + "Galaeantonio 1\n", + "Rbrusseltje 1\n", + "174.1.104.139 1\n", + "2600:1002:B127:E70C:D112:D4E1:C139:5C95 1\n", + "Hassan ali ghanieh nazha 1\n", + "PokBirdoTVhl 1\n", + "TheDukeW 1\n", + "94.200.149.22 1\n", + "108.4.71.26 1\n", + "86.25.170.196 1\n", + "67.83.188.159 1\n", + "CurticeRipkaYQVro 1\n", + "172.0.192.93 1\n", + "2602:306:247D:919:907C:53DA:FA2D:D854 1\n", + "Arunkumara1983 1\n", + "66.191.182.116 1\n", + "178.3.37.203 1\n", + "AdamVarch 1\n", + "174.126.73.115 1\n", + "Mhdtarek 1\n", + "ZanghiZarembalnKz 1\n", + "173.24.33.41 1\n", + "198.109.26.125 1\n", + "41.84.213.213 1\n", + "2602:306:80F5:5C70:38C0:CEF0:32D8:B301 1\n", + "86.128.211.144 1\n", + "Habib errant 1\n", + "RumbleyLirianosCww 1\n", + "50.156.94.221 1\n", + "Hajecjbolsns 1\n", + "Juliand665 1\n", + "Soundharganesh 1\n", + "Blackafrikana 1\n", + "DelisaLazoafwml 1\n", + "Mlowc 1\n", + "Shiok 1\n", + "162.252.228.15 1\n", + "182.239.74.76 1\n", + "39.54.193.196 1\n", + "118.200.29.8 1\n", + "66.87.121.23 1\n", + "RomanWeingartnerXSzC 1\n", + "RizzottoAutioHZWu 1\n", + "73.170.48.34 1\n", + "82.207.230.144 1\n", + "BaimKnorephSZe 1\n", + "166.216.194.74 1\n", + "Wnotlaw 1\n", + "Fairyswaggie 1\n", + "Iamvandanakariyal 1\n", + "173.231.72.226 1\n", + "50.90.246.190 1\n", + "117.232.235.183 1\n", + "199.231.178.195 1\n", + "Pappu banarjee 1\n", + "2607:FB90:5CAA:3DCD:F345:F85C:39E7:1D90 1\n", + "DacyViarTFmyg 1\n", + "Frustration2 1\n", + "223.234.201.14 1\n", + "SilbermanSoreyZrJVX 1\n", + "87.112.38.150 1\n", + "Sudheerak 1\n", + "Hannah case 1\n", + "Theonenetworth 1\n", + "Sanjita cruz man 1\n", + "67.79.7.75 1\n", + "2601:4A:8002:6740:4D7D:5F5B:8425:8B7E 1\n", + "172.10.122.3 1\n", + "94.210.110.232 1\n", + "Trevorwilliamhawkes 1\n", + "138.37.234.109 1\n", + "172.56.27.86 1\n", + "Pavanannabathuni 1\n", + "Amyhanyu08 1\n", + "Emojisaremylife 1\n", + "SlayterSiburtEHZn 1\n", + "75.115.103.54 1\n", + "Zofanakos7 1\n", + "Unhasu 1\n", + "Camillacabello97 1\n", + "CamilloMifflinZLeG 1\n", + "Arjaybolgado27 1\n", + "Francea cruz 1\n", + "1.39.87.116 1\n", + "62.155.199.40 1\n", + "Delt01 1\n", + "68.184.83.241 1\n", + "GunnerDenbowAMwVM 1\n", + "220.244.98.88 1\n", + "97.32.74.47 1\n", + "104.229.28.168 1\n", + "Tugioy45 1\n", + "Smiling1 1\n", + "Advocate ahmad 1\n", + "50.134.18.56 1\n", + "86.152.0.86 1\n", + "Dominickhotor 1\n", + "Leonard Juarez 1\n", + "209.52.88.36 1\n", + "2A02:1205:5057:A730:30D7:BB5D:18B9:4EB0 1\n", + "70.66.8.253 1\n", + "2A02:C7D:9B68:D800:9C19:B69D:7C1A:EC65 1\n", + "208.83.7.117 1\n", + "Dahhaixszjin 1\n", + "90.214.98.100 1\n", + "97.103.133.51 1\n", + "Sameet kumble 1\n", + "80.12.55.65 1\n", + "112.196.176.28 1\n", + "NARRATIVEPSYCHOLOGY 1\n", + "Justin cobb 1\n", + "96.243.230.6 1\n", + "217.30.100.196 1\n", + "66.114.26.214 1\n", + "24.3.35.166 1\n", + "24.86.188.68 1\n", + "2602:306:38C5:90C0:9D60:2478:6069:5118 1\n", + "49.204.27.1 1\n", + "Ultimate Beast Master 1\n", + "Narang.jaya 1\n", + "Twenty Three 12 1\n", + "117.247.51.203 1\n", + "94.7.75.47 1\n", + "KaylaKilgore 1\n", + "Lanryadd 1\n", + "108.171.133.171 1\n", + "Sana abdullah ismail 1\n", + "91.125.222.242 1\n", + "2001:569:7757:AB00:4D0:C86A:1BFF:F5F7 1\n", + "75.169.254.249 1\n", + "131.245.205.106 1\n", + "206.229.102.50 1\n", + "208.99.235.6 1\n", + "198.147.8.14 1\n", + "KoolRiley 1\n", + "Bennetrajadvocate 1\n", + "Roxanneelbaff 1\n", + "2602:30B:8279:C629:24F4:9B32:380B:D9EB 1\n", + "71.14.66.184 1\n", + "Difaf 1\n", + "PrindleTuinstrafOKB 1\n", + "Va.nic94 1\n", + "Raven za ninja 1\n", + "Cameronisboss 1\n", + "188.135.45.37 1\n", + "Spydsensemm 1\n", + "80.42.191.98 1\n", + "Consolidate-c-company canada 1\n", + "MaximIsazaLfNiZ 1\n", + "50.173.211.246 1\n", + "Dura1990 1\n", + "37.47.11.27 1\n", + "71.181.78.186 1\n", + "Sam Meurs 1\n", + "166.177.186.133 1\n", + "Jane023 1\n", + "BrannockLykevWoyp 1\n", + "WPD love 1\n", + "McquaidMateusZfEG 1\n", + "219.74.158.134 1\n", + "Stevenspower 1\n", + "122.169.6.227 1\n", + "SamsaGregor 1\n", + "MischlerSablemPDQ 1\n", + "Monkeys4 1\n", + "46.240.245.94 1\n", + "MrSpinno 1\n", + "SpierGaytanDiUzN 1\n", + "Fizzyjamz 1\n", + "Subham Kumar 1\n", + "KudlaHodumspAJK 1\n", + "151.199.191.58 1\n", + "Donkeltie 1\n", + "86.158.41.239 1\n", + "103.1.70.48 1\n", + "Rockstarmrktng 1\n", + "MccomseyFacklerXZTXm 1\n", + "Concertmusic 1\n", + "Dracula 152 1\n", + "208.53.195.38 1\n", + "Andrew Tevis 1\n", + "93.38.248.23 1\n", + "2.216.125.148 1\n", + "Mamoudmassaquoi 1\n", + "Raider Duck 1\n", + "98.116.108.39 1\n", + "24.13.130.36 1\n", + "Poonuggets6254627 1\n", + "KiviShippyWnMTZ 1\n", + "Satyarthi vivek kushawaha 1\n", + "MiscioneKomarvUVA 1\n", + "121.75.230.0 1\n", + "74.15.79.236 1\n", + "129.31.241.10 1\n", + "Alan0540 1\n", + "VuyovichRybergwWai 1\n", + "88.110.124.164 1\n", + "Dapdune 1\n", + "Md.Mushraf Khan 1\n", + "2602:306:CE67:B3C0:ADAF:F458:FD16:D11F 1\n", + "EruthlieTheHealer 1\n", + "ApprenticeFan 1\n", + "TRAPAJ TIGER 1\n", + "BlaineSelbigvyTjI 1\n", + "2A02:C7D:A198:7A00:69C9:928E:DE7:4E07 1\n", + "SevillanoOuimettexYarj 1\n", + "Rumplegodiam 1\n", + "SkovMalechaJDkmz 1\n", + "JamieLorenzo 1\n", + "AV23840 1\n", + "Calendarortodox262 1\n", + "Lucas de Oliveira Bezerra 1\n", + "Jallen8812 1\n", + "Mdizzlero87 1\n", + "82.18.147.201 1\n", + "68.147.130.179 1\n", + "AcAwty19 1\n", + "Subash Bhusal (Nepal) 1\n", + "76.177.122.178 1\n", + "216.246.166.89 1\n", + "Bodo1609 1\n", + "Name: afl_user_text, Length: 139586, dtype: int64\n" + ] } ], "source": [ "df_jan2016 = pd.read_csv(\"quarry-37494-abuselog-entries-en-wiki-in-january-2016-run389216.csv\", sep=',')\n", "\n", "# Number of hits per editor\n", - "df_jan2016['afl_user_text'].value_counts() #TODO intersect users with actions/pages" + "#TODO intersect users with actions/pages\n", + "\n", + "\n", + "with pd.option_context('display.max_rows', 500, 'display.max_columns', 500):\n", + " print(df_jan2016['afl_user_text'].value_counts())" ] }, { @@ -1855,6 +2296,7 @@ } ], "source": [ + "# So, even if we check the top X users (according to hits), they are still responsible for only 4% of the hits\n", "most_user_hits = [1689, 1249, 1133, 715, 697, 674, 559, 556, 533, 473, 430, 317, 307, 291, 279, 274, 247, 239, 229, 222, 222, \n", " 213, 202, 192, 190, 187, 186, 186, 185]\n", "sum(most_user_hits)" @@ -6726,201 +7168,1297 @@ "275175 95.152.44.52 NaN 0 \n", "275208 95.152.44.52 NaN 0 \n", "\n", - " afl_patrolled_by afl_rev_id afl_log_id \n", - "252411 0 NaN NaN \n", - "252412 0 NaN NaN \n", - "252426 0 NaN NaN \n", - "252435 0 NaN NaN \n", - "252436 0 NaN NaN \n", - "252437 0 NaN NaN \n", - "252442 0 NaN NaN \n", - "252451 0 NaN NaN \n", - "252452 0 NaN NaN \n", - "252453 0 NaN NaN \n", - "252468 0 NaN NaN \n", - "252469 0 NaN NaN \n", - "252471 0 NaN NaN \n", - "252472 0 NaN NaN \n", - "252475 0 NaN NaN \n", - "252476 0 NaN NaN \n", - "252481 0 NaN NaN \n", - "252482 0 NaN NaN \n", - "252483 0 NaN NaN \n", - "252484 0 NaN NaN \n", - "252485 0 NaN NaN \n", - "252489 0 NaN NaN \n", - "252496 0 NaN NaN \n", - "252506 0 NaN NaN \n", - "252507 0 NaN NaN \n", - "252510 0 NaN NaN \n", - "252511 0 NaN NaN \n", - "252512 0 NaN NaN \n", - "252513 0 NaN NaN \n", - "252520 0 NaN NaN \n", - "... ... ... ... \n", - "273921 0 NaN NaN \n", - "273929 0 NaN NaN \n", - "273949 0 NaN NaN \n", - "273963 0 NaN NaN \n", - "274041 0 NaN NaN \n", - "274156 0 NaN NaN \n", - "274183 0 NaN NaN \n", - "274188 0 NaN NaN \n", - "274205 0 NaN NaN \n", - "274250 0 NaN NaN \n", - "274291 0 NaN NaN \n", - "274303 0 NaN NaN \n", - "274497 0 NaN NaN \n", - "274498 0 NaN NaN \n", - "274499 0 NaN NaN \n", - "274504 0 NaN NaN \n", - "274505 0 NaN NaN \n", - "274547 0 NaN NaN \n", - "274568 0 NaN NaN \n", - "274570 0 NaN NaN \n", - "274587 0 NaN NaN \n", - "274638 0 NaN NaN \n", - "274645 0 NaN NaN \n", - "274682 0 NaN NaN \n", - "274694 0 NaN NaN \n", - "274701 0 NaN NaN \n", - "274743 0 NaN NaN \n", - "274977 0 NaN NaN \n", - "275175 0 NaN NaN \n", - "275208 0 NaN NaN \n", + " afl_patrolled_by afl_rev_id afl_log_id \n", + "252411 0 NaN NaN \n", + "252412 0 NaN NaN \n", + "252426 0 NaN NaN \n", + "252435 0 NaN NaN \n", + "252436 0 NaN NaN \n", + "252437 0 NaN NaN \n", + "252442 0 NaN NaN \n", + "252451 0 NaN NaN \n", + "252452 0 NaN NaN \n", + "252453 0 NaN NaN \n", + "252468 0 NaN NaN \n", + "252469 0 NaN NaN \n", + "252471 0 NaN NaN \n", + "252472 0 NaN NaN \n", + "252475 0 NaN NaN \n", + "252476 0 NaN NaN \n", + "252481 0 NaN NaN \n", + "252482 0 NaN NaN \n", + "252483 0 NaN NaN \n", + "252484 0 NaN NaN \n", + "252485 0 NaN NaN \n", + "252489 0 NaN NaN \n", + "252496 0 NaN NaN \n", + "252506 0 NaN NaN \n", + "252507 0 NaN NaN \n", + "252510 0 NaN NaN \n", + "252511 0 NaN NaN \n", + "252512 0 NaN NaN \n", + "252513 0 NaN NaN \n", + "252520 0 NaN NaN \n", + "... ... ... ... \n", + "273921 0 NaN NaN \n", + "273929 0 NaN NaN \n", + "273949 0 NaN NaN \n", + "273963 0 NaN NaN \n", + "274041 0 NaN NaN \n", + "274156 0 NaN NaN \n", + "274183 0 NaN NaN \n", + "274188 0 NaN NaN \n", + "274205 0 NaN NaN \n", + "274250 0 NaN NaN \n", + "274291 0 NaN NaN \n", + "274303 0 NaN NaN \n", + "274497 0 NaN NaN \n", + "274498 0 NaN NaN \n", + "274499 0 NaN NaN \n", + "274504 0 NaN NaN \n", + "274505 0 NaN NaN \n", + "274547 0 NaN NaN \n", + "274568 0 NaN NaN \n", + "274570 0 NaN NaN \n", + "274587 0 NaN NaN \n", + "274638 0 NaN NaN \n", + "274645 0 NaN NaN \n", + "274682 0 NaN NaN \n", + "274694 0 NaN NaN \n", + "274701 0 NaN NaN \n", + "274743 0 NaN NaN \n", + "274977 0 NaN NaN \n", + "275175 0 NaN NaN \n", + "275208 0 NaN NaN \n", + "\n", + "[1133 rows x 16 columns]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# What were the very active IPs doing\n", + "df_jan2016[df_jan2016['afl_user_text'] == '95.152.44.52']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A lot of triggers of\n", + "466 \"Userspace & talk page spamming\"\n", + "271 \"Possible spambot\"\n", + "\n", + "Following 2 diffs https://en.wikipedia.org/wiki/Special:AbuseLog/14413885 and https://en.wikipedia.org/wiki/Special:AbuseLog/14413886\n", + "reveal it's the exact same spambot operating from another IP (which seems to belong to the same russian provider)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "edit 300781\n", + "createaccount 71902\n", + "move 193\n", + "autocreateaccount 18\n", + "gatheredit 9\n", + "delete 4\n", + "Name: afl_action, dtype: int64" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Number of hits per editor's actions\n", + "df_jan2016['afl_action'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Int64Index([527, 61, 650, 633, 279, 636, 384, 135, 30, 172,\n", + " ...\n", + " 597, 579, 709, 68, 554, 749, 596, 718, 459, 694],\n", + " dtype='int64', length=138)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_jan2016['afl_filter'].value_counts().index" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>filter</th>\n", + " <th>hits</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>527</th>\n", + " <td>527</td>\n", + " <td>71853</td>\n", + " </tr>\n", + " <tr>\n", + " <th>61</th>\n", + " <td>61</td>\n", + " <td>27072</td>\n", + " </tr>\n", + " <tr>\n", + " <th>650</th>\n", + " <td>650</td>\n", + " <td>24264</td>\n", + " </tr>\n", + " <tr>\n", + " <th>633</th>\n", + " <td>633</td>\n", + " <td>21099</td>\n", + " </tr>\n", + " <tr>\n", + " <th>279</th>\n", + " <td>279</td>\n", + " <td>18460</td>\n", + " </tr>\n", + " <tr>\n", + " <th>636</th>\n", + " <td>636</td>\n", + " <td>17279</td>\n", + " </tr>\n", + " <tr>\n", + " <th>384</th>\n", + " <td>384</td>\n", + " <td>15080</td>\n", + " </tr>\n", + " <tr>\n", + " <th>135</th>\n", + " <td>135</td>\n", + " <td>10028</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>30</td>\n", + " <td>7829</td>\n", + " </tr>\n", + " <tr>\n", + " <th>172</th>\n", + " <td>172</td>\n", + " <td>7471</td>\n", + " </tr>\n", + " <tr>\n", + " <th>271</th>\n", + " <td>271</td>\n", + " <td>7192</td>\n", + " </tr>\n", + " <tr>\n", + " <th>380</th>\n", + " <td>380</td>\n", + " <td>6554</td>\n", + " </tr>\n", + " <tr>\n", + " <th>80</th>\n", + " <td>80</td>\n", + " <td>6530</td>\n", + " </tr>\n", + " <tr>\n", + " <th>364</th>\n", + " <td>364</td>\n", + " <td>6238</td>\n", + " </tr>\n", + " <tr>\n", + " <th>686</th>\n", + " <td>686</td>\n", + " <td>6089</td>\n", + " </tr>\n", + " <tr>\n", + " <th>712</th>\n", + " <td>712</td>\n", + " <td>5597</td>\n", + " </tr>\n", + " <tr>\n", + " <th>466</th>\n", + " <td>466</td>\n", + " <td>5555</td>\n", + " </tr>\n", + " <tr>\n", + " <th>432</th>\n", + " <td>432</td>\n", + " <td>5398</td>\n", + " </tr>\n", + " <tr>\n", + " <th>220</th>\n", + " <td>220</td>\n", + " <td>5385</td>\n", + " </tr>\n", + " <tr>\n", + " <th>550</th>\n", + " <td>550</td>\n", + " <td>5215</td>\n", + " </tr>\n", + " <tr>\n", + " <th>189</th>\n", + " <td>189</td>\n", + " <td>4730</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>3</td>\n", + " <td>4656</td>\n", + " </tr>\n", + " <tr>\n", + " <th>148</th>\n", + " <td>148</td>\n", + " <td>4470</td>\n", + " </tr>\n", + " <tr>\n", + " <th>260</th>\n", + " <td>260</td>\n", + " <td>4148</td>\n", + " </tr>\n", + " <tr>\n", + " <th>614</th>\n", + " <td>614</td>\n", + " <td>4120</td>\n", + " </tr>\n", + " <tr>\n", + " <th>231</th>\n", + " <td>231</td>\n", + " <td>3398</td>\n", + " </tr>\n", + " <tr>\n", + " <th>631</th>\n", + " <td>631</td>\n", + " <td>3349</td>\n", + " </tr>\n", + " <tr>\n", + " <th>225</th>\n", + " <td>225</td>\n", + " <td>3245</td>\n", + " </tr>\n", + " <tr>\n", + " <th>46</th>\n", + " <td>46</td>\n", + " <td>3174</td>\n", + " </tr>\n", + " <tr>\n", + " <th>680</th>\n", + " <td>680</td>\n", + " <td>3134</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>242</th>\n", + " <td>242</td>\n", + " <td>19</td>\n", + " </tr>\n", + " <tr>\n", + " <th>706</th>\n", + " <td>706</td>\n", + " <td>19</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>16</td>\n", + " <td>19</td>\n", + " </tr>\n", + " <tr>\n", + " <th>734</th>\n", + " <td>734</td>\n", + " <td>18</td>\n", + " </tr>\n", + " <tr>\n", + " <th>264</th>\n", + " <td>264</td>\n", + " <td>18</td>\n", + " </tr>\n", + " <tr>\n", + " <th>710</th>\n", + " <td>710</td>\n", + " <td>17</td>\n", + " </tr>\n", + " <tr>\n", + " <th>666</th>\n", + " <td>666</td>\n", + " <td>15</td>\n", + " </tr>\n", + " <tr>\n", + " <th>722</th>\n", + " <td>722</td>\n", + " <td>14</td>\n", + " </tr>\n", + " <tr>\n", + " <th>167</th>\n", + " <td>167</td>\n", + " <td>13</td>\n", + " </tr>\n", + " <tr>\n", + " <th>294</th>\n", + " <td>294</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>624</th>\n", + " <td>624</td>\n", + " <td>9</td>\n", + " </tr>\n", + " <tr>\n", + " <th>727</th>\n", + " <td>727</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>651</th>\n", + " <td>651</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>637</th>\n", + " <td>637</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>745</th>\n", + " <td>745</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>52</th>\n", + " <td>52</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>674</th>\n", + " <td>674</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>748</th>\n", + " <td>748</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>690</th>\n", + " <td>690</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>597</th>\n", + " <td>597</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>579</th>\n", + " <td>579</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>709</th>\n", + " <td>709</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>68</th>\n", + " <td>68</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>554</th>\n", + " <td>554</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>749</th>\n", + " <td>749</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>596</th>\n", + " <td>596</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>718</th>\n", + " <td>718</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>459</th>\n", + " <td>459</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>694</th>\n", + " <td>694</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>138 rows × 2 columns</p>\n", + "</div>" + ], + "text/plain": [ + " filter hits\n", + "527 527 71853\n", + "61 61 27072\n", + "650 650 24264\n", + "633 633 21099\n", + "279 279 18460\n", + "636 636 17279\n", + "384 384 15080\n", + "135 135 10028\n", + "30 30 7829\n", + "172 172 7471\n", + "271 271 7192\n", + "380 380 6554\n", + "80 80 6530\n", + "364 364 6238\n", + "686 686 6089\n", + "712 712 5597\n", + "466 466 5555\n", + "432 432 5398\n", + "220 220 5385\n", + "550 550 5215\n", + "189 189 4730\n", + "3 3 4656\n", + "148 148 4470\n", + "260 260 4148\n", + "614 614 4120\n", + "231 231 3398\n", + "631 631 3349\n", + "225 225 3245\n", + "46 46 3174\n", + "680 680 3134\n", + ".. ... ...\n", + "242 242 19\n", + "706 706 19\n", + "16 16 19\n", + "734 734 18\n", + "264 264 18\n", + "710 710 17\n", + "666 666 15\n", + "722 722 14\n", + "167 167 13\n", + "294 294 10\n", + "624 624 9\n", + "727 727 8\n", + "651 651 8\n", + "637 637 8\n", + "745 745 6\n", + "52 52 6\n", + "674 674 6\n", + "748 748 5\n", + "690 690 5\n", + "2 2 5\n", + "597 597 4\n", + "579 579 4\n", + "709 709 4\n", + "68 68 4\n", + "554 554 3\n", + "749 749 3\n", + "596 596 1\n", + "718 718 1\n", + "459 459 1\n", + "694 694 1\n", + "\n", + "[138 rows x 2 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Number of hits per filter\n", + "hits_jan2016 = pd.DataFrame(data={'filter': df_jan2016['afl_filter'].value_counts().index, 'hits': df_jan2016['afl_filter'].value_counts()})\n", + "hits_jan2016" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>filter</th>\n", + " <th>hits</th>\n", + " <th>af_id</th>\n", + " <th>af_public_comments</th>\n", + " <th>manual_tags</th>\n", + " <th>af_actions</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>527</td>\n", + " <td>71853</td>\n", + " <td>527</td>\n", + " <td>T34234: log/throttle possible sleeper account ...</td>\n", + " <td>sockpuppetry</td>\n", + " <td>throttle</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>61</td>\n", + " <td>27072</td>\n", + " <td>61</td>\n", + " <td>New user removing references</td>\n", + " <td>good_faith_refs</td>\n", + " <td>tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>650</td>\n", + " <td>24264</td>\n", + " <td>650</td>\n", + " <td>Creation of a new article without any categories</td>\n", + " <td>general_tracking</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>633</td>\n", + " <td>21099</td>\n", + " <td>633</td>\n", + " <td>Possible canned edit summary</td>\n", + " <td>general_vandalism</td>\n", + " <td>tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>279</td>\n", + " <td>18460</td>\n", + " <td>279</td>\n", + " <td>Repeated attempts to vandalize</td>\n", + " <td>unclear</td>\n", + " <td>throttle,tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>636</td>\n", + " <td>17279</td>\n", + " <td>636</td>\n", + " <td>Unexplained removal of sourced content</td>\n", + " <td>good_faith_deletion</td>\n", + " <td>warn</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>384</td>\n", + " <td>15080</td>\n", + " <td>384</td>\n", + " <td>Addition of bad words or other vandalism</td>\n", + " <td>profanity_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>135</td>\n", + " <td>10028</td>\n", + " <td>135</td>\n", + " <td>Repeating characters</td>\n", + " <td>silly_vandalism</td>\n", + " <td>warn,tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>30</td>\n", + " <td>7829</td>\n", + " <td>30</td>\n", + " <td>Large deletion from article by new editors</td>\n", + " <td>good_faith_deletion</td>\n", + " <td>warn,tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>172</td>\n", + " <td>7471</td>\n", + " <td>172</td>\n", + " <td>Section blanking</td>\n", + " <td>good_faith_deletion</td>\n", + " <td>tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>271</td>\n", + " <td>7192</td>\n", + " <td>271</td>\n", + " <td>Possible spambot</td>\n", + " <td>spam</td>\n", + " <td>warn</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>380</td>\n", + " <td>6554</td>\n", + " <td>380</td>\n", + " <td>Multiple obscenities</td>\n", + " <td>profanity_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>80</td>\n", + " <td>6530</td>\n", + " <td>80</td>\n", + " <td>Link spamming</td>\n", + " <td>spam</td>\n", + " <td>throttle,warn,tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>364</td>\n", + " <td>6238</td>\n", + " <td>364</td>\n", + " <td>Changing the name in a BLP infobox</td>\n", + " <td>hoaxing</td>\n", + " <td>tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>686</td>\n", + " <td>6089</td>\n", + " <td>686</td>\n", + " <td>IP adding possibly unreferenced material to BLP</td>\n", + " <td>hoaxing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>712</td>\n", + " <td>5597</td>\n", + " <td>712</td>\n", + " <td>Possibly changing date of birth in infobox</td>\n", + " <td>hoaxing</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>466</td>\n", + " <td>5555</td>\n", + " <td>466</td>\n", + " <td>Userspace & talk page spamming</td>\n", + " <td>spam</td>\n", + " <td>warn,tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>432</td>\n", + " <td>5398</td>\n", + " <td>432</td>\n", + " <td>Starting new line with lowercase letters</td>\n", + " <td>good_faith_orthography</td>\n", + " <td>warn,tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>220</td>\n", + " <td>5385</td>\n", + " <td>220</td>\n", + " <td>Adding external images/links</td>\n", + " <td>good_faith_external_resources</td>\n", + " <td>warn,tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>550</td>\n", + " <td>5215</td>\n", + " <td>550</td>\n", + " <td>nowiki tags inserted into an article</td>\n", + " <td>unclear</td>\n", + " <td>tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>189</td>\n", + " <td>4730</td>\n", + " <td>189</td>\n", + " <td>BLP vandalism or libel</td>\n", + " <td>personal_attacks</td>\n", + " <td>tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>3</td>\n", + " <td>4656</td>\n", + " <td>3</td>\n", + " <td>New user blanking articles</td>\n", + " <td>good_faith_deletion</td>\n", + " <td>warn,tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>148</td>\n", + " <td>4470</td>\n", + " <td>148</td>\n", + " <td>Users creating autobiographies</td>\n", + " <td>self_promotion</td>\n", + " <td>tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>260</td>\n", + " <td>4148</td>\n", + " <td>260</td>\n", + " <td>Common vandal phrases</td>\n", + " <td>profanity_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>614</td>\n", + " <td>4120</td>\n", + " <td>614</td>\n", + " <td>Yolo swag and other vandalism trends</td>\n", + " <td>hidden_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>231</td>\n", + " <td>3398</td>\n", + " <td>231</td>\n", + " <td>Long string of characters containing no spaces</td>\n", + " <td>silly_vandalism</td>\n", + " <td>warn,tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>631</td>\n", + " <td>3349</td>\n", + " <td>631</td>\n", + " <td>Extraneous toolbar markup</td>\n", + " <td>good_faith_test_edits</td>\n", + " <td>tag</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>225</td>\n", + " <td>3245</td>\n", + " <td>225</td>\n", + " <td>Vandalism in all caps</td>\n", + " <td>profanity_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>46</td>\n", + " <td>3174</td>\n", + " <td>46</td>\n", + " <td>\"Poop\" vandalism</td>\n", + " <td>profanity_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>680</td>\n", + " <td>3134</td>\n", + " <td>680</td>\n", + " <td>Adding emoji unicode characters</td>\n", + " <td>general_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>108</th>\n", + " <td>242</td>\n", + " <td>19</td>\n", + " <td>242</td>\n", + " <td>Redirecting a substantial existing page - new ...</td>\n", + " <td>hidden_vandalism</td>\n", + " <td>throttle,disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>109</th>\n", + " <td>706</td>\n", + " <td>19</td>\n", + " <td>706</td>\n", + " <td>Dutch football vandalism</td>\n", + " <td>hidden_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>110</th>\n", + " <td>16</td>\n", + " <td>19</td>\n", + " <td>16</td>\n", + " <td>Prolific socker I</td>\n", + " <td>sockpuppetry</td>\n", + " <td>throttle,disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>111</th>\n", + " <td>734</td>\n", + " <td>18</td>\n", + " <td>734</td>\n", + " <td>McAusten</td>\n", + " <td>hidden_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>112</th>\n", + " <td>264</td>\n", + " <td>18</td>\n", + " <td>264</td>\n", + " <td>Specific-page vandalism</td>\n", + " <td>hidden_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>113</th>\n", + " <td>710</td>\n", + " <td>17</td>\n", + " <td>710</td>\n", + " <td>Muhammad vandal</td>\n", + " <td>religiously_motivated</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>114</th>\n", + " <td>666</td>\n", + " <td>15</td>\n", + " <td>666</td>\n", + " <td>Aas Mohammad</td>\n", + " <td>hidden_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>115</th>\n", + " <td>722</td>\n", + " <td>14</td>\n", + " <td>722</td>\n", + " <td>Please Accept</td>\n", + " <td>good_faith_edit_summary</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>116</th>\n", + " <td>167</td>\n", + " <td>13</td>\n", + " <td>167</td>\n", + " <td>Botched submissions to Articles for creation</td>\n", + " <td>good_faith_wiki_syntax</td>\n", + " <td>warn</td>\n", + " </tr>\n", + " <tr>\n", + " <th>117</th>\n", + " <td>294</td>\n", + " <td>10</td>\n", + " <td>294</td>\n", + " <td>Personal attacks</td>\n", + " <td>personal_attacks</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>118</th>\n", + " <td>624</td>\n", + " <td>9</td>\n", + " <td>624</td>\n", + " <td>CheckUser Sock block</td>\n", + " <td>hidden_vandalism</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>119</th>\n", + " <td>727</td>\n", + " <td>8</td>\n", + " <td>727</td>\n", + " <td>Samwalton9 test filter</td>\n", + " <td>test</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>120</th>\n", + " <td>651</td>\n", + " <td>8</td>\n", + " <td>651</td>\n", + " <td>AfricaTanz</td>\n", + " <td>hidden_vandalism</td>\n", + " <td>warn,disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>121</th>\n", + " <td>637</td>\n", + " <td>8</td>\n", + " <td>637</td>\n", + " <td>MusikAnimal test filter</td>\n", + " <td>test</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>122</th>\n", + " <td>745</td>\n", + " <td>6</td>\n", + " <td>745</td>\n", + " <td>Talk page abuse</td>\n", + " <td>talk_page_vandalism</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>123</th>\n", + " <td>52</td>\n", + " <td>6</td>\n", + " <td>52</td>\n", + " <td>Edit summary vandalism II</td>\n", + " <td>hidden_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>124</th>\n", + " <td>674</td>\n", + " <td>6</td>\n", + " <td>674</td>\n", + " <td>Long term infobox vandal</td>\n", + " <td>long_term_abuse</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>125</th>\n", + " <td>748</td>\n", + " <td>5</td>\n", + " <td>748</td>\n", + " <td>pp-30-500 editing restriction</td>\n", + " <td>unclear</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>126</th>\n", + " <td>690</td>\n", + " <td>5</td>\n", + " <td>690</td>\n", + " <td>Smoothest Ashu</td>\n", + " <td>sockpuppetry</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>127</th>\n", + " <td>2</td>\n", + " <td>5</td>\n", + " <td>2</td>\n", + " <td>Test filter: for testing private filters</td>\n", + " <td>test</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>128</th>\n", + " <td>597</td>\n", + " <td>4</td>\n", + " <td>597</td>\n", + " <td>Europefan</td>\n", + " <td>hidden_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>129</th>\n", + " <td>579</td>\n", + " <td>4</td>\n", + " <td>579</td>\n", + " <td>Possible sockpuppet account creations</td>\n", + " <td>sockpuppetry</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>130</th>\n", + " <td>709</td>\n", + " <td>4</td>\n", + " <td>709</td>\n", + " <td>LTA filter</td>\n", + " <td>long_term_abuse</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>131</th>\n", + " <td>68</td>\n", + " <td>4</td>\n", + " <td>68</td>\n", + " <td>Pagemove throttle for new users</td>\n", + " <td>page_move_vandalism</td>\n", + " <td>throttle,disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>132</th>\n", + " <td>554</td>\n", + " <td>3</td>\n", + " <td>554</td>\n", + " <td>top100 blog charts</td>\n", + " <td>spam</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>133</th>\n", + " <td>749</td>\n", + " <td>3</td>\n", + " <td>749</td>\n", + " <td>SS disruption</td>\n", + " <td>hidden_vandalism</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>134</th>\n", + " <td>596</td>\n", + " <td>1</td>\n", + " <td>596</td>\n", + " <td>Elevator vandal</td>\n", + " <td>hidden_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>135</th>\n", + " <td>718</td>\n", + " <td>1</td>\n", + " <td>718</td>\n", + " <td>Prolific socker III</td>\n", + " <td>sockpuppetry</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>136</th>\n", + " <td>459</td>\n", + " <td>1</td>\n", + " <td>459</td>\n", + " <td>Long-term block evasion</td>\n", + " <td>long_term_abuse</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " <tr>\n", + " <th>137</th>\n", + " <td>694</td>\n", + " <td>1</td>\n", + " <td>694</td>\n", + " <td>Moves to or from the Module namespace</td>\n", + " <td>page_move_vandalism</td>\n", + " <td>disallow</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>138 rows × 6 columns</p>\n", + "</div>" + ], + "text/plain": [ + " filter hits af_id af_public_comments \\\n", + "0 527 71853 527 T34234: log/throttle possible sleeper account ... \n", + "1 61 27072 61 New user removing references \n", + "2 650 24264 650 Creation of a new article without any categories \n", + "3 633 21099 633 Possible canned edit summary \n", + "4 279 18460 279 Repeated attempts to vandalize \n", + "5 636 17279 636 Unexplained removal of sourced content \n", + "6 384 15080 384 Addition of bad words or other vandalism \n", + "7 135 10028 135 Repeating characters \n", + "8 30 7829 30 Large deletion from article by new editors \n", + "9 172 7471 172 Section blanking \n", + "10 271 7192 271 Possible spambot \n", + "11 380 6554 380 Multiple obscenities \n", + "12 80 6530 80 Link spamming \n", + "13 364 6238 364 Changing the name in a BLP infobox \n", + "14 686 6089 686 IP adding possibly unreferenced material to BLP \n", + "15 712 5597 712 Possibly changing date of birth in infobox \n", + "16 466 5555 466 Userspace & talk page spamming \n", + "17 432 5398 432 Starting new line with lowercase letters \n", + "18 220 5385 220 Adding external images/links \n", + "19 550 5215 550 nowiki tags inserted into an article \n", + "20 189 4730 189 BLP vandalism or libel \n", + "21 3 4656 3 New user blanking articles \n", + "22 148 4470 148 Users creating autobiographies \n", + "23 260 4148 260 Common vandal phrases \n", + "24 614 4120 614 Yolo swag and other vandalism trends \n", + "25 231 3398 231 Long string of characters containing no spaces \n", + "26 631 3349 631 Extraneous toolbar markup \n", + "27 225 3245 225 Vandalism in all caps \n", + "28 46 3174 46 \"Poop\" vandalism \n", + "29 680 3134 680 Adding emoji unicode characters \n", + ".. ... ... ... ... \n", + "108 242 19 242 Redirecting a substantial existing page - new ... \n", + "109 706 19 706 Dutch football vandalism \n", + "110 16 19 16 Prolific socker I \n", + "111 734 18 734 McAusten \n", + "112 264 18 264 Specific-page vandalism \n", + "113 710 17 710 Muhammad vandal \n", + "114 666 15 666 Aas Mohammad \n", + "115 722 14 722 Please Accept \n", + "116 167 13 167 Botched submissions to Articles for creation \n", + "117 294 10 294 Personal attacks \n", + "118 624 9 624 CheckUser Sock block \n", + "119 727 8 727 Samwalton9 test filter \n", + "120 651 8 651 AfricaTanz \n", + "121 637 8 637 MusikAnimal test filter \n", + "122 745 6 745 Talk page abuse \n", + "123 52 6 52 Edit summary vandalism II \n", + "124 674 6 674 Long term infobox vandal \n", + "125 748 5 748 pp-30-500 editing restriction \n", + "126 690 5 690 Smoothest Ashu \n", + "127 2 5 2 Test filter: for testing private filters \n", + "128 597 4 597 Europefan \n", + "129 579 4 579 Possible sockpuppet account creations \n", + "130 709 4 709 LTA filter \n", + "131 68 4 68 Pagemove throttle for new users \n", + "132 554 3 554 top100 blog charts \n", + "133 749 3 749 SS disruption \n", + "134 596 1 596 Elevator vandal \n", + "135 718 1 718 Prolific socker III \n", + "136 459 1 459 Long-term block evasion \n", + "137 694 1 694 Moves to or from the Module namespace \n", + "\n", + " manual_tags af_actions \n", + "0 sockpuppetry throttle \n", + "1 good_faith_refs tag \n", + "2 general_tracking NaN \n", + "3 general_vandalism tag \n", + "4 unclear throttle,tag \n", + "5 good_faith_deletion warn \n", + "6 profanity_vandalism disallow \n", + "7 silly_vandalism warn,tag \n", + "8 good_faith_deletion warn,tag \n", + "9 good_faith_deletion tag \n", + "10 spam warn \n", + "11 profanity_vandalism disallow \n", + "12 spam throttle,warn,tag \n", + "13 hoaxing tag \n", + "14 hoaxing NaN \n", + "15 hoaxing NaN \n", + "16 spam warn,tag \n", + "17 good_faith_orthography warn,tag \n", + "18 good_faith_external_resources warn,tag \n", + "19 unclear tag \n", + "20 personal_attacks tag \n", + "21 good_faith_deletion warn,tag \n", + "22 self_promotion tag \n", + "23 profanity_vandalism disallow \n", + "24 hidden_vandalism disallow \n", + "25 silly_vandalism warn,tag \n", + "26 good_faith_test_edits tag \n", + "27 profanity_vandalism disallow \n", + "28 profanity_vandalism disallow \n", + "29 general_vandalism disallow \n", + ".. ... ... \n", + "108 hidden_vandalism throttle,disallow \n", + "109 hidden_vandalism disallow \n", + "110 sockpuppetry throttle,disallow \n", + "111 hidden_vandalism disallow \n", + "112 hidden_vandalism disallow \n", + "113 religiously_motivated disallow \n", + "114 hidden_vandalism disallow \n", + "115 good_faith_edit_summary NaN \n", + "116 good_faith_wiki_syntax warn \n", + "117 personal_attacks disallow \n", + "118 hidden_vandalism NaN \n", + "119 test NaN \n", + "120 hidden_vandalism warn,disallow \n", + "121 test NaN \n", + "122 talk_page_vandalism NaN \n", + "123 hidden_vandalism disallow \n", + "124 long_term_abuse disallow \n", + "125 unclear NaN \n", + "126 sockpuppetry disallow \n", + "127 test NaN \n", + "128 hidden_vandalism disallow \n", + "129 sockpuppetry NaN \n", + "130 long_term_abuse disallow \n", + "131 page_move_vandalism throttle,disallow \n", + "132 spam disallow \n", + "133 hidden_vandalism NaN \n", + "134 hidden_vandalism disallow \n", + "135 sockpuppetry disallow \n", + "136 long_term_abuse disallow \n", + "137 page_move_vandalism disallow \n", "\n", - "[1133 rows x 16 columns]" + "[138 rows x 6 columns]" ] }, - "execution_count": 53, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# What were the very active IPs doing\n", - "df_jan2016[df_jan2016['afl_user_text'] == '95.152.44.52']" + "hits_jan2016_tags = hits_jan2016.merge(df_2nd[['af_id', 'af_public_comments', 'manual_tags', 'af_actions']], how='inner', left_on='filter', right_on='af_id')\n", + "hits_jan2016_tags" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "A lot of triggers of\n", - "466 \"Userspace & talk page spamming\"\n", - "271 \"Possible spambot\"\n", - "\n", - "Following 2 diffs https://en.wikipedia.org/wiki/Special:AbuseLog/14413885 and https://en.wikipedia.org/wiki/Special:AbuseLog/14413886\n", - "reveal it's the exact same spambot operating from another IP (which seems to belong to the same russian provider)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "edit 300781\n", - "createaccount 71902\n", - "move 193\n", - "autocreateaccount 18\n", - "gatheredit 9\n", - "delete 4\n", - "Name: afl_action, dtype: int64" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Number of hits per editor's actions\n", - "df_jan2016['afl_action'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "527 71853\n", - "61 27072\n", - "650 24264\n", - "633 21099\n", - "279 18460\n", - "636 17279\n", - "384 15080\n", - "135 10028\n", - "30 7829\n", - "172 7471\n", - "271 7192\n", - "380 6554\n", - "80 6530\n", - "364 6238\n", - "686 6089\n", - "712 5597\n", - "466 5555\n", - "432 5398\n", - "220 5385\n", - "550 5215\n", - "189 4730\n", - "3 4656\n", - "148 4470\n", - "260 4148\n", - "614 4120\n", - "231 3398\n", - "631 3349\n", - "225 3245\n", - "46 3174\n", - "680 3134\n", - " ... \n", - "242 19\n", - "706 19\n", - "16 19\n", - "734 18\n", - "264 18\n", - "710 17\n", - "666 15\n", - "722 14\n", - "167 13\n", - "294 10\n", - "624 9\n", - "727 8\n", - "651 8\n", - "637 8\n", - "745 6\n", - "52 6\n", - "674 6\n", - "748 5\n", - "690 5\n", - "2 5\n", - "597 4\n", - "579 4\n", - "709 4\n", - "68 4\n", - "554 3\n", - "749 3\n", - "596 1\n", - "718 1\n", - "459 1\n", - "694 1\n", - "Name: afl_filter, Length: 138, dtype: int64" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Number of hits per filter\n", - "df_jan2016['afl_filter'].value_counts()" + "Newest filters with a lot of hits are ID 686 (IP adding possibly unreferenced material to BLP) and 712 (Possibly changing date of birth in infobox)" ] }, { @@ -13808,7 +15346,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 26, "metadata": { "scrolled": true }, @@ -13859,7 +15397,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 27, "metadata": { "scrolled": true }, @@ -13992,7 +15530,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -14108,7 +15646,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 29, "metadata": {}, "outputs": [ {