Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
DBS-project
Manage
Activity
Members
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Locked files
Deploy
Releases
Container registry
Model registry
Analyze
Contributor analytics
Repository analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
hlm
DBS-project
Commits
54773273
Commit
54773273
authored
8 years ago
by
Maria Hartmann
Browse files
Options
Downloads
Patches
Plain Diff
nr. 2,3
parent
432f4acc
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Clean_commented.java
+643
-0
643 additions, 0 deletions
Clean_commented.java
with
643 additions
and
0 deletions
Clean_commented.java
0 → 100644
+
643
−
0
View file @
54773273
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.sql.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Scanner;
import java.util.regex.Matcher;

import javax.naming.spi.DirStateFactory.Result;
/**
 * Immutable value object for the "tweet" entity.
 *
 * All fields are assigned once in the constructor and only exposed through getters,
 * so they are declared {@code final}.
 */
class Tweet {
    private final String original_author;
    private final String tweet_text;
    private final String tweet_time;
    private final int tweet_ID;
    private final int retweet_count;
    private final int favorite_count;

    /**
     * @param original_author author the tweet is attributed to
     * @param tweet_text      text of the tweet
     * @param tweet_time      time of the tweet, kept as the string it was read as
     * @param tweet_ID        numeric ID of the tweet
     * @param retweet_count   number of retweets
     * @param favorite_count  number of favorites
     */
    public Tweet(String original_author, String tweet_text, String tweet_time,
                 int tweet_ID, int retweet_count, int favorite_count) {
        this.original_author = original_author;
        this.tweet_text = tweet_text;
        this.tweet_time = tweet_time;
        this.tweet_ID = tweet_ID;
        this.retweet_count = retweet_count;
        this.favorite_count = favorite_count;
    }

    /** @return the author the tweet is attributed to */
    public String get_original_author() {
        return original_author;
    }

    /** @return the text of the tweet */
    public String get_tweet_text() {
        return tweet_text;
    }

    /** @return the time of the tweet as an unparsed string */
    public String get_tweet_time() {
        return tweet_time;
    }

    /** @return the numeric ID of the tweet */
    public int get_tweet_ID() {
        return tweet_ID;
    }

    /** @return the number of retweets */
    public int get_retweet_count() {
        return retweet_count;
    }

    /** @return the number of favorites */
    public int get_favorite_count() {
        return favorite_count;
    }
}
// end of class
/**
 * Immutable value object for the "hashtag" entity of the ER model: a numeric ID plus
 * the hashtag text.
 */
class Hashtag {
    private final int hashtag_ID;
    private final String hashtag_text;

    /**
     * @param hashtag_ID   numeric ID of the hashtag
     * @param hashtag_text text of the hashtag
     */
    public Hashtag(int hashtag_ID, String hashtag_text) {
        this.hashtag_ID = hashtag_ID;
        this.hashtag_text = hashtag_text;
    }

    /** @return the numeric ID of the hashtag */
    public int get_hashtag_ID() {
        return hashtag_ID;
    }

    /** @return the text of the hashtag */
    public String get_hashtag_text() {
        return hashtag_text;
    }
}
// end of class hashtag
/**
 * Immutable value object for the "user" entity of the ER model: a numeric ID plus
 * the user's handle.
 */
class User {
    private final String handle;
    private final int user_id;

    /**
     * @param user_id numeric ID of the user
     * @param handle  handle of the user
     */
    public User(int user_id, String handle) {
        this.user_id = user_id;
        this.handle = handle;
    }

    /** @return the handle of the user */
    public String get_handle() {
        return handle;
    }

    /** @return the numeric ID of the user */
    public int get_user_id() {
        return user_id;
    }
}
// end of class user
/**
 * Immutable value object for the user-tweet relation of the ER model: pairs a user ID
 * with a tweet ID.
 */
class user2tweet {
    private final int user_id;
    private final int tweet_id;

    /**
     * @param user_id  ID of the user
     * @param tweet_id ID of the tweet
     */
    public user2tweet(int user_id, int tweet_id) {
        this.user_id = user_id;
        this.tweet_id = tweet_id;
    }

    /** @return the ID of the user */
    public int get_user_id() {
        return user_id;
    }

    /** @return the ID of the tweet */
    public int get_tweet_id() {
        return tweet_id;
    }
}
// end of class
/**
 * Immutable value object for the tweet-hashtag relation: pairs a tweet ID with a
 * hashtag ID.
 */
class tweet2hashtag {
    private final int tweet_id;
    private final int hashtag_id;

    /**
     * @param tweet_id   ID of the tweet
     * @param hashtag_id ID of the hashtag
     */
    public tweet2hashtag(int tweet_id, int hashtag_id) {
        this.tweet_id = tweet_id;
        this.hashtag_id = hashtag_id;
    }

    /** @return the ID of the tweet */
    public int get_tweet_id() {
        return tweet_id;
    }

    /** @return the ID of the hashtag */
    public int get_hashtag_id() {
        return hashtag_id;
    }
}
/**
 * Immutable value object pairing two tweet IDs, named "Original" and "Copy".
 *
 * The fields stay package-private (as before) so existing same-package readers keep
 * compiling; they are now {@code final} because nothing reassigns them.
 */
class tweet2Retweet {
    final int tweet_id_Original;
    final int tweet_id_Copy;

    /**
     * @param tweet_id_Original ID stored as the "original" tweet
     * @param tweet_id_Copy     ID stored as the "copy" tweet
     */
    public tweet2Retweet(int tweet_id_Original, int tweet_id_Copy) {
        this.tweet_id_Original = tweet_id_Original;
        this.tweet_id_Copy = tweet_id_Copy;
    }

    /** @return the ID stored as the "original" tweet */
    public int get_tweet_id_Original() {
        return tweet_id_Original;
    }

    /** @return the ID stored as the "copy" tweet */
    public int get_tweet_id_Copy() {
        return tweet_id_Copy;
    }
}
// end of class
//#############################################################################################################################################
public
class
Clean
{
// Tweet objects created by readCSV, one per parsed record.
public ArrayList<Tweet> tweetList = new ArrayList<Tweet>();
// Hashtag entities (ID + text) that readCSV builds from hashtagList_entire.
public ArrayList<Hashtag> HashtagList = new ArrayList<Hashtag>();
// User entities (ID + handle) that readCSV builds from userList.
public ArrayList<User> tweetUserList = new ArrayList<User>();
// Handles (first CSV field of each record) collected by readCSV; de-duplicated there with removeDup.
public ArrayList<String> userList = new ArrayList<String>();
// Hashtag strings accumulated by concati, which only adds strings not yet present.
public ArrayList<String> hashtagList_entire = new ArrayList<String>();
/** @return the live list of tweets held by this instance (not a copy) */
public ArrayList<Tweet> get_tweetList() {
    return tweetList;
}
/** @return the live list of hashtag entities held by this instance (not a copy) */
public ArrayList<Hashtag> get_HashtagList() {
    return HashtagList;
}
/** @return the live list of user entities held by this instance (not a copy) */
public ArrayList<User> get_tweetUserList() {
    return tweetUserList;
}
/**
 * Prints every tweet in {@code tweetList} to standard output, one per line, as
 * "&lt;tweet ID&gt; &lt;tweet text&gt;".
 */
public void printTweet() {
    for (Tweet current : this.tweetList) {
        int id = current.get_tweet_ID();
        String text = current.get_tweet_text();
        System.out.println(id + " " + text);
    }
}
//#####################################################################################################################################
/**
 * Prints the content of a text file to standard output, skipping the first (header)
 * line and prefixing every following line with a zero-based running number.
 *
 * The header is skipped with {@code nextLine()}, so a header containing whitespace is
 * skipped as a whole, and an empty file is handled without an exception. The scanner
 * is closed in a {@code finally} block so the file handle is released even if reading
 * fails.
 *
 * @param xfileLocation path of the file to print; if it does not exist the stack
 *                      trace of the {@link FileNotFoundException} is printed
 */
public static void justRead(String xfileLocation) {
    Scanner scanIn = null;
    try {
        scanIn = new Scanner(new BufferedReader(new FileReader(xfileLocation)));
        if (scanIn.hasNextLine()) {
            scanIn.nextLine(); // skip header line
        }
        int counter = 0; // number of lines printed so far (header excluded)
        while (scanIn.hasNextLine()) {
            System.out.println(counter + " " + scanIn.nextLine());
            counter++;
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } finally {
        if (scanIn != null) {
            scanIn.close();
        }
    }
}
// end of justRead
//############################################################################################################################################################
/**
 * Returns a new list containing the elements of {@code hashList} without duplicates,
 * keeping the first occurrence of each element in its original position.
 *
 * A {@link LinkedHashSet} does the de-duplication in linear time while preserving
 * insertion order. The input list is not modified.
 *
 * @param hashList list to de-duplicate; {@code null} is treated as an empty list
 * @return a new list of the distinct elements in first-seen order
 */
public static ArrayList<String> removeDup(ArrayList<String> hashList) {
    if (hashList == null) {
        return new ArrayList<String>();
    }
    return new ArrayList<String>(new LinkedHashSet<String>(hashList));
}
/**
 * Merges {@code hashList} into {@code hashtagList_entire}: each string that the
 * collected list does not contain yet is appended, in the order it is encountered.
 *
 * @param hashList hashtag strings to merge into the collected list
 */
public void concati(ArrayList<String> hashList) {
    for (String candidate : hashList) {
        boolean alreadyKnown = this.hashtagList_entire.contains(candidate);
        if (!alreadyKnown) {
            this.hashtagList_entire.add(candidate);
        }
    }
}
/**
 * Returns the numeric code of a character (for ASCII characters, the ASCII value).
 *
 * @param character character to convert
 * @return the character's code as an int
 */
public static int charToASCII(final char character) {
    // char widens to int implicitly; the numeric value is the character code
    int code = character;
    return code;
}
/**
 * Tells whether a character may be part of a hashtag: an ASCII capital letter,
 * an ASCII small letter, or an ASCII digit.
 *
 * @param chr character to test
 * @return {@code true} for A-Z, a-z and 0-9, {@code false} for everything else
 */
public static boolean checkASCII(char chr) {
    boolean capitalLetter = chr >= 'A' && chr <= 'Z';
    boolean smallLetter = chr >= 'a' && chr <= 'z';
    boolean digit = chr >= '0' && chr <= '9';
    return capitalLetter || smallLetter || digit;
}

/**
 * Extracts all hashtags from a string, in order of appearance (duplicates included).
 *
 * A hashtag is a '#' followed by at least one character accepted by
 * {@link #checkASCII(char)}; it ends before the first character that is not accepted
 * or at the end of the string. The returned strings include the leading '#'.
 *
 * Scanning resumes at the character that terminated the previous hashtag, so
 * back-to-back hashtags such as "#a #b" or "#a#b" are all found, and a '#' at the
 * very end of the input is ignored instead of causing an index error.
 *
 * @param input text to scan; {@code null} is treated as empty
 * @return the hashtags found, possibly empty, never {@code null}
 */
public static ArrayList<String> identifyHash(String input) {
    ArrayList<String> hashList = new ArrayList<String>();
    if (input == null) {
        return hashList;
    }
    int length = input.length();
    int i = 0;
    while (i < length) {
        if (input.charAt(i) != '#') {
            i++;
            continue;
        }
        // i points at '#': extend 'end' over the run of valid hashtag characters
        int end = i + 1;
        while (end < length && checkASCII(input.charAt(end))) {
            end++;
        }
        if (end > i + 1) {
            // at least one valid character follows the '#'
            hashList.add(input.substring(i, end));
        }
        // continue at the terminating character; it may itself be a '#'
        i = end;
    }
    return hashList;
}
// end of identifyHash
/**
 * Prints every entry of the given list to standard output, one entry per line.
 *
 * @param hashList strings to print
 */
public static void displayHashes(ArrayList<String> hashList) {
    for (String entry : hashList) {
        System.out.println(entry);
    }
}
// end of displayHashes
/**
 * Sorts the given list in place into ascending alphabetical (natural String) order
 * and returns it.
 *
 * The previous hand-written exchange sort swapped elements when the earlier one was
 * smaller, which produced descending order, contrary to the method's name. It is
 * replaced by {@link Collections#sort(java.util.List)}.
 *
 * @param hashList list to sort; it is modified
 * @return the same list instance, now sorted ascending
 */
public ArrayList<String> orderAlpha(ArrayList<String> hashList) {
    Collections.sort(hashList);
    return hashList;
}
//############################################################################################################################################################
/**
 * Replaces every semicolon in the given string with a comma.
 *
 * The previous implementation cut the string at {@code i - 1}, which deleted the
 * character in front of each semicolon and threw when the semicolon was the first
 * character; {@link String#replace(char, char)} does the substitution correctly.
 *
 * @param str text to process; may be {@code null}
 * @return the text with all ';' turned into ',', or {@code null} if {@code str} is null
 */
public static String replaceAllSemi(String str) {
    if (str == null) {
        return null;
    }
    return str.replace(';', ',');
}
// end of replaceAllSemi
//################################################################################################
//
// public static String identifyOriginalAuthor(String tuple){
//
// }// end of identifyOriginalAuthor
//
//################################################################################################
/**
 * Reads the semicolon-separated tweet file at {@code xfileLocation} and fills this
 * instance's lists.
 *
 * Per record the following fields (split on ';') are used: 0 = handle, 1 = tweet
 * text, 2 = retweet flag ("False"/"True"), 3 = original author, 4 = tweet time,
 * 7 = retweet count, 8 = favorite count. A record is complete when its line ends with
 * ";False" or ";True"; lines that do not are treated as the beginning of a tweet text
 * that continues on the next line and are joined with a single space.
 *
 * Processing:
 * <ul>
 *   <li>the first line (header) is skipped, blank lines are ignored;</li>
 *   <li>semicolons inside the tweet text are replaced by commas while the rest of the
 *       record is kept, so the split yields the expected fields;</li>
 *   <li>for each record a {@code Tweet} with a running ID is appended to
 *       {@code tweetList}, the handle to {@code userList}, and the hashtags of the
 *       text are merged into {@code hashtagList_entire};</li>
 *   <li>after the whole file has been read (not per line), {@code HashtagList} and
 *       {@code tweetUserList} are rebuilt from the de-duplicated string lists, the
 *       hashtags sorted alphabetically, with IDs equal to the list index.</li>
 * </ul>
 *
 * Records that cannot be parsed are reported on {@code System.err} and skipped. The
 * text is stored unescaped; SQL escaping belongs at the database boundary
 * (e.g. {@code PreparedStatement}), not in the parser.
 *
 * @param xfileLocation path of the file to read
 * @throws FileNotFoundException if the file cannot be opened
 */
public void readCSV(String xfileLocation) throws FileNotFoundException {
    Scanner scanIn = new Scanner(new BufferedReader(new FileReader(xfileLocation)));
    try {
        if (scanIn.hasNextLine()) {
            scanIn.nextLine(); // skip the whole header line
        }

        String pending = null;               // start of a record whose text spans several lines
        int counter = this.tweetList.size(); // tweet IDs continue after tweets already loaded

        while (scanIn.hasNextLine()) {
            String inputLine = scanIn.nextLine();
            String trimmed = inputLine.trim();
            if (trimmed.isEmpty() || trimmed.equals("\\n")) {
                continue;
            }

            if (pending != null) {
                // previous line(s) held the beginning of this record's tweet text
                inputLine = pending.concat(" " + inputLine);
                pending = null;
            }

            if (!inputLine.endsWith(";False") && !inputLine.endsWith(";True")) {
                // record is not complete yet: remember it and read on
                pending = inputLine;
                continue;
            }

            String finaleTuple = replaceSemicolonsInTweetText(inputLine);
            String[] values = (finaleTuple == null) ? new String[0] : finaleTuple.split(";");
            if (values.length < 9) {
                System.err.println("readCSV: skipping malformed record: " + inputLine);
                continue;
            }

            int retweet_count;
            int favorite_count;
            try {
                retweet_count = Integer.parseInt(values[7]);
                favorite_count = Integer.parseInt(values[8]);
            } catch (NumberFormatException e) {
                System.err.println("readCSV: skipping record with non-numeric counts: " + inputLine);
                continue;
            }

            counter++; // represents the tweet ID
            // a tweet that is no retweet is attributed to the handle itself
            String originalAuthor = values[2].equals("False") ? values[0] : values[3];
            String tweet_text = values[1];
            String tweet_time = values[4];

            this.tweetList.add(new Tweet(originalAuthor, tweet_text, tweet_time,
                    counter, retweet_count, favorite_count));
            this.userList.add(values[0]);

            concati(removeDup(identifyHash(tweet_text)));
        }

        if (pending != null) {
            System.err.println("readCSV: dropping unterminated record at end of file: " + pending);
        }
    } finally {
        scanIn.close();
    }

    // ---- build Hashtag entities once, after all lines were read ----
    this.hashtagList_entire = removeDup(this.hashtagList_entire);
    Collections.sort(this.hashtagList_entire);
    this.HashtagList.clear(); // derived list: rebuild instead of appending duplicates
    for (int i = 0; i < this.hashtagList_entire.size(); i++) {
        this.HashtagList.add(new Hashtag(i, this.hashtagList_entire.get(i)));
    }

    // ---- build User entities once, after all lines were read ----
    this.userList = removeDup(this.userList);
    this.tweetUserList.clear();
    for (int i = 0; i < this.userList.size(); i++) {
        this.tweetUserList.add(new User(i, this.userList.get(i)));
    }
}

/**
 * Returns {@code line} with every ';' inside the tweet text replaced by ','.
 *
 * The tweet text is located as in the original heuristic: it starts after the first
 * ';' and ends at the retweet flag (";False;" or ";True;") that precedes the time
 * field, which is recognised by the marker ";2016". Returns {@code null} when these
 * boundaries cannot be found.
 */
private static String replaceSemicolonsInTweetText(String line) {
    int textStart = line.indexOf(';') + 1;
    if (textStart == 0) {
        return null; // no field separator at all
    }
    int timeStart = line.indexOf(";2016", textStart);
    if (timeStart < 0) {
        return null;
    }
    // "<text>;<flag>;<original author>"; the flag closest to the time field is the real one
    String textAndFlag = line.substring(textStart, timeStart);
    int flagStart = Math.max(textAndFlag.lastIndexOf(";False;"), textAndFlag.lastIndexOf(";True;"));
    if (flagStart < 0) {
        return null;
    }
    String cleanText = textAndFlag.substring(0, flagStart).replace(';', ',');
    return line.substring(0, textStart) + cleanText + line.substring(textStart + flagStart);
}
//end of readCSV
//################################################################################################################################################################
//-----MAIN-----
//################################################################################################################################################################
/**
 * Reads the tweet CSV, prints the number of distinct users, then creates the
 * {@code tweet} table in PostgreSQL and inserts every parsed tweet.
 *
 * Changes compared with the previous version:
 * <ul>
 *   <li>the CSV path can be passed as the first command-line argument (the former
 *       hard-coded path remains the default);</li>
 *   <li>the table is created before the insert statement is prepared;</li>
 *   <li>{@code tweet_time} is bound as a parameter and cast to {@code timestamp} by
 *       the server, replacing the extra string-concatenated UPDATE per row;</li>
 *   <li>statements and the connection are closed in a {@code finally} block;</li>
 *   <li>a failed connection exits with a non-zero status.</li>
 * </ul>
 *
 * NOTE(review): the database credentials are hard-coded; move them to configuration.
 * TODO: export of hashtags and users is not implemented yet; when adding it, use
 * PreparedStatement like the tweet insert below.
 *
 * @param args optional: {@code args[0]} = path of the CSV file
 * @throws IOException if the CSV file cannot be opened
 */
public static void main(String[] args) throws IOException {
    String xfileLocation2 = (args != null && args.length > 0)
            ? args[0]
            : "/home/ubuntumac/Dropbox/TI2/DBS/M.csv";

    Clean database = new Clean();
    database.readCSV(xfileLocation2);

    ArrayList<Tweet> tweetList = database.tweetList;
    ArrayList<User> userList = database.tweetUserList;
    System.out.println(userList.size());

    String dbServer = "localhost";
    String dbPort = "5432";
    String dbName = "Election";
    String dbUser = "postgres";
    String password = "mama";

    // Connection to postgres
    Connection conn = null;
    try {
        conn = DriverManager.getConnection(
                "jdbc:postgresql://" + dbServer + ":" + dbPort + "/" + dbName, dbUser, password);
        System.out.println("Connection to database " + dbName + "@" + dbServer + ":" + dbPort
                + " successfully established.");
    } catch (SQLException sqle) {
        System.out.println("The connection could not be established.");
        sqle.printStackTrace();
        System.exit(1); // failure: report a non-zero status
    }

    Statement createStmt = null;
    PreparedStatement insertStmt = null;
    try {
        // ------------ CREATE the table: tweet ------------
        String creat_tweet = "CREATE TABLE tweet (id serial NOT NULL,"
                + "retweet_count int NOT NULL, "
                + "favorite_count int NOT NULL, "
                + "tweet_text text NOT NULL,"
                + "tweet_time timestamp,"
                + "original_author text NOT NULL,"
                + "PRIMARY KEY (id),"
                + "CONSTRAINT vorgaenger_ID FOREIGN KEY(id) REFERENCES tweet(id))";
        createStmt = conn.createStatement();
        createStmt.executeUpdate(creat_tweet);

        // ------------ Transfer the table: tweet ------------
        // tweet_time is sent as text and cast by the server, so the server parses it
        // exactly as it parsed the quoted literal of the former UPDATE statement.
        String Tweet_SQLNew = "INSERT INTO tweet(id ,retweet_count, favorite_count, tweet_text, tweet_time, original_author )"
                + "VALUES(?,?,?,?,CAST(? AS timestamp),?)";
        insertStmt = conn.prepareStatement(Tweet_SQLNew);

        for (Tweet tmpTweet : tweetList) {
            insertStmt.setInt(1, tmpTweet.get_tweet_ID());
            insertStmt.setInt(2, tmpTweet.get_retweet_count());
            insertStmt.setInt(3, tmpTweet.get_favorite_count());
            insertStmt.setString(4, tmpTweet.get_tweet_text());
            insertStmt.setString(5, tmpTweet.get_tweet_time());
            insertStmt.setString(6, tmpTweet.get_original_author());
            insertStmt.executeUpdate();
        }
    } catch (SQLException e) {
        e.printStackTrace();
    } finally {
        closeQuietly(insertStmt);
        closeQuietly(createStmt);
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}

/** Closes a statement if it was created; a failure to close is only reported. */
private static void closeQuietly(Statement statement) {
    if (statement == null) {
        return;
    }
    try {
        statement.close();
    } catch (SQLException e) {
        e.printStackTrace();
    }
}
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment