import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Scanner;
import java.util.regex.Matcher;

import java.sql.*;

import javax.naming.spi.DirStateFactory.Result;

/** Tweet entity of the ER model: one row of the {@code tweet} table. */
class Tweet {
    private final String original_author;
    private final String tweet_text;
    private final String tweet_time;
    private final int tweet_ID;
    private final int retweet_count;
    private final int favorite_count;

    public Tweet(String original_author,
                 String tweet_text,
                 String tweet_time,
                 int tweet_ID,
                 int retweet_count,
                 int favorite_count) {
        this.original_author = original_author;
        this.tweet_text = tweet_text;
        this.tweet_time = tweet_time;
        this.tweet_ID = tweet_ID;
        this.retweet_count = retweet_count;
        this.favorite_count = favorite_count;
    }

    public String get_original_author() {
        return this.original_author;
    }

    public String get_tweet_text() {
        return this.tweet_text;
    }

    public String get_tweet_time() {
        return this.tweet_time;
    }

    public int get_tweet_ID() {
        return this.tweet_ID;
    }

    public int get_retweet_count() {
        return this.retweet_count;
    }

    public int get_favorite_count() {
        return this.favorite_count;
    }
}

/** Hashtag entity of the ER model: a numeric ID plus the hashtag text (including the leading '#'). */
class Hashtag {
    private final int hashtag_ID;
    private final String hashtag_text;

    public Hashtag(int hashtag_ID, String hashtag_text) {
        this.hashtag_ID = hashtag_ID;
        this.hashtag_text = hashtag_text;
    }

    public int get_hashtag_ID() {
        return this.hashtag_ID;
    }

    public String get_hashtag_text() {
        return this.hashtag_text;
    }
}

/** User entity of the ER model: a numeric ID plus the account handle. */
class User {
    private final String handle;
    private final int user_id;

    public User(int user_id, String handle) {
        this.user_id = user_id;
        this.handle = handle;
    }

    public String get_handle() {
        return this.handle;
    }

    public int get_user_id() {
        return this.user_id;
    }
}

/** User-tweet relation of the ER model. */
class user2tweet {
    private final int user_id;
    private final int tweet_id;

    public user2tweet(int user_id, int tweet_id) {
        this.user_id = user_id;
        this.tweet_id = tweet_id;
    }

    public int get_user_id() {
        return this.user_id;
    }

    public int get_tweet_id() {
        return this.tweet_id;
    }
}

/** Tweet-hashtag relation of the ER model. */
class tweet2hashtag {
    private final int tweet_id;
    private final int hashtag_id;

    public tweet2hashtag(int tweet_id, int hashtag_id) {
        this.tweet_id = tweet_id;
        this.hashtag_id = hashtag_id;
    }

    public int get_tweet_id() {
        return this.tweet_id;
    }

    public int get_hashtag_id() {
        return this.hashtag_id;
    }
}

/** Relation between an original tweet and its copy (retweet). */
class tweet2Retweet {
    // Kept package-private and non-final, as in the original declaration.
    int tweet_id_Original;
    int tweet_id_Copy;

    public tweet2Retweet(int tweet_id_Original, int tweet_id_Copy) {
        this.tweet_id_Original = tweet_id_Original;
        this.tweet_id_Copy = tweet_id_Copy;
    }

    public int get_tweet_id_Original() {
        return this.tweet_id_Original;
    }

    public int get_tweet_id_Copy() {
        return this.tweet_id_Copy;
    }
}

/**
 * Cleans a semicolon-separated tweet export and loads the tweets into PostgreSQL.
 *
 * <p>{@link #readCSV(String)} fills the public lists below; {@link #main(String[])} then creates the
 * {@code tweet} table and inserts every parsed tweet.
 */
public class Clean {

    /** CSV file imported by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_CSV_LOCATION = "/home/ubuntumac/Dropbox/TI2/DBS/M.csv";

    /** All tweets parsed so far, in file order. */
    public ArrayList<Tweet> tweetList = new ArrayList<Tweet>();
    /** One {@link Hashtag} per distinct hashtag, sorted by text; rebuilt by every {@link #readCSV(String)} call. */
    public ArrayList<Hashtag> HashtagList = new ArrayList<Hashtag>();
    /** One {@link User} per distinct handle; rebuilt by every {@link #readCSV(String)} call. */
    public ArrayList<User> tweetUserList = new ArrayList<User>();
    /** Distinct handles (first CSV column) in order of first appearance. */
    public ArrayList<String> userList = new ArrayList<String>();
    /** Distinct hashtag texts collected from all tweets. */
    public ArrayList<String> hashtagList_entire = new ArrayList<String>();

    public ArrayList<Tweet> get_tweetList() {
        return this.tweetList;
    }

    public ArrayList<Hashtag> get_HashtagList() {
        return this.HashtagList;
    }

    public ArrayList<User> get_tweetUserList() {
        return this.tweetUserList;
    }

    /** Prints "ID text" for every parsed tweet to standard output. */
    public void printTweet() {
        for (Tweet tweet : this.tweetList) {
            System.out.println(tweet.get_tweet_ID() + " " + tweet.get_tweet_text());
        }
    }

    /**
     * Prints every line of the given file after the header line, prefixed with a running
     * zero-based counter. A missing file is reported via a stack trace, as before.
     *
     * @param xfileLocation path of the file to dump
     */
    public static void justRead(String xfileLocation) {
        try (Scanner scanIn = new Scanner(new BufferedReader(new FileReader(xfileLocation)))) {
            // Skip the whole header line; next() would only skip its first whitespace-delimited
            // token and would throw on an empty file.
            if (scanIn.hasNextLine()) {
                scanIn.nextLine();
            }
            int counter = 0;
            while (scanIn.hasNextLine()) {
                System.out.println(counter + " " + scanIn.nextLine());
                counter++;
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns a new list holding the distinct elements of {@code hashList} in order of first
     * appearance. The argument is not modified.
     *
     * @param hashList list that may contain duplicates
     * @return a fresh list without duplicates
     */
    public static ArrayList<String> removeDup(ArrayList<String> hashList) {
        // LinkedHashSet keeps insertion order, so the result matches the contains()-based scan.
        return new ArrayList<String>(new LinkedHashSet<String>(hashList));
    }

    /**
     * Appends every element of {@code hashList} that is not yet known to
     * {@link #hashtagList_entire}.
     *
     * @param hashList hashtags of a single tweet
     */
    public void concati(ArrayList<String> hashList) {
        for (String hash : hashList) {
            if (!this.hashtagList_entire.contains(hash)) {
                this.hashtagList_entire.add(hash);
            }
        }
    }

    /** Returns the numeric code of {@code character}. */
    public static int charToASCII(final char character) {
        return (int) character;
    }

    /**
     * Tells whether {@code chr} may appear inside a hashtag.
     *
     * @return {@code true} for ASCII letters and digits, {@code false} for everything else
     */
    public static boolean checkASCII(char chr) {
        return (chr >= 'A' && chr <= 'Z')
                || (chr >= 'a' && chr <= 'z')
                || (chr >= '0' && chr <= '9');
    }

    /**
     * Extracts all hashtags from {@code input}. A hashtag is a '#' followed by at least one ASCII
     * letter or digit (see {@link #checkASCII(char)}); it ends before the first other character.
     *
     * @param input text to scan; {@code null} yields an empty list
     * @return the hashtags (including the leading '#') in order of appearance, duplicates included
     */
    public static ArrayList<String> identifyHash(String input) {
        ArrayList<String> hashList = new ArrayList<String>();
        if (input == null) {
            return hashList;
        }
        for (int i = 0; i < input.length(); i++) {
            if (input.charAt(i) != '#') {
                continue;
            }
            // Bounds are checked before every charAt, so a trailing '#' is simply ignored.
            int end = i + 1;
            while (end < input.length() && checkASCII(input.charAt(end))) {
                end++;
            }
            if (end > i + 1) {
                hashList.add(input.substring(i, end));
                // Resume at the terminating character itself: it may be the '#' of the next tag.
                i = end - 1;
            }
        }
        return hashList;
    }

    /** Prints every element of {@code hashList} on its own line. */
    public static void displayHashes(ArrayList<String> hashList) {
        for (String hash : hashList) {
            System.out.println(hash);
        }
    }

    /**
     * Sorts {@code hashList} in place in DESCENDING natural order (largest string first) and
     * returns the same list.
     *
     * <p>NOTE(review): the name suggests ascending alphabetical order, but the original
     * comparison always produced descending order; that behaviour is preserved here. Confirm
     * the intended direction before relying on it.
     *
     * @param hashList list to sort in place
     * @return {@code hashList}, now sorted
     */
    public ArrayList<String> orderAlpha(ArrayList<String> hashList) {
        Collections.sort(hashList, Collections.<String>reverseOrder());
        return hashList;
    }

    /**
     * Replaces every ';' in {@code str} with ','. All other characters are kept, including the
     * one directly in front of a semicolon.
     *
     * @param str text that may contain semicolons
     * @return the text with semicolons replaced by commas
     */
    public static String replaceAllSemi(String str) {
        return str.replace(';', ',');
    }

    /**
     * Reads the tweet CSV at {@code xfileLocation} and fills {@link #tweetList},
     * {@link #userList}, {@link #hashtagList_entire}, {@link #HashtagList} and
     * {@link #tweetUserList}.
     *
     * <p>Format, as far as this parser relies on it: the first line is a header; fields are
     * separated by ';'; a record is complete when it ends with ";False" or ";True", and a tweet
     * text containing line breaks is spread over several physical lines that are re-joined with
     * a single space. Field 0 is the handle, 1 the text, 2 the retweet flag, 3 the original
     * author, 4 the time (expected to start with "2016"), 7 the retweet count and 8 the
     * favorite count.
     *
     * <p>Semicolons inside the tweet text are replaced by commas. Records that cannot be parsed
     * are reported on standard error and skipped. Tweet IDs are assigned consecutively starting
     * at 1.
     *
     * @param xfileLocation path of the CSV file
     * @throws FileNotFoundException if the file cannot be opened
     */
    public void readCSV(String xfileLocation) throws FileNotFoundException {
        try (Scanner scanIn = new Scanner(new BufferedReader(new FileReader(xfileLocation)))) {
            if (scanIn.hasNextLine()) {
                scanIn.nextLine(); // header
            }

            String pending = null; // start of a record whose text continues on the next line
            while (scanIn.hasNextLine()) {
                String line = scanIn.nextLine();
                String trimmed = line.trim();
                if (trimmed.isEmpty() || trimmed.equals("\\n")) {
                    continue;
                }
                if (pending != null) {
                    line = pending + " " + line;
                    pending = null;
                }
                if (!(line.endsWith(";False") || line.endsWith(";True"))) {
                    pending = line;
                    continue;
                }
                if (!addRecord(line)) {
                    System.err.println("Skipping malformed record: " + line);
                }
            }
            if (pending != null) {
                System.err.println("Discarding incomplete trailing record: " + pending);
            }
        }

        // Derived entities are built once, after the whole file has been read.
        this.hashtagList_entire = removeDup(this.hashtagList_entire);
        Collections.sort(this.hashtagList_entire);
        this.HashtagList.clear();
        for (int i = 0; i < this.hashtagList_entire.size(); i++) {
            this.HashtagList.add(new Hashtag(i, this.hashtagList_entire.get(i)));
        }

        this.userList = removeDup(this.userList);
        this.tweetUserList.clear();
        for (int i = 0; i < this.userList.size(); i++) {
            this.tweetUserList.add(new User(i, this.userList.get(i)));
        }
    }

    /**
     * Parses one complete record and stores the tweet, its handle and its hashtags.
     *
     * @return {@code false} if the record does not have the expected layout; nothing is stored then
     */
    private boolean addRecord(String record) {
        // The tweet text starts after the first ';' and is followed by the retweet flag and the
        // original author; that group ends where the 2016 timestamp begins.
        int textStart = record.indexOf(';') + 1;
        int textBlockEnd = record.indexOf(";2016");
        if (textStart == 0 || textBlockEnd < textStart) {
            return false;
        }
        String textBlock = record.substring(textStart, textBlockEnd);
        int textLength = textBlock.indexOf(";False;");
        if (textLength < 0) {
            textLength = textBlock.indexOf(";True;");
        }
        if (textLength < 0) {
            return false;
        }

        // Neutralise semicolons inside the text only, then put the record back together so the
        // remaining fields can be split safely. Quotes are intentionally left untouched: the
        // text is written through a parameterized statement, so doubling them would corrupt it.
        String cleanText = replaceAllSemi(textBlock.substring(0, textLength));
        String normalized = record.substring(0, textStart)
                + cleanText
                + record.substring(textStart + textLength);

        String[] values = normalized.split(";");
        if (values.length < 9) {
            return false;
        }
        int retweetCount;
        int favoriteCount;
        try {
            retweetCount = Integer.parseInt(values[7]);
            favoriteCount = Integer.parseInt(values[8]);
        } catch (NumberFormatException e) {
            return false;
        }

        // For an ordinary tweet the author is the handle; for a retweet it is the dedicated field.
        String originalAuthor = values[2].equals("False") ? values[0] : values[3];
        String tweetText = values[1];
        int id = this.tweetList.size() + 1;

        this.tweetList.add(new Tweet(originalAuthor, tweetText, values[4], id, retweetCount, favoriteCount));
        this.userList.add(values[0]);
        concati(removeDup(identifyHash(tweetText)));
        return true;
    }

    /**
     * Imports the CSV file and writes all tweets into the {@code tweet} table of the local
     * PostgreSQL database "Election".
     *
     * <p>Only the tweet table is exported; hashtags and users are parsed but not yet written
     * (TODO: tables {@code hashtags} and {@code tweeter_user}).
     *
     * @param args optional: {@code args[0]} overrides the default CSV location
     * @throws IOException if the CSV file cannot be opened
     */
    public static void main(String[] args) throws IOException {
        String csvLocation = (args != null && args.length > 0) ? args[0] : DEFAULT_CSV_LOCATION;

        Clean database = new Clean();
        database.readCSV(csvLocation);

        ArrayList<Tweet> tweetList = database.tweetList;
        System.out.println(database.tweetUserList.size());

        String dbServer = "localhost";
        String dbPort = "5432";
        String dbName = "Election";
        String dbUser = "postgres";
        // NOTE(review): credentials are hard-coded; consider reading them from the environment.
        String password = "mama";

        Connection conn;
        try {
            conn = DriverManager.getConnection("jdbc:postgresql://" + dbServer + ":" + dbPort + "/" + dbName, dbUser, password);
            System.out.println("Connection to database " + dbName + "@" + dbServer + ":" + dbPort + " successfully established.");
        } catch (SQLException sqle) {
            System.out.println("The connection could not be established.");
            sqle.printStackTrace();
            System.exit(1); // non-zero: the import did not happen
            return;
        }

        String createTweet = "CREATE TABLE tweet (id serial NOT NULL,"
                + "retweet_count int NOT NULL, "
                + "favorite_count int NOT NULL, "
                + "tweet_text text NOT NULL,"
                + "tweet_time timestamp,"
                + "original_author text NOT NULL,"
                + "PRIMARY KEY (id),"
                + "CONSTRAINT vorgaenger_ID FOREIGN KEY(id) REFERENCES tweet(id))";

        // The time is passed as text and cast by the server, so no value is ever concatenated
        // into the SQL string and no separate UPDATE per row is needed.
        String insertTweetSql = "INSERT INTO tweet(id ,retweet_count, favorite_count, tweet_text, tweet_time, original_author )"
                + "VALUES(?,?,?,?,CAST(? AS timestamp),?)";

        try (Connection connection = conn) {
            try (Statement createStatement = connection.createStatement()) {
                createStatement.executeUpdate(createTweet);
            }
            try (PreparedStatement insertTweet = connection.prepareStatement(insertTweetSql)) {
                for (Tweet tweet : tweetList) {
                    insertTweet.setInt(1, tweet.get_tweet_ID());
                    insertTweet.setInt(2, tweet.get_retweet_count());
                    insertTweet.setInt(3, tweet.get_favorite_count());
                    insertTweet.setString(4, tweet.get_tweet_text());
                    insertTweet.setString(5, tweet.get_tweet_time());
                    insertTweet.setString(6, tweet.get_original_author());
                    insertTweet.executeUpdate();
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}