From dc2ff2f4ba554d376b48d93888e05bf96145bf03 Mon Sep 17 00:00:00 2001
From: Eike Cochu <eike@cochu.com>
Date: Tue, 7 Jun 2016 09:15:32 +0200
Subject: [PATCH] added bbcreader to transform bbc dataset

---
 BBCReader/.classpath                          |  15 +++
 BBCReader/.gitignore                          |   1 +
 BBCReader/.project                            |  23 ++++
 .../.settings/org.eclipse.jdt.core.prefs      |   5 +
 .../.settings/org.eclipse.m2e.core.prefs      |   4 +
 BBCReader/pom.xml                             |  33 ++++++
 BBCReader/src/de/vipra/bbc/BBCArticle.java    |  42 +++++++
 .../src/de/vipra/bbc/FileTransformer.java     | 109 ++++++++++++++++++
 8 files changed, 232 insertions(+)
 create mode 100644 BBCReader/.classpath
 create mode 100644 BBCReader/.gitignore
 create mode 100644 BBCReader/.project
 create mode 100644 BBCReader/.settings/org.eclipse.jdt.core.prefs
 create mode 100644 BBCReader/.settings/org.eclipse.m2e.core.prefs
 create mode 100644 BBCReader/pom.xml
 create mode 100644 BBCReader/src/de/vipra/bbc/BBCArticle.java
 create mode 100644 BBCReader/src/de/vipra/bbc/FileTransformer.java

diff --git a/BBCReader/.classpath b/BBCReader/.classpath
new file mode 100644
index 00000000..83e44047
--- /dev/null
+++ b/BBCReader/.classpath
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="output" path="target/classes"/>
+</classpath>
diff --git a/BBCReader/.gitignore b/BBCReader/.gitignore
new file mode 100644
index 00000000..b83d2226
--- /dev/null
+++ b/BBCReader/.gitignore
@@ -0,0 +1 @@
+/target/
diff --git a/BBCReader/.project b/BBCReader/.project
new file mode 100644
index 00000000..cfbff769
--- /dev/null
+++ b/BBCReader/.project
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>BBCReader</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.m2e.core.maven2Builder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.m2e.core.maven2Nature</nature>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
diff --git a/BBCReader/.settings/org.eclipse.jdt.core.prefs b/BBCReader/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 00000000..d59e09c9
--- /dev/null
+++ b/BBCReader/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,5 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
+org.eclipse.jdt.core.compiler.compliance=1.8
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.source=1.8
diff --git a/BBCReader/.settings/org.eclipse.m2e.core.prefs b/BBCReader/.settings/org.eclipse.m2e.core.prefs
new file mode 100644
index 00000000..14b697b7
--- /dev/null
+++ b/BBCReader/.settings/org.eclipse.m2e.core.prefs
@@ -0,0 +1,4 @@
+activeProfiles=
+eclipse.preferences.version=1
+resolveWorkspaceProjects=true
+version=1
diff --git a/BBCReader/pom.xml b/BBCReader/pom.xml
new file mode 100644
index 00000000..23a9dcb3
--- /dev/null
+++ b/BBCReader/pom.xml
@@ -0,0 +1,38 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+	<groupId>BBCReader</groupId>
+	<artifactId>BBCReader</artifactId>
+	<version>0.0.1-SNAPSHOT</version>
+	<properties>
+		<maven.compiler.source>1.8</maven.compiler.source>
+		<maven.compiler.target>1.8</maven.compiler.target>
+		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+	</properties>
+	<build>
+		<!-- Sources live directly under src/ (as in .classpath), not in Maven's default src/main/java. -->
+		<sourceDirectory>src</sourceDirectory>
+	</build>
+	<dependencies>
+		<dependency>
+			<groupId>commons-io</groupId>
+			<artifactId>commons-io</artifactId>
+			<version>2.5</version>
+		</dependency>
+		<dependency>
+			<groupId>com.fasterxml.jackson.core</groupId>
+			<artifactId>jackson-databind</artifactId>
+			<version>2.7.0</version>
+		</dependency>
+		<dependency>
+			<groupId>com.fasterxml.jackson.core</groupId>
+			<artifactId>jackson-annotations</artifactId>
+			<version>2.7.0</version>
+		</dependency>
+		<dependency>
+			<groupId>org.jsoup</groupId>
+			<artifactId>jsoup</artifactId>
+			<version>1.9.2</version>
+		</dependency>
+	</dependencies>
+</project>
\ No newline at end of file
diff --git a/BBCReader/src/de/vipra/bbc/BBCArticle.java b/BBCReader/src/de/vipra/bbc/BBCArticle.java
new file mode 100644
index 00000000..ee972775
--- /dev/null
+++ b/BBCReader/src/de/vipra/bbc/BBCArticle.java
@@ -0,0 +1,54 @@
+package de.vipra.bbc;
+
+/**
+ * Mutable bean holding the data of a single BBC article: title, text, URL and
+ * date, each stored as a plain string and exposed through a getter/setter pair.
+ */
+public class BBCArticle {
+
+	private String title;
+	private String text;
+	private String url;
+	private String date;
+
+	/** @return the article title, or {@code null} if none was set */
+	public String getTitle() {
+		return this.title;
+	}
+
+	/** @param value the article title */
+	public void setTitle(String value) {
+		title = value;
+	}
+
+	/** @return the article text, or {@code null} if none was set */
+	public String getText() {
+		return this.text;
+	}
+
+	/** @param value the article text */
+	public void setText(String value) {
+		text = value;
+	}
+
+	/** @return the article URL, or {@code null} if none was set */
+	public String getUrl() {
+		return this.url;
+	}
+
+	/** @param value the article URL */
+	public void setUrl(String value) {
+		url = value;
+	}
+
+	/** @return the article date as a string, or {@code null} if none was set */
+	public String getDate() {
+		return this.date;
+	}
+
+	/** @param value the article date as a string */
+	public void setDate(String value) {
+		date = value;
+	}
+
+}
diff --git a/BBCReader/src/de/vipra/bbc/FileTransformer.java b/BBCReader/src/de/vipra/bbc/FileTransformer.java
new file mode 100644
index 00000000..cff6ba77
--- /dev/null
+++ b/BBCReader/src/de/vipra/bbc/FileTransformer.java
@@ -0,0 +1,196 @@
+package de.vipra.bbc;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URLEncoder;
+import java.nio.charset.Charset;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.GregorianCalendar;
+import java.util.List;
+import java.util.TimeZone;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.FileUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.SerializationFeature;
+
+/**
+ * Converts the plain-text BBC dataset into a single JSON file.
+ * <p>
+ * Every sub-directory of the source directory is treated as a topic and every
+ * file in it as one article (title, blank line, text). Each article is looked
+ * up on the BBC search page to obtain its URL and publication date, and all
+ * successfully read articles are written to {@code bbc.json} in the target
+ * directory.
+ * <p>
+ * The shared {@link SimpleDateFormat} constants are not thread-safe; this
+ * class only uses them from the calling thread.
+ */
+public class FileTransformer {
+
+	/** Default source directory, used when no first program argument is given. */
+	public static final String BBC_SOURCE_PATH = "/home/eike/Downloads/bbc";
+	/** Default target directory, used when no second program argument is given. */
+	public static final String BBC_TARGET_PATH = "/home/eike/Downloads/bbc-transformed";
+
+	/** Search URL prefix; the URL-encoded article title is appended as the query. */
+	public static final String BBC_SEARCH_URL = "http://www.bbc.co.uk/search?filter=news&q=";
+
+	/** Splits file content into title (group 1) and text (group 2) at a blank line. */
+	public static final Pattern TITLE_TEXT_REGEX = Pattern.compile("(.+?)\\n\\n(.+)$", Pattern.DOTALL);
+
+	/** Format of the {@code datetime} attribute read from the search results. */
+	public static final SimpleDateFormat sdfBBC = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
+	/** Format of the date stored in {@link BBCArticle#setDate(String)}. */
+	public static final SimpleDateFormat sdfVIPRA = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
+
+	private static final TimeZone UTC = TimeZone.getTimeZone("UTC");
+
+	static {
+		// Both patterns contain a literal 'Z', so they must be interpreted and
+		// rendered in UTC instead of the JVM's default time zone.
+		sdfBBC.setTimeZone(UTC);
+		sdfVIPRA.setTimeZone(UTC);
+	}
+
+	/**
+	 * Reads all articles below the source directory and writes them as JSON.
+	 *
+	 * @param args optional; {@code args[0]} overrides {@link #BBC_SOURCE_PATH} and
+	 *             {@code args[1]} overrides {@link #BBC_TARGET_PATH}. An existing
+	 *             target directory is deleted and recreated.
+	 * @throws Exception if the source directory is missing or cannot be listed,
+	 *                   or the target directory or output file cannot be written
+	 */
+	public static void main(String[] args) throws Exception {
+		final File sourceDir = new File(args.length > 0 ? args[0] : BBC_SOURCE_PATH);
+		if (!sourceDir.isDirectory())
+			throw new IOException("Source directory not found or not a directory");
+
+		final File bbcTarget = new File(args.length > 1 ? args[1] : BBC_TARGET_PATH);
+		if (bbcTarget.exists())
+			FileUtils.deleteDirectory(bbcTarget);
+		// unlike File.mkdirs(), this throws if the directory cannot be created
+		FileUtils.forceMkdir(bbcTarget);
+
+		final List<BBCArticle> articles = new ArrayList<>();
+
+		for (final File topicDir : listSorted(sourceDir, true)) {
+			final String topic = topicDir.getName();
+			int i = 1;
+			for (final File file : listSorted(topicDir, false)) {
+				System.out.println(file.getAbsolutePath());
+				try {
+					articles.add(getArticle(topic, file, i++));
+				} catch (Exception e) {
+					// best effort: report the failed article and carry on with the next
+					final String reason = e.getMessage() != null ? e.getMessage() : e.toString();
+					System.err.println(reason + " (" + file.getAbsolutePath() + ")");
+				}
+			}
+		}
+
+		final ObjectMapper mapper = new ObjectMapper();
+		mapper.enable(SerializationFeature.INDENT_OUTPUT);
+		mapper.writeValue(new File(bbcTarget, "bbc.json"), articles);
+	}
+
+	/**
+	 * Lists the direct children of {@code dir} in sorted order, so that repeated
+	 * runs process the dataset in the same sequence.
+	 *
+	 * @param dir         directory to list
+	 * @param directories {@code true} to return sub-directories, {@code false} to
+	 *                    return regular files
+	 * @return the matching entries, never {@code null}
+	 * @throws IOException if {@code dir} cannot be listed
+	 */
+	private static File[] listSorted(final File dir, final boolean directories) throws IOException {
+		final File[] entries = dir.listFiles((File f) -> directories ? f.isDirectory() : f.isFile());
+		if (entries == null)
+			throw new IOException("Cannot list directory: " + dir.getAbsolutePath());
+		Arrays.sort(entries);
+		return entries;
+	}
+
+	/**
+	 * Reads one article file and completes it with data from the BBC search.
+	 *
+	 * @param topic  name of the topic directory (currently unused)
+	 * @param file   article file consisting of title, blank line and text
+	 * @param number running number of the file within its topic (currently unused)
+	 * @return the populated article
+	 * @throws Exception if the file cannot be read, does not match
+	 *                   {@link #TITLE_TEXT_REGEX}, or the search lookup fails
+	 */
+	public static final BBCArticle getArticle(final String topic, final File file, final int number) throws Exception {
+		// NOTE(review): decodes with the platform charset - confirm the dataset's encoding
+		final String content = FileUtils.readFileToString(file, Charset.defaultCharset());
+		final Matcher matcher = TITLE_TEXT_REGEX.matcher(content);
+		if (!matcher.find())
+			throw new IOException("Invalid article structure");
+
+		final BBCArticle article = new BBCArticle();
+		article.setTitle(matcher.group(1));
+		article.setText(matcher.group(2));
+
+		getBBCData(article);
+
+		return article;
+	}
+
+	/**
+	 * Searches the BBC site for the article's title and, if a result from 2004 or
+	 * 2005 with the same title (ignoring case and surrounding whitespace) is
+	 * found, stores that result's URL and date in the article. If no result
+	 * matches, the article is left unchanged.
+	 *
+	 * @param article article whose title is already set
+	 * @throws Exception if the search request fails or returns no results at all
+	 */
+	public static final void getBBCData(final BBCArticle article) throws Exception {
+		final String wantedTitle = article.getTitle().trim();
+		final String src = BBC_SEARCH_URL + URLEncoder.encode(article.getTitle(), "UTF-8");
+		final Document doc = Jsoup.connect(src).get();
+		final Elements results = doc.select("article");
+		if (results.isEmpty())
+			throw new IOException("not found: " + article.getTitle() + ", no search results");
+
+		for (final Element el : results) {
+			final Date date;
+			try {
+				date = sdfBBC.parse(el.select(".display-date").attr("datetime"));
+			} catch (ParseException e) {
+				// a result without a usable timestamp must not abort the whole lookup
+				continue;
+			}
+			final Calendar c = new GregorianCalendar(UTC);
+			c.setTime(date);
+			final int year = c.get(Calendar.YEAR);
+			if (year != 2004 && year != 2005)
+				continue;
+
+			final Elements elTitle = el.select("h1 > a");
+			if (!wantedTitle.equalsIgnoreCase(elTitle.text().trim()))
+				continue;
+
+			// only record the date once the title matches, so a non-matching
+			// result can never leave its date on the article
+			article.setDate(sdfVIPRA.format(date));
+			article.setUrl(elTitle.attr("href"));
+			return;
+		}
+	}
+
+}
-- 
GitLab