From dc2ff2f4ba554d376b48d93888e05bf96145bf03 Mon Sep 17 00:00:00 2001
From: Eike Cochu <eike@cochu.com>
Date: Tue, 7 Jun 2016 09:15:32 +0200
Subject: [PATCH] added bbcreader to transform bbc dataset

---
 BBCReader/.classpath                          |  15 ++
 BBCReader/.gitignore                          |   1 +
 BBCReader/.project                            |  23 +++
 .../.settings/org.eclipse.jdt.core.prefs      |   5 +
 .../.settings/org.eclipse.m2e.core.prefs      |   4 +
 BBCReader/pom.xml                             |  36 ++++
 BBCReader/src/de/vipra/bbc/BBCArticle.java    |  46 +++++
 .../src/de/vipra/bbc/FileTransformer.java     | 193 ++++++++++++++++++
 8 files changed, 323 insertions(+)
 create mode 100644 BBCReader/.classpath
 create mode 100644 BBCReader/.gitignore
 create mode 100644 BBCReader/.project
 create mode 100644 BBCReader/.settings/org.eclipse.jdt.core.prefs
 create mode 100644 BBCReader/.settings/org.eclipse.m2e.core.prefs
 create mode 100644 BBCReader/pom.xml
 create mode 100644 BBCReader/src/de/vipra/bbc/BBCArticle.java
 create mode 100644 BBCReader/src/de/vipra/bbc/FileTransformer.java

diff --git a/BBCReader/.classpath b/BBCReader/.classpath
new file mode 100644
index 00000000..83e44047
--- /dev/null
+++ b/BBCReader/.classpath
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="output" path="target/classes"/>
+</classpath>
diff --git a/BBCReader/.gitignore b/BBCReader/.gitignore
new file mode 100644
index 00000000..b83d2226
--- /dev/null
+++ b/BBCReader/.gitignore
@@ -0,0 +1 @@
+/target/
diff --git a/BBCReader/.project b/BBCReader/.project
new file mode 100644
index 00000000..cfbff769
--- /dev/null
+++ b/BBCReader/.project
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>BBCReader</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.m2e.core.maven2Builder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.m2e.core.maven2Nature</nature>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
diff --git a/BBCReader/.settings/org.eclipse.jdt.core.prefs b/BBCReader/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 00000000..d59e09c9
--- /dev/null
+++ b/BBCReader/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,5 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
+org.eclipse.jdt.core.compiler.compliance=1.8
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.source=1.8
diff --git a/BBCReader/.settings/org.eclipse.m2e.core.prefs b/BBCReader/.settings/org.eclipse.m2e.core.prefs
new file mode 100644
index 00000000..14b697b7
--- /dev/null
+++ b/BBCReader/.settings/org.eclipse.m2e.core.prefs
@@ -0,0 +1,4 @@
+activeProfiles=
+eclipse.preferences.version=1
+resolveWorkspaceProjects=true
+version=1
diff --git a/BBCReader/pom.xml b/BBCReader/pom.xml
new file mode 100644
index 00000000..23a9dcb3
--- /dev/null
+++ b/BBCReader/pom.xml
@@ -0,0 +1,36 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+	<groupId>BBCReader</groupId>
+	<artifactId>BBCReader</artifactId>
+	<version>0.0.1-SNAPSHOT</version>
+	<properties>
+		<maven.compiler.source>1.8</maven.compiler.source>
+		<maven.compiler.target>1.8</maven.compiler.target>
+	</properties>
+	<build>
+		<sourceDirectory>src</sourceDirectory>
+	</build>
+	<dependencies>
+		<dependency>
+			<groupId>commons-io</groupId>
+			<artifactId>commons-io</artifactId>
+			<version>2.5</version>
+		</dependency>
+		<dependency>
+			<groupId>com.fasterxml.jackson.core</groupId>
+			<artifactId>jackson-databind</artifactId>
+			<version>2.7.0</version>
+		</dependency>
+		<dependency>
+			<groupId>com.fasterxml.jackson.core</groupId>
+			<artifactId>jackson-annotations</artifactId>
+			<version>2.7.0</version>
+		</dependency>
+		<dependency>
+			<groupId>org.jsoup</groupId>
+			<artifactId>jsoup</artifactId>
+			<version>1.9.2</version>
+		</dependency>
+	</dependencies>
+</project>
\ No newline at end of file
diff --git a/BBCReader/src/de/vipra/bbc/BBCArticle.java b/BBCReader/src/de/vipra/bbc/BBCArticle.java
new file mode 100644
index 00000000..ee972775
--- /dev/null
+++ b/BBCReader/src/de/vipra/bbc/BBCArticle.java
@@ -0,0 +1,46 @@
+package de.vipra.bbc;
+
+/**
+ * Bean for one BBC article: title and text from the dataset file, URL and date
+ * (a formatted timestamp string) from the BBC search.
+ */
+public class BBCArticle {
+
+	private String title;
+	private String text;
+	private String url;
+	private String date;
+
+	public String getTitle() {
+		return title;
+	}
+
+	public void setTitle(String title) {
+		this.title = title;
+	}
+
+	public String getText() {
+		return text;
+	}
+
+	public void setText(String text) {
+		this.text = text;
+	}
+
+	public String getUrl() {
+		return url;
+	}
+
+	public void setUrl(String url) {
+		this.url = url;
+	}
+
+	public String getDate() {
+		return date;
+	}
+
+	public void setDate(String date) {
+		this.date = date;
+	}
+
+}
diff --git a/BBCReader/src/de/vipra/bbc/FileTransformer.java b/BBCReader/src/de/vipra/bbc/FileTransformer.java
new file mode 100644
index 00000000..cff6ba77
--- /dev/null
+++ b/BBCReader/src/de/vipra/bbc/FileTransformer.java
@@ -0,0 +1,193 @@
+package de.vipra.bbc;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URLEncoder;
+import java.nio.charset.Charset;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.GregorianCalendar;
+import java.util.List;
+import java.util.TimeZone;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.FileUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.SerializationFeature;
+
+/**
+ * Converts the BBC dataset (one directory per topic, one text file per article)
+ * into a single JSON file. Title and text are read from each article file; date
+ * and URL are looked up through the BBC news search.
+ */
+public class FileTransformer {
+
+	/** Source directory used when no first program argument is given. */
+	public static final String BBC_SOURCE_PATH = "/home/eike/Downloads/bbc";
+	/** Target directory used when no second program argument is given. */
+	public static final String BBC_TARGET_PATH = "/home/eike/Downloads/bbc-transformed";
+
+	public static final String BBC_SEARCH_URL = "http://www.bbc.co.uk/search?filter=news&q=";
+
+	/** Group 1 is the title (text before the first blank line), group 2 the rest. */
+	public static final Pattern TITLE_TEXT_REGEX = Pattern.compile("(.+?)\\n\\n(.+)$", Pattern.DOTALL);
+
+	private static final TimeZone UTC = TimeZone.getTimeZone("UTC");
+
+	// The patterns quote 'Z', so it is matched/emitted as a literal and carries no
+	// zone information; both formats are therefore pinned to UTC explicitly.
+	public static final SimpleDateFormat sdfBBC = utcFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
+	public static final SimpleDateFormat sdfVIPRA = utcFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
+
+	/**
+	 * Reads every article below the source directory and writes them as
+	 * {@code bbc.json} into the target directory, which is deleted and recreated.
+	 *
+	 * @param args optional: {@code args[0]} source directory, {@code args[1]}
+	 *             target directory; the path constants of this class are the defaults
+	 * @throws Exception if the source directory is missing, a directory cannot be
+	 *                   listed or created, or the JSON file cannot be written
+	 */
+	public static void main(String[] args) throws Exception {
+		final File sourceDir = new File(args.length > 0 ? args[0] : BBC_SOURCE_PATH);
+		if (!sourceDir.exists() || !sourceDir.isDirectory())
+			throw new Exception("Source directory not found or not a directory");
+
+		final File bbcTarget = new File(args.length > 1 ? args[1] : BBC_TARGET_PATH);
+		if (bbcTarget.exists())
+			FileUtils.deleteDirectory(bbcTarget);
+		if (!bbcTarget.mkdirs() && !bbcTarget.isDirectory())
+			throw new IOException("Could not create target directory " + bbcTarget.getAbsolutePath());
+
+		final List<BBCArticle> articles = new ArrayList<>();
+
+		for (final File topicDir : listSorted(sourceDir)) {
+			if (!topicDir.isDirectory())
+				continue;
+			final String topic = topicDir.getName();
+			int i = 1;
+			for (final File file : listSorted(topicDir)) {
+				System.out.println(file.getAbsolutePath());
+				try {
+					articles.add(getArticle(topic, file, i++));
+				} catch (Exception e) {
+					// One unreadable or unresolvable article must not abort the whole run.
+					System.err.println(e.getMessage() + " (" + file.getAbsolutePath() + ")");
+				}
+			}
+		}
+
+		ObjectMapper mapper = new ObjectMapper();
+		mapper.enable(SerializationFeature.INDENT_OUTPUT);
+		mapper.writeValue(new File(bbcTarget, "bbc.json"), articles);
+	}
+
+	/**
+	 * Builds an article from one dataset file and completes it with
+	 * {@link #getBBCData(BBCArticle)}.
+	 *
+	 * @param topic  name of the topic directory (currently not stored)
+	 * @param file   article file: title, blank line, text
+	 * @param number running number of the file within its topic (currently unused)
+	 * @return the article with title and text set, plus date and URL if found
+	 * @throws Exception if the file cannot be read, has no title/text structure,
+	 *                   or the search lookup fails
+	 */
+	public static final BBCArticle getArticle(final String topic, final File file, final int number) throws Exception {
+		final BBCArticle article = new BBCArticle();
+		final String content = FileUtils.readFileToString(file, Charset.defaultCharset());
+		final Matcher matcher = TITLE_TEXT_REGEX.matcher(content);
+		if (!matcher.find())
+			throw new Exception("Invalid article structure");
+
+		article.setTitle(matcher.group(1));
+		article.setText(matcher.group(2));
+
+		getBBCData(article);
+
+		return article;
+	}
+
+	/**
+	 * Searches BBC news for the article's title and copies date and URL of the
+	 * first result from 2004 or 2005 whose title matches (ignoring case and
+	 * surrounding whitespace). Without such a result the article is left
+	 * unchanged and a note is printed to stderr.
+	 *
+	 * @param article article whose title is set; receives date and URL
+	 * @throws Exception if the request fails or the search returns no results
+	 */
+	public static final void getBBCData(final BBCArticle article) throws Exception {
+		final String wanted = article.getTitle().trim();
+		final String src = BBC_SEARCH_URL + URLEncoder.encode(article.getTitle(), "UTF-8");
+		final Document doc = Jsoup.connect(src).get();
+		final Elements results = doc.select("article");
+		if (results.size() == 0)
+			throw new Exception("not found: " + article.getTitle() + ", no search results");
+
+		for (final Element el : results) {
+			final Date date = parseDate(el.select(".display-date").attr("datetime"));
+			if (date == null || !isAcceptedYear(date))
+				continue;
+
+			final Elements elTitle = el.select("h1 > a");
+			if (!wanted.equalsIgnoreCase(elTitle.text().trim()))
+				continue;
+
+			// Only a result that matches on year AND title may fill the article;
+			// setting the date earlier would leak it from an unrelated result.
+			article.setDate(sdfVIPRA.format(date));
+			article.setUrl(elTitle.attr("href"));
+			return;
+		}
+
+		System.err.println("no matching search result: " + article.getTitle());
+	}
+
+	/** Creates a date format for the pattern that reads and writes UTC. */
+	private static SimpleDateFormat utcFormat(final String pattern) {
+		final SimpleDateFormat format = new SimpleDateFormat(pattern);
+		format.setTimeZone(UTC);
+		return format;
+	}
+
+	/** Parses a search result timestamp; null if it is missing or malformed. */
+	private static Date parseDate(final String value) {
+		if (value.isEmpty())
+			return null;
+		try {
+			return sdfBBC.parse(value);
+		} catch (ParseException ignored) {
+			// A result without a usable date cannot be matched; skip it, not the article.
+			return null;
+		}
+	}
+
+	/** Tells whether the date falls into 2004 or 2005 (UTC), the only years accepted. */
+	private static boolean isAcceptedYear(final Date date) {
+		final Calendar calendar = new GregorianCalendar(UTC);
+		calendar.setTime(date);
+		final int year = calendar.get(Calendar.YEAR);
+		return year == 2004 || year == 2005;
+	}
+
+	/** Lists a directory in path order; listFiles() guarantees no order and may return null. */
+	private static File[] listSorted(final File dir) throws IOException {
+		final File[] entries = dir.listFiles();
+		if (entries == null)
+			throw new IOException("Cannot list directory " + dir.getAbsolutePath());
+		Arrays.sort(entries);
+		return entries;
+	}
+
+}
-- 
GitLab