Skip to content
Snippets Groups Projects
Commit dc2ff2f4 authored by Eike Cochu's avatar Eike Cochu
Browse files

added bbcreader to transform bbc dataset

parent a67dad65
No related branches found
No related tags found
No related merge requests found
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="src"/>
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="output" path="target/classes"/>
</classpath>
/target/
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>BBCReader</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
org.eclipse.jdt.core.compiler.compliance=1.8
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.8
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>BBCReader</groupId>
<artifactId>BBCReader</artifactId>
<version>0.0.1-SNAPSHOT</version>
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.7.0</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.7.0</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package de.vipra.bbc;
public class BBCArticle {
private String title;
private String text;
private String url;
private String date;
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getText() {
return text;
}
public void setText(String text) {
this.text = text;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getDate() {
return date;
}
public void setDate(String date) {
this.date = date;
}
}
package de.vipra.bbc;
import java.io.File;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
/**
 * Command line tool that reads the article files below
 * {@link #BBC_SOURCE_PATH} (one sub-directory per topic), splits each file
 * into title and text, looks the title up through the BBC search page to
 * obtain a URL and a publication date, and writes all articles as one
 * indented JSON array to {@code bbc.json} inside {@link #BBC_TARGET_PATH}.
 */
public class FileTransformer {

    public static final String BBC_SOURCE_PATH = "/home/eike/Downloads/bbc";
    public static final String BBC_TARGET_PATH = "/home/eike/Downloads/bbc-transformed";
    public static final String BBC_SEARCH_URL = "http://www.bbc.co.uk/search?filter=news&q=";

    /** Group 1: everything up to the first blank line (title); group 2: the rest (text). */
    public static final Pattern TITLE_TEXT_REGEX = Pattern.compile("(.+?)\\n\\n(.+)$", Pattern.DOTALL);

    // Must be declared before the formatters below: utcFormat() reads it
    // while their static initializers run.
    private static final java.util.TimeZone UTC = java.util.TimeZone.getTimeZone("UTC");

    /**
     * Format of the {@code datetime} attribute read from the search results.
     * The trailing 'Z' is a quoted literal, so the formatter is pinned to UTC
     * explicitly. {@link SimpleDateFormat} is not thread-safe; this tool is
     * single-threaded.
     */
    public static final SimpleDateFormat sdfBBC = utcFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");

    /** Format in which the date is stored on the article (UTC, with milliseconds). */
    public static final SimpleDateFormat sdfVIPRA = utcFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");

    /** Creates a formatter for the given pattern that parses and prints in UTC. */
    private static SimpleDateFormat utcFormat(final String pattern) {
        final SimpleDateFormat format = new SimpleDateFormat(pattern);
        format.setTimeZone(UTC);
        return format;
    }

    /**
     * Transforms every file of every topic directory and writes the result.
     * An existing target directory is deleted first. Files that cannot be
     * transformed are reported on stderr and skipped.
     *
     * @param args ignored
     * @throws Exception if the source directory is missing or unreadable, the
     *                   target directory cannot be (re)created, or the JSON
     *                   file cannot be written
     */
    public static void main(String[] args) throws Exception {
        final File sourceDir = new File(BBC_SOURCE_PATH);
        if (!sourceDir.exists() || !sourceDir.isDirectory())
            throw new Exception("Source directory not found or not a directory");

        final File bbcTarget = new File(BBC_TARGET_PATH);
        if (bbcTarget.exists())
            FileUtils.deleteDirectory(bbcTarget);
        if (!bbcTarget.mkdirs() && !bbcTarget.isDirectory())
            throw new Exception("Could not create target directory (" + bbcTarget.getAbsolutePath() + ")");

        // listFiles returns null (not an empty array) when the directory cannot be read
        final File[] topicDirs = sourceDir.listFiles((d) -> d.isDirectory());
        if (topicDirs == null)
            throw new Exception("Could not list source directory (" + sourceDir.getAbsolutePath() + ")");

        final List<BBCArticle> articles = new ArrayList<>();
        for (final File topicDir : topicDirs) {
            final String topic = topicDir.getName();
            final File[] files = topicDir.listFiles();
            if (files == null) {
                System.err.println("Could not list topic directory (" + topicDir.getAbsolutePath() + ")");
                continue;
            }
            int i = 1;
            for (final File file : files) {
                System.out.println(file.getAbsolutePath());
                try {
                    articles.add(getArticle(topic, file, i++));
                } catch (Exception e) {
                    // best effort: report the file and carry on with the next one
                    System.err.println(e.getMessage() + " (" + file.getAbsolutePath() + ")");
                }
            }
        }

        ObjectMapper mapper = new ObjectMapper();
        mapper.enable(SerializationFeature.INDENT_OUTPUT);
        mapper.writeValue(new File(bbcTarget, "bbc.json"), articles);
    }

    /**
     * Reads one article file, splits it into title and text at the first
     * blank line and completes it with the data found by
     * {@link #getBBCData(BBCArticle)}.
     *
     * @param topic  name of the topic directory (currently unused)
     * @param file   the article file, read with the platform default charset
     * @param number running number of the file within its topic (currently unused)
     * @return the populated article
     * @throws Exception if the file cannot be read, does not consist of a
     *                   title, a blank line and a text, or the lookup fails
     */
    public static final BBCArticle getArticle(final String topic, final File file, final int number) throws Exception {
        final BBCArticle article = new BBCArticle();
        final String content = FileUtils.readFileToString(file, Charset.defaultCharset());
        final Matcher matcher = TITLE_TEXT_REGEX.matcher(content);
        if (!matcher.find())
            throw new Exception("Invalid article structure");
        article.setTitle(matcher.group(1));
        article.setText(matcher.group(2));
        getBBCData(article);
        return article;
    }

    /**
     * Queries the BBC search page for the article's title and copies URL and
     * date of the first result that was published in 2004 or 2005 and whose
     * headline equals the title (ignoring case and surrounding whitespace).
     * When there are results but none of them matches, the article is left
     * unchanged.
     *
     * @param article article with its title set; receives URL and date
     * @throws Exception if the request fails or the search returns no results
     */
    public static final void getBBCData(final BBCArticle article) throws Exception {
        final String src = BBC_SEARCH_URL + URLEncoder.encode(article.getTitle(), "UTF-8");
        final Document doc = Jsoup.connect(src).get();
        final Elements results = doc.select("article");
        if (results.size() == 0)
            throw new Exception("not found: " + article.getTitle() + ", no search results");

        final String wantedTitle = article.getTitle().trim();
        for (final Element el : results) {
            final String strDate = el.select(".display-date").attr("datetime");
            final Date date;
            try {
                date = sdfBBC.parse(strDate);
            } catch (java.text.ParseException e) {
                // a result without a usable timestamp must not discard the
                // whole article; just look at the next result
                continue;
            }

            final Calendar c = new GregorianCalendar(UTC);
            c.setTime(date);
            final int year = c.get(Calendar.YEAR);
            if (year != 2004 && year != 2005)
                continue;

            final Elements elTitle = el.select("h1 > a");
            if (!wantedTitle.equalsIgnoreCase(elTitle.text().trim()))
                continue;

            // only a result that passed both checks may contribute data
            article.setDate(sdfVIPRA.format(date));
            article.setUrl(elTitle.attr("href"));
            return;
        }
    }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment