generated from Nekojimi/JavaMavenTemplate
Made WebScraperSearcher able to parse different result formats (HTML/JSON) and supply query parameters dynamically.
This commit is contained in:
parent
039a91ed40
commit
b8336c26fb
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
* To change this license header, choose License Headers in Project Properties.
|
||||
* To change this template file, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
package moe.nekojimi.musicsearcher.parsers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Collection;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
|
||||
public class HTMLParser extends Parser<Document, Element>
|
||||
{
|
||||
@Override
|
||||
public Document getDocument(InputStream input)
|
||||
{
|
||||
try
|
||||
{
|
||||
Document doc = Jsoup.parse(new String(input.readAllBytes()));
|
||||
// System.out.println(doc.outerHtml());
|
||||
return doc;
|
||||
}
|
||||
catch (IOException ex)
|
||||
{
|
||||
Logger.getLogger(HTMLParser.class.getName()).log(Level.SEVERE, null, ex);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<Element> getResults(Document document, String selector)
|
||||
{
|
||||
return document.select(selector);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getField(Element object, String selector)
|
||||
{
|
||||
Element ele = object.selectFirst(selector);
|
||||
return ele.text();
|
||||
}
|
||||
|
||||
@Override
|
||||
public URL getURLField(Element object, URL baseURL, String selector) throws MalformedURLException
|
||||
{
|
||||
Element ele = object.selectFirst(selector);
|
||||
|
||||
String link;
|
||||
if (ele.hasAttr("href"))
|
||||
link = ele.attr("href");
|
||||
else
|
||||
link = ele.text();
|
||||
|
||||
URL url;
|
||||
if(!link.startsWith("http"))
|
||||
url = new URL(baseURL, link);
|
||||
else
|
||||
url = new URL(link);
|
||||
|
||||
return url;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
/*
|
||||
* To change this license header, choose License Headers in Project Properties.
|
||||
* To change this template file, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
package moe.nekojimi.musicsearcher.parsers;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import javax.json.Json;
|
||||
import javax.json.JsonObject;
|
||||
import javax.json.JsonReader;
|
||||
import javax.json.JsonStructure;
|
||||
import javax.json.JsonValue;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author jim
|
||||
*/
|
||||
public class JSONParser extends Parser<JsonStructure, JsonObject>
|
||||
{
|
||||
|
||||
@Override
|
||||
public JsonStructure getDocument(InputStream input)
|
||||
{
|
||||
JsonReader reader = Json.createReader(input);
|
||||
return reader.read();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<JsonObject> getResults(JsonStructure document, String selector)
|
||||
{
|
||||
JsonValue value = document.getValue(selector);
|
||||
if (value.getValueType() == JsonValue.ValueType.ARRAY)
|
||||
{
|
||||
return value.asJsonArray().getValuesAs(JsonObject.class);
|
||||
}
|
||||
else if (value.getValueType() == JsonValue.ValueType.OBJECT)
|
||||
{
|
||||
return List.of(value.asJsonObject());
|
||||
}
|
||||
return List.of();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getField(JsonObject object, String selector)
|
||||
{
|
||||
return object.getValue(selector).toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public URL getURLField(JsonObject object, URL baseURL, String selector) throws MalformedURLException
|
||||
{
|
||||
String link = getField(object, selector);
|
||||
|
||||
URL url;
|
||||
if(!link.startsWith("http"))
|
||||
url = new URL(baseURL, link);
|
||||
else
|
||||
url = new URL(link);
|
||||
|
||||
return url;
|
||||
}
|
||||
|
||||
private JsonValue navigate(JsonObject from, String selector)
|
||||
{
|
||||
return from.getValue(selector);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
* To change this license header, choose License Headers in Project Properties.
|
||||
* To change this template file, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
package moe.nekojimi.musicsearcher.parsers;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Collection;
|
||||
|
||||
/**
 * Base class for the format-specific parsers that extract search results from a response.
 *
 * @param <D> type of the parsed document
 * @param <O> type of a single result object inside the document
 * @author jim
 */
public abstract class Parser<D,O>
{
    /**
     * Parses the raw response into a document.
     *
     * @param input stream containing the response body
     * @return the parsed document; implementations may return {@code null} on failure
     */
    public abstract D getDocument(InputStream input);

    /**
     * Extracts the result objects from an already parsed document.
     *
     * @param document the parsed document
     * @param selector format-specific selector identifying the results
     * @return the result objects found
     */
    public abstract Collection<O> getResults(D document, String selector);

    /**
     * Parses the stream and extracts the result objects in one step.
     *
     * @param input    stream containing the response body
     * @param selector format-specific selector identifying the results
     * @return the result objects found, or an empty collection if no document could be parsed
     */
    public Collection<O> getResults(InputStream input, String selector)
    {
        D document = getDocument(input);
        if (document == null)
        {
            // A parser that failed to read its input has nothing to select from;
            // report "no results" instead of passing null on to the subclass.
            return java.util.Collections.emptyList();
        }
        return getResults(document, selector);
    }

    /**
     * Reads one field of a result object as text.
     *
     * @param object   the result object
     * @param selector format-specific selector identifying the field
     * @return the field's text
     */
    public abstract String getField(O object, String selector);

    /**
     * Reads one field of a result object as a URL.
     *
     * @param object   the result object
     * @param baseURL  URL used as context when building the result URL
     * @param selector format-specific selector identifying the field
     * @return the URL read from the field
     * @throws MalformedURLException if the field content cannot be turned into a URL
     */
    public abstract URL getURLField(O object, URL baseURL, String selector) throws MalformedURLException;

}
|
|
@ -12,6 +12,8 @@ import java.util.concurrent.ExecutionException;
|
|||
import java.util.concurrent.ForkJoinPool;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import moe.nekojimi.musicsearcher.Query;
|
||||
import moe.nekojimi.musicsearcher.QueryFieldUnsupportedException;
|
||||
import moe.nekojimi.musicsearcher.Result;
|
||||
|
||||
/**
|
||||
|
@ -40,7 +42,7 @@ public abstract class Searcher
|
|||
return name;
|
||||
}
|
||||
|
||||
public List<Result> searchAndWait(String query) throws InterruptedException, ExecutionException
|
||||
public List<Result> searchAndWait(Query query) throws InterruptedException, ExecutionException
|
||||
{
|
||||
try
|
||||
{
|
||||
|
@ -51,19 +53,19 @@ public abstract class Searcher
|
|||
}
|
||||
}
|
||||
|
||||
public List<Result> searchAndWait(String query, long limit, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException
|
||||
public List<Result> searchAndWait(Query query, long limit, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException
|
||||
{
|
||||
return search(query).get(limit, unit);
|
||||
}
|
||||
|
||||
public CompletableFuture<List<Result>> search(String query)
|
||||
public CompletableFuture<List<Result>> search(Query query)
|
||||
{
|
||||
CompletableFuture<List<Result>> future = new CompletableFuture<>();
|
||||
future.completeAsync(()->doSearch(query), executor);
|
||||
return future;
|
||||
}
|
||||
|
||||
protected abstract List<Result> doSearch(String query);
|
||||
protected abstract List<Result> doSearch(Query query) throws QueryFieldUnsupportedException;
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
|
|
@ -6,20 +6,26 @@
|
|||
package moe.nekojimi.musicsearcher.providers;
|
||||
|
||||
import com.amihaiemil.eoyaml.YamlMapping;
|
||||
import com.amihaiemil.eoyaml.YamlNode;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
import java.util.stream.Collectors;
|
||||
import moe.nekojimi.musicsearcher.Query;
|
||||
import moe.nekojimi.musicsearcher.Result;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import moe.nekojimi.musicsearcher.SecretStore;
|
||||
import moe.nekojimi.musicsearcher.parsers.HTMLParser;
|
||||
import moe.nekojimi.musicsearcher.parsers.JSONParser;
|
||||
import moe.nekojimi.musicsearcher.parsers.Parser;
|
||||
import org.apache.http.client.utils.URIBuilder;
|
||||
|
||||
/**
|
||||
*
|
||||
|
@ -27,14 +33,18 @@ import org.jsoup.select.Elements;
|
|||
*/
|
||||
public class WebScraperSearcher extends Searcher
|
||||
{
|
||||
private String searchUrl;
|
||||
private URL rootURL;
|
||||
protected String searchUrl;
|
||||
protected URL rootURL;
|
||||
|
||||
private String resultSelector;
|
||||
private String resultArtistSelector;
|
||||
private String resultTitleSelector;
|
||||
private String resultLinkSelector;
|
||||
private String resultAlbumArtistSelector;
|
||||
protected String resultSelector;
|
||||
protected String resultArtistSelector;
|
||||
protected String resultTitleSelector;
|
||||
protected String resultLinkSelector;
|
||||
protected String resultAlbumArtistSelector;
|
||||
|
||||
protected Map<String,String> searchFields = new HashMap<>();
|
||||
|
||||
protected Parser<?,?> parser;
|
||||
|
||||
public WebScraperSearcher(String name)
|
||||
{
|
||||
|
@ -45,11 +55,12 @@ public class WebScraperSearcher extends Searcher
|
|||
{
|
||||
super(yaml);
|
||||
searchUrl = yaml.string("search_url");
|
||||
rootURL = fillURL("");
|
||||
rootURL = new URL(rootURL.getProtocol(), rootURL.getHost(), "");
|
||||
rootURL = new URL(searchUrl);
|
||||
// rootURL = fillURL(Query.fullText(""));
|
||||
// rootURL = new URL(rootURL.getProtocol(), rootURL.getHost(), "");
|
||||
resultSelector = yaml.string("result_selector");
|
||||
|
||||
YamlMapping fields = yaml.yamlMapping("result_field_selectors");
|
||||
YamlMapping fields = yaml.yamlMapping("result_fields");
|
||||
if (fields != null)
|
||||
{
|
||||
resultArtistSelector = fields.string("artist");
|
||||
|
@ -57,84 +68,93 @@ public class WebScraperSearcher extends Searcher
|
|||
resultLinkSelector = fields.string("link");
|
||||
resultAlbumArtistSelector = fields.string("album_artist");
|
||||
}
|
||||
|
||||
YamlMapping searchFieldMap = yaml.yamlMapping("search_fields");
|
||||
for (YamlNode key: searchFieldMap.keys())
|
||||
searchFields.put(key.asScalar().value(), searchFieldMap.string(key));
|
||||
|
||||
String formatName = yaml.string("format");
|
||||
switch(formatName)
|
||||
{
|
||||
case "html":
|
||||
parser = new HTMLParser(); break;
|
||||
case "json":
|
||||
parser = new JSONParser(); break;
|
||||
default:
|
||||
throw new IllegalArgumentException("Format " + formatName + " is unknown.");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Result> doSearch(String query)
|
||||
protected List<Result> doSearch(Query query)
|
||||
{
|
||||
try
|
||||
{
|
||||
URL url = fillURL(query);
|
||||
Document doc = Jsoup.parse(url, 10000);
|
||||
System.out.println("Document from " + name + ":" + doc.html());
|
||||
Elements resultEles = doc.select(resultSelector);
|
||||
return resultEles.stream()
|
||||
.map((ele)->parseResultElement(ele))
|
||||
.filter((res)->res!=null)
|
||||
.collect(Collectors.toList());
|
||||
InputStream input = url.openStream();
|
||||
return processResults(parser, input);
|
||||
} catch (IOException ex)
|
||||
{
|
||||
Logger.getLogger(WebScraperSearcher.class.getName()).log(Level.SEVERE, null, ex);
|
||||
return List.of();
|
||||
}
|
||||
}
|
||||
|
||||
protected URL fillURL(Query query) throws MalformedURLException
|
||||
{
|
||||
// URL url = new URL(searchUrl.replaceAll("\\$QUERY", URLEncoder.encode(query.getTextSearch(), Charset.forName("utf-8"))));
|
||||
try
|
||||
{
|
||||
URIBuilder builder = new URIBuilder(rootURL.toURI());
|
||||
if (query.getTextSearch() != null)
|
||||
{
|
||||
if (searchFields.containsKey("query"))
|
||||
builder.addParameter(searchFields.get("query"), transformSearchString(query.getTextSearch()));
|
||||
}
|
||||
if (searchFields.containsKey("secret"))
|
||||
{
|
||||
builder.addParameter(searchFields.get("secret"), SecretStore.get().getSecret(name));
|
||||
}
|
||||
return builder.build().toURL();
|
||||
}
|
||||
catch (URISyntaxException ex)
|
||||
{
|
||||
throw new MalformedURLException();
|
||||
}
|
||||
}
|
||||
|
||||
protected URL fillURL(String query) throws MalformedURLException
|
||||
|
||||
protected String transformSearchString(String search)
|
||||
{
|
||||
URL url = new URL(searchUrl.replaceAll("\\$QUERY", URLEncoder.encode(query, Charset.forName("utf-8"))));
|
||||
return url;
|
||||
return search;
|
||||
}
|
||||
|
||||
protected <E> List<Result> processResults(Parser<?,E> parser, InputStream input)
|
||||
{
|
||||
Collection<E> resultEles = parser.getResults(input, resultSelector);
|
||||
return resultEles.stream()
|
||||
.map((ele)->parseResultElement(parser, ele))
|
||||
.filter((res)->res!=null)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
protected Result parseResultElement(Element ele)
|
||||
protected <E> Result parseResultElement(Parser<?,E> parser, E ele)
|
||||
{
|
||||
try
|
||||
{
|
||||
Result res = new Result();
|
||||
// Artist
|
||||
if (resultArtistSelector != null)
|
||||
{
|
||||
Element artistEle = ele.selectFirst(resultArtistSelector);
|
||||
if (artistEle != null)
|
||||
res.setArtist(artistEle.text());
|
||||
}
|
||||
res.setArtist(parser.getField(ele, resultArtistSelector));
|
||||
// Title
|
||||
if (resultTitleSelector != null)
|
||||
{
|
||||
Element titleEle = ele.selectFirst(resultTitleSelector);
|
||||
if (titleEle != null)
|
||||
res.setTitle(titleEle.text());
|
||||
}
|
||||
res.setTitle(parser.getField(ele, resultTitleSelector));
|
||||
// Link
|
||||
if (resultLinkSelector != null)
|
||||
{
|
||||
Element linkEle = ele.selectFirst(resultLinkSelector);
|
||||
if (linkEle != null)
|
||||
{
|
||||
String link;
|
||||
if (linkEle.hasAttr("href"))
|
||||
link = linkEle.attr("href");
|
||||
else
|
||||
link = linkEle.text();
|
||||
|
||||
URL url;
|
||||
if(!link.startsWith("http"))
|
||||
url = new URL(rootURL, link);
|
||||
else
|
||||
url = new URL(link);
|
||||
|
||||
res.setLink(url);
|
||||
}
|
||||
}
|
||||
res.setLink(parser.getURLField(ele, rootURL, resultLinkSelector));
|
||||
|
||||
// Artist + Album
|
||||
if (resultAlbumArtistSelector != null)
|
||||
{
|
||||
Element alArtEle = ele.selectFirst(resultAlbumArtistSelector);
|
||||
if (alArtEle != null)
|
||||
{
|
||||
|
||||
}
|
||||
}
|
||||
res.setAlbumArtist(parser.getField(ele, resultAlbumArtistSelector));
|
||||
|
||||
// Artist + Title
|
||||
|
||||
|
@ -146,5 +166,4 @@ public class WebScraperSearcher extends Searcher
|
|||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue