generated from Nekojimi/JavaMavenTemplate
Implemented web scraper search.
This commit is contained in:
parent
4d7eccd1e0
commit
68869a149e
|
@ -1,23 +1,53 @@
|
||||||
- name: Soundcloud
|
#- name: Soundcloud
|
||||||
type: WebScraperProvider
|
#abbr: SC
|
||||||
search_url: https://soundcloud.com/search/sounds?q=$QUERY
|
#type: WebScraperSearcher
|
||||||
result_item: li.searchList__item
|
#search_url: https://soundcloud.com/search/sounds?q=$QUERY
|
||||||
result_fields:
|
#result_selector: li.searchList__item
|
||||||
artist: .soundTitle__username
|
#result_field_selectors:
|
||||||
title: .soundTitle__title
|
#artist: .soundTitle__username
|
||||||
link_href: a.soundTitle__title
|
#title: .soundTitle__title
|
||||||
|
#link: a.soundTitle__title
|
||||||
|
#- name: Soundcloud
|
||||||
|
#abbr: SC
|
||||||
|
#type: ApiSearcher
|
||||||
|
#search_url: https://api.soundcloud.com/tracks?q=$QUERY&access=playable&limit=10&linked_partitioning=true
|
||||||
|
#format: json
|
||||||
|
#result_array: collection
|
||||||
|
#result_field_names:
|
||||||
|
#title: title
|
||||||
|
#link: uri
|
||||||
- name: Bandcamp
|
- name: Bandcamp
|
||||||
type: WebScraperProvider
|
abbr: BC
|
||||||
|
type: WebScraperSearcher
|
||||||
search_url: https://bandcamp.com/search?q=$QUERY&item_type
|
search_url: https://bandcamp.com/search?q=$QUERY&item_type
|
||||||
result_item: li.searchresult
|
result_selector: li.searchresult
|
||||||
result_fields:
|
result_field_selectors:
|
||||||
title: .heading
|
title: .heading
|
||||||
link_href: .heading a
|
link: .heading a
|
||||||
album_artist: .subhead
|
album_artist: .subhead
|
||||||
# - name: Youtube
|
#- name: Youtube
|
||||||
# search_url:
|
#abbr: YT
|
||||||
# result_item:
|
#search_url: https://www.youtube.com/results?search_query=$QUERY
|
||||||
# result_fields:
|
#result_selector: ytd-video-renderer.ytd-item-section-renderer
|
||||||
# title:
|
#result_field_selectors:
|
||||||
# link_href:
|
#title:
|
||||||
# album_artist:
|
#link:
|
||||||
|
#album_artist:
|
||||||
|
#- name: Jamendo
|
||||||
|
#abbr: JM
|
||||||
|
#type: WebScraperSearcher
|
||||||
|
#search_url: https://www.jamendo.com/search/tracks?q=$QUERY
|
||||||
|
#result_selector: li.active-result
|
||||||
|
#result_field_selectors:
|
||||||
|
#link: a.js-search-item-link
|
||||||
|
#title_artist: a.js-search-item-link
|
||||||
|
#- name: Jamendo
|
||||||
|
#abbr: JM
|
||||||
|
#type: ApiSearcher
|
||||||
|
#search_url: https://api.jamendo.com/v3.0/tracks/?client_id=$SECRET&format=jsonpretty&limit=10&include=musicinfo&groupby=artist_id&search=$QUERY
|
||||||
|
#format: json
|
||||||
|
#result_array: results
|
||||||
|
#result_field_names:
|
||||||
|
#title: name
|
||||||
|
#artist: artist_name
|
||||||
|
#album: album_name
|
||||||
|
|
|
@ -5,17 +5,69 @@
|
||||||
*/
|
*/
|
||||||
package moe.nekojimi.musicsearcher;
|
package moe.nekojimi.musicsearcher;
|
||||||
|
|
||||||
|
import com.amihaiemil.eoyaml.Yaml;
|
||||||
|
import com.amihaiemil.eoyaml.YamlInput;
|
||||||
|
import com.amihaiemil.eoyaml.YamlMapping;
|
||||||
|
import com.amihaiemil.eoyaml.YamlSequence;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.lang.reflect.Constructor;
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.ExecutionException;
|
||||||
|
import java.util.logging.Level;
|
||||||
|
import java.util.logging.Logger;
|
||||||
|
import moe.nekojimi.musicsearcher.providers.Searcher;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @author jim
|
* @author jim
|
||||||
*/
|
*/
|
||||||
public class Main
|
public class Main
|
||||||
{
|
{
|
||||||
|
private static final Map<String, Searcher> searchers = new HashMap<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param args the command line arguments
|
* @param args the command line arguments
|
||||||
*/
|
*/
|
||||||
public static void main(String[] args)
|
public static void main(String[] args) throws IOException
|
||||||
{
|
{
|
||||||
System.out.println("Hello world!");
|
// System.out.println("Hello world!");
|
||||||
|
YamlInput input = Yaml.createYamlInput(new File("searchproviders.yml"));
|
||||||
|
YamlSequence seq = input.readYamlSequence();
|
||||||
|
for (int i = 0; i < seq.size(); i++)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
YamlMapping map = seq.yamlMapping(i);
|
||||||
|
String type = map.string("type");
|
||||||
|
Class<? extends Searcher> clazz = (Class<? extends Searcher>) Class.forName("moe.nekojimi.musicsearcher.providers." + type);
|
||||||
|
Constructor<? extends Searcher> constructor = clazz.getConstructor(YamlMapping.class);
|
||||||
|
Searcher searcher = constructor.newInstance(map);
|
||||||
|
searchers.put(searcher.getName(), searcher);
|
||||||
|
|
||||||
|
} catch (ClassNotFoundException | NoSuchMethodException | SecurityException | InstantiationException | IllegalAccessException | IllegalArgumentException | InvocationTargetException ex) {
|
||||||
|
Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println(searchers);
|
||||||
|
String query = "Test";
|
||||||
|
for (Searcher searcher: searchers.values())
|
||||||
|
{
|
||||||
|
System.out.println("Searching " + searcher.getName() + " for " + query);
|
||||||
|
try
|
||||||
|
{
|
||||||
|
List<Result> results = searcher.searchAndWait(query);
|
||||||
|
for (Result result: results)
|
||||||
|
{
|
||||||
|
System.out.println("\t" + result);
|
||||||
|
}
|
||||||
|
} catch (InterruptedException | ExecutionException ex) {
|
||||||
|
Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,10 +5,55 @@
|
||||||
*/
|
*/
|
||||||
package moe.nekojimi.musicsearcher;
|
package moe.nekojimi.musicsearcher;
|
||||||
|
|
||||||
|
import java.net.URL;
|
||||||
|
|
||||||
/**
 * One hit returned by a search provider: a link to the track plus whatever
 * artist, album and title text the provider supplied. Every field is optional
 * and stays {@code null} until set.
 *
 * @author jim
 */
public class Result
{
    private URL link;
    private String artist;
    private String album;
    private String title;

    /** @return where the result can be found, or {@code null} if unknown */
    public URL getLink()
    {
        return this.link;
    }

    /** @param link where the result can be found */
    public void setLink(URL link)
    {
        this.link = link;
    }

    /** @return the artist name, or {@code null} if unknown */
    public String getArtist()
    {
        return this.artist;
    }

    /** @param artist the artist name */
    public void setArtist(String artist)
    {
        this.artist = artist;
    }

    /** @return the album name, or {@code null} if unknown */
    public String getAlbum()
    {
        return this.album;
    }

    /** @param album the album name */
    public void setAlbum(String album)
    {
        this.album = album;
    }

    /** @return the track title, or {@code null} if unknown */
    public String getTitle()
    {
        return this.title;
    }

    /** @param title the track title */
    public void setTitle(String title)
    {
        this.title = title;
    }

    /** @return all four fields in the form {@code Result{link=..., artist=..., album=..., title=...}} */
    @Override
    public String toString()
    {
        StringBuilder text = new StringBuilder("Result{");
        text.append("link=").append(link);
        text.append(", artist=").append(artist);
        text.append(", album=").append(album);
        text.append(", title=").append(title);
        return text.append('}').toString();
    }
}
|
||||||
|
|
|
@ -5,8 +5,11 @@
|
||||||
*/
|
*/
|
||||||
package moe.nekojimi.musicsearcher.providers;
|
package moe.nekojimi.musicsearcher.providers;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.CompletableFuture;
|
||||||
import moe.nekojimi.musicsearcher.Result;
|
import moe.nekojimi.musicsearcher.Result;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -16,7 +19,9 @@ import moe.nekojimi.musicsearcher.Result;
|
||||||
public class MetaSearcher extends Searcher
|
public class MetaSearcher extends Searcher
|
||||||
{
|
{
|
||||||
|
|
||||||
private final Set<Searcher> searchers;
|
private final Set<Searcher> searchers = new HashSet<>();
|
||||||
|
private int minSearchTime = 10000; // ms
|
||||||
|
private int maxSearchTime = 30000; // ms
|
||||||
|
|
||||||
public MetaSearcher()
|
public MetaSearcher()
|
||||||
{
|
{
|
||||||
|
@ -24,9 +29,26 @@ public class MetaSearcher extends Searcher
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Result> search(String query)
|
protected List<Result> doSearch(String query)
|
||||||
{
|
{
|
||||||
throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
|
List<Result> results = new ArrayList<>();
|
||||||
|
List<CompletableFuture<List<Result>>> searches = new ArrayList<>();
|
||||||
|
for (Searcher searcher: searchers)
|
||||||
|
{
|
||||||
|
CompletableFuture<List<Result>> search = searcher.search(query);
|
||||||
|
searches.add(search);
|
||||||
|
search.whenComplete((t, u) ->
|
||||||
|
{
|
||||||
|
if (u == null)
|
||||||
|
{
|
||||||
|
results.addAll(t);
|
||||||
|
// searches.remove(search);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
CompletableFuture.allOf((CompletableFuture<?>[]) searches.toArray());
|
||||||
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,11 +5,11 @@
|
||||||
*/
|
*/
|
||||||
package moe.nekojimi.musicsearcher.providers;
|
package moe.nekojimi.musicsearcher.providers;
|
||||||
|
|
||||||
|
import com.amihaiemil.eoyaml.YamlMapping;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.concurrent.CompletableFuture;
|
||||||
import java.util.concurrent.ExecutionException;
|
import java.util.concurrent.ExecutionException;
|
||||||
import java.util.concurrent.ForkJoinPool;
|
import java.util.concurrent.ForkJoinPool;
|
||||||
import java.util.concurrent.Future;
|
|
||||||
import java.util.concurrent.FutureTask;
|
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.TimeoutException;
|
import java.util.concurrent.TimeoutException;
|
||||||
import moe.nekojimi.musicsearcher.Result;
|
import moe.nekojimi.musicsearcher.Result;
|
||||||
|
@ -20,7 +20,7 @@ import moe.nekojimi.musicsearcher.Result;
|
||||||
*/
|
*/
|
||||||
public abstract class Searcher
|
public abstract class Searcher
|
||||||
{
|
{
|
||||||
private final String name;
|
final String name;
|
||||||
private final ForkJoinPool executor;
|
private final ForkJoinPool executor;
|
||||||
|
|
||||||
public Searcher(String name)
|
public Searcher(String name)
|
||||||
|
@ -29,6 +29,17 @@ public abstract class Searcher
|
||||||
this.executor = new ForkJoinPool();
|
this.executor = new ForkJoinPool();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Searcher(YamlMapping yaml)
|
||||||
|
{
|
||||||
|
this(yaml.string("name"));
|
||||||
|
assert yaml.string("type").equals(this.getClass().getSimpleName());
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName()
|
||||||
|
{
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
public List<Result> searchAndWait(String query) throws InterruptedException, ExecutionException
|
public List<Result> searchAndWait(String query) throws InterruptedException, ExecutionException
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
|
@ -45,15 +56,19 @@ public abstract class Searcher
|
||||||
return search(query).get(limit, unit);
|
return search(query).get(limit, unit);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Future<List<Result>> search(String query)
|
public CompletableFuture<List<Result>> search(String query)
|
||||||
{
|
{
|
||||||
FutureTask<List<Result>> task = new FutureTask<>(() ->
|
CompletableFuture<List<Result>> future = new CompletableFuture<>();
|
||||||
{
|
future.completeAsync(()->doSearch(query), executor);
|
||||||
return doSearch(query);
|
return future;
|
||||||
});
|
|
||||||
executor.execute(task);
|
|
||||||
return task;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected abstract List<Result> doSearch(String query);
|
protected abstract List<Result> doSearch(String query);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return this.getClass().getSimpleName() + "{" + "name=" + name + '}';
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,9 +5,21 @@
|
||||||
*/
|
*/
|
||||||
package moe.nekojimi.musicsearcher.providers;
|
package moe.nekojimi.musicsearcher.providers;
|
||||||
|
|
||||||
|
import com.amihaiemil.eoyaml.YamlMapping;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.net.URLEncoder;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.logging.Level;
|
||||||
|
import java.util.logging.Logger;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import moe.nekojimi.musicsearcher.Result;
|
import moe.nekojimi.musicsearcher.Result;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
|
@ -16,22 +28,123 @@ import org.jsoup.Jsoup;
|
||||||
public class WebScraperSearcher extends Searcher
|
public class WebScraperSearcher extends Searcher
|
||||||
{
|
{
|
||||||
private String searchUrl;
|
private String searchUrl;
|
||||||
|
private URL rootURL;
|
||||||
|
|
||||||
private String resultItem;
|
private String resultSelector;
|
||||||
private String artistItem;
|
private String resultArtistSelector;
|
||||||
private String titleItem;
|
private String resultTitleSelector;
|
||||||
private String linkHrefItem;
|
private String resultLinkSelector;
|
||||||
private String albumArtistItem;
|
private String resultAlbumArtistSelector;
|
||||||
|
|
||||||
public WebScraperSearcher(String name)
|
public WebScraperSearcher(String name)
|
||||||
{
|
{
|
||||||
super(name);
|
super(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public WebScraperSearcher(YamlMapping yaml) throws MalformedURLException
|
||||||
|
{
|
||||||
|
super(yaml);
|
||||||
|
searchUrl = yaml.string("search_url");
|
||||||
|
rootURL = fillURL("");
|
||||||
|
rootURL = new URL(rootURL.getProtocol(), rootURL.getHost(), "");
|
||||||
|
resultSelector = yaml.string("result_selector");
|
||||||
|
|
||||||
|
YamlMapping fields = yaml.yamlMapping("result_field_selectors");
|
||||||
|
if (fields != null)
|
||||||
|
{
|
||||||
|
resultArtistSelector = fields.string("artist");
|
||||||
|
resultTitleSelector = fields.string("title");
|
||||||
|
resultLinkSelector = fields.string("link");
|
||||||
|
resultAlbumArtistSelector = fields.string("album_artist");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Result> doSearch(String query)
|
protected List<Result> doSearch(String query)
|
||||||
{
|
{
|
||||||
Jsoup.
|
try
|
||||||
|
{
|
||||||
|
URL url = fillURL(query);
|
||||||
|
Document doc = Jsoup.parse(url, 10000);
|
||||||
|
System.out.println("Document from " + name + ":" + doc.html());
|
||||||
|
Elements resultEles = doc.select(resultSelector);
|
||||||
|
return resultEles.stream()
|
||||||
|
.map((ele)->parseResultElement(ele))
|
||||||
|
.filter((res)->res!=null)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
} catch (IOException ex)
|
||||||
|
{
|
||||||
|
Logger.getLogger(WebScraperSearcher.class.getName()).log(Level.SEVERE, null, ex);
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected URL fillURL(String query) throws MalformedURLException
|
||||||
|
{
|
||||||
|
URL url = new URL(searchUrl.replaceAll("\\$QUERY", URLEncoder.encode(query, Charset.forName("utf-8"))));
|
||||||
|
return url;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Result parseResultElement(Element ele)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
Result res = new Result();
|
||||||
|
// Artist
|
||||||
|
if (resultArtistSelector != null)
|
||||||
|
{
|
||||||
|
Element artistEle = ele.selectFirst(resultArtistSelector);
|
||||||
|
if (artistEle != null)
|
||||||
|
res.setArtist(artistEle.text());
|
||||||
|
}
|
||||||
|
// Title
|
||||||
|
if (resultTitleSelector != null)
|
||||||
|
{
|
||||||
|
Element titleEle = ele.selectFirst(resultTitleSelector);
|
||||||
|
if (titleEle != null)
|
||||||
|
res.setTitle(titleEle.text());
|
||||||
|
}
|
||||||
|
// Link
|
||||||
|
if (resultLinkSelector != null)
|
||||||
|
{
|
||||||
|
Element linkEle = ele.selectFirst(resultLinkSelector);
|
||||||
|
if (linkEle != null)
|
||||||
|
{
|
||||||
|
String link;
|
||||||
|
if (linkEle.hasAttr("href"))
|
||||||
|
link = linkEle.attr("href");
|
||||||
|
else
|
||||||
|
link = linkEle.text();
|
||||||
|
|
||||||
|
URL url;
|
||||||
|
if(!link.startsWith("http"))
|
||||||
|
url = new URL(rootURL, link);
|
||||||
|
else
|
||||||
|
url = new URL(link);
|
||||||
|
|
||||||
|
res.setLink(url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Artist + Album
|
||||||
|
if (resultAlbumArtistSelector != null)
|
||||||
|
{
|
||||||
|
Element alArtEle = ele.selectFirst(resultAlbumArtistSelector);
|
||||||
|
if (alArtEle != null)
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Artist + Title
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
Logger.getLogger(WebScraperSearcher.class.getName()).log(Level.SEVERE, null, ex);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue