Implemented web scraper search.

master
Nekojimi 4 years ago
parent 4d7eccd1e0
commit 68869a149e
  1. 68
      searchproviders.yml
  2. 56
      src/main/java/moe/nekojimi/musicsearcher/Main.java
  3. 47
      src/main/java/moe/nekojimi/musicsearcher/Result.java
  4. 28
      src/main/java/moe/nekojimi/musicsearcher/providers/MetaSearcher.java
  5. 35
      src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java
  6. 125
      src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java

@ -1,23 +1,53 @@
- name: Soundcloud
type: WebScraperProvider
search_url: https://soundcloud.com/search/sounds?q=$QUERY
result_item: li.searchList__item
result_fields:
artist: .soundTitle__username
title: .soundTitle__title
link_href: a.soundTitle__title
#- name: Soundcloud
#abbr: SC
#type: WebScraperSearcher
#search_url: https://soundcloud.com/search/sounds?q=$QUERY
#result_selector: li.searchList__item
#result_field_selectors:
#artist: .soundTitle__username
#title: .soundTitle__title
#link: a.soundTitle__title
#- name: Soundcloud
#abbr: SC
#type: ApiSearcher
#search_url: https://api.soundcloud.com/tracks?q=$QUERY&access=playable&limit=10&linked_partitioning=true
#format: json
#result_array: collection
#result_field_names:
#title: title
#link: uri
- name: Bandcamp
type: WebScraperProvider
abbr: BC
type: WebScraperSearcher
search_url: https://bandcamp.com/search?q=$QUERY&item_type
result_item: li.searchresult
result_fields:
result_selector: li.searchresult
result_field_selectors:
title: .heading
link_href: .heading a
link: .heading a
album_artist: .subhead
# - name: Youtube
# search_url:
# result_item:
# result_fields:
# title:
# link_href:
# album_artist:
#- name: Youtube
#abbr: YT
#search_url: https://www.youtube.com/results?search_query=$QUERY
#result_item: ytd-video-renderer.ytd-item-section-renderer
#result_fields:
#title:
#link_href:
#album_artist:
#- name: Jamendo
#abbr: JM
#type: WebScraperSearcher
#search_url: https://www.jamendo.com/search/tracks?q=$QUERY
#result_selector: li.active-result
#result_field_selectors:
#link: a.js-search-item-link
#title_artist: a.js-search-item-link
#- name: Jamendo
#abbr: JM
#type: ApiSearcher
#search_url: https://api.jamendo.com/v3.0/tracks/?client_id=$SECRET&format=jsonpretty&limit=10&include=musicinfo&groupby=artist_id&search=$QUERY
#format: json
#result_array: results
#result_field_names:
#title: name
#artist: artist_name
#album: album_name

@ -5,17 +5,69 @@
*/
package moe.nekojimi.musicsearcher;
import com.amihaiemil.eoyaml.Yaml;
import com.amihaiemil.eoyaml.YamlInput;
import com.amihaiemil.eoyaml.YamlMapping;
import com.amihaiemil.eoyaml.YamlSequence;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.logging.Level;
import java.util.logging.Logger;
import moe.nekojimi.musicsearcher.providers.Searcher;
/**
*
* @author jim
*/
public class Main
{
private static final Map<String, Searcher> searchers = new HashMap<>();
/**
* @param args the command line arguments
*/
public static void main(String[] args)
public static void main(String[] args) throws IOException
{
System.out.println("Hello world!");
// System.out.println("Hello world!");
YamlInput input = Yaml.createYamlInput(new File("searchproviders.yml"));
YamlSequence seq = input.readYamlSequence();
for (int i = 0; i < seq.size(); i++)
{
try
{
YamlMapping map = seq.yamlMapping(i);
String type = map.string("type");
Class<? extends Searcher> clazz = (Class<? extends Searcher>) Class.forName("moe.nekojimi.musicsearcher.providers." + type);
Constructor<? extends Searcher> constructor = clazz.getConstructor(YamlMapping.class);
Searcher searcher = constructor.newInstance(map);
searchers.put(searcher.getName(), searcher);
} catch (ClassNotFoundException | NoSuchMethodException | SecurityException | InstantiationException | IllegalAccessException | IllegalArgumentException | InvocationTargetException ex) {
Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
}
}
System.out.println(searchers);
String query = "Test";
for (Searcher searcher: searchers.values())
{
System.out.println("Searching " + searcher.getName() + " for " + query);
try
{
List<Result> results = searcher.searchAndWait(query);
for (Result result: results)
{
System.out.println("\t" + result);
}
} catch (InterruptedException | ExecutionException ex) {
Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex);
}
}
}
}

@ -5,10 +5,55 @@
*/
package moe.nekojimi.musicsearcher;
import java.net.URL;
/**
*
* @author jim
*/
public class Result {
/**
 * Mutable holder for a single search hit: a link plus artist, album and title
 * text. Every field starts out as {@code null} until its setter is called.
 */
public class Result
{
    private String title;
    private String album;
    private String artist;
    private URL link;

    /** @return the link stored by {@link #setLink(URL)}, or {@code null} if none was set */
    public URL getLink()
    {
        return this.link;
    }

    /** @param link the link to store for this result */
    public void setLink(URL link)
    {
        this.link = link;
    }

    /** @return the artist stored by {@link #setArtist(String)}, or {@code null} if none was set */
    public String getArtist()
    {
        return this.artist;
    }

    /** @param artist the artist text to store for this result */
    public void setArtist(String artist)
    {
        this.artist = artist;
    }

    /** @return the album stored by {@link #setAlbum(String)}, or {@code null} if none was set */
    public String getAlbum()
    {
        return this.album;
    }

    /** @param album the album text to store for this result */
    public void setAlbum(String album)
    {
        this.album = album;
    }

    /** @return the title stored by {@link #setTitle(String)}, or {@code null} if none was set */
    public String getTitle()
    {
        return this.title;
    }

    /** @param title the title text to store for this result */
    public void setTitle(String title)
    {
        this.title = title;
    }

    /**
     * Renders all four fields in the fixed order link, artist, album, title;
     * unset fields appear as the text {@code null}.
     */
    @Override
    public String toString()
    {
        StringBuilder text = new StringBuilder("Result{");
        text.append("link=").append(link);
        text.append(", artist=").append(artist);
        text.append(", album=").append(album);
        text.append(", title=").append(title);
        return text.append('}').toString();
    }
}

@ -5,8 +5,11 @@
*/
package moe.nekojimi.musicsearcher.providers;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import moe.nekojimi.musicsearcher.Result;
/**
@ -16,7 +19,9 @@ import moe.nekojimi.musicsearcher.Result;
public class MetaSearcher extends Searcher
{
private final Set<Searcher> searchers;
private final Set<Searcher> searchers = new HashSet<>();
private int minSearchTime = 10000; // ms
private int maxSearchTime = 30000; // ms
public MetaSearcher()
{
@ -24,9 +29,26 @@ public class MetaSearcher extends Searcher
}
@Override
public List<Result> search(String query)
protected List<Result> doSearch(String query)
{
throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
List<Result> results = new ArrayList<>();
List<CompletableFuture<List<Result>>> searches = new ArrayList<>();
for (Searcher searcher: searchers)
{
CompletableFuture<List<Result>> search = searcher.search(query);
searches.add(search);
search.whenComplete((t, u) ->
{
if (u == null)
{
results.addAll(t);
// searches.remove(search);
}
});
}
CompletableFuture.allOf((CompletableFuture<?>[]) searches.toArray());
return results;
}
}

@ -5,11 +5,11 @@
*/
package moe.nekojimi.musicsearcher.providers;
import com.amihaiemil.eoyaml.YamlMapping;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import moe.nekojimi.musicsearcher.Result;
@ -20,7 +20,7 @@ import moe.nekojimi.musicsearcher.Result;
*/
public abstract class Searcher
{
private final String name;
final String name;
private final ForkJoinPool executor;
public Searcher(String name)
@ -29,6 +29,17 @@ public abstract class Searcher
this.executor = new ForkJoinPool();
}
/**
 * Creates a searcher from one provider entry of the configuration file, taking
 * its name from the entry's {@code name} key.
 * <p>
 * When assertions are enabled, the entry's {@code type} key must equal the simple
 * class name of the concrete searcher being constructed.
 *
 * @param yaml the provider entry to read from
 */
public Searcher(YamlMapping yaml)
{
    this(yaml.string("name"));
    // Compare from the side that is never null, so that an entry without a "type"
    // key fails the assertion with a readable message instead of a NullPointerException.
    assert this.getClass().getSimpleName().equals(yaml.string("type"))
            : "Provider type '" + yaml.string("type") + "' does not match " + this.getClass().getSimpleName();
}
/**
 * @return the value of this searcher's {@code name} field
 */
public String getName()
{
    return this.name;
}
public List<Result> searchAndWait(String query) throws InterruptedException, ExecutionException
{
try
@ -45,15 +56,19 @@ public abstract class Searcher
return search(query).get(limit, unit);
}
public Future<List<Result>> search(String query)
public CompletableFuture<List<Result>> search(String query)
{
FutureTask<List<Result>> task = new FutureTask<>(() ->
{
return doSearch(query);
});
executor.execute(task);
return task;
CompletableFuture<List<Result>> future = new CompletableFuture<>();
future.completeAsync(()->doSearch(query), executor);
return future;
}
/**
 * Performs the actual search for this provider. {@link #search(String)} invokes
 * this method on the searcher's executor rather than on the caller's thread.
 *
 * @param query the search text, passed through unchanged from {@link #search(String)}
 * @return the results found for the query
 */
protected abstract List<Result> doSearch(String query);
/**
 * Renders this searcher as its concrete simple class name followed by its name
 * in braces.
 */
@Override
public String toString()
{
    String typeName = this.getClass().getSimpleName();
    return typeName + "{name=" + name + '}';
}
}

@ -5,9 +5,21 @@
*/
package moe.nekojimi.musicsearcher.providers;
import com.amihaiemil.eoyaml.YamlMapping;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import moe.nekojimi.musicsearcher.Result;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
*
@ -16,22 +28,123 @@ import org.jsoup.Jsoup;
public class WebScraperSearcher extends Searcher
{
private String searchUrl;
private URL rootURL;
private String resultItem;
private String artistItem;
private String titleItem;
private String linkHrefItem;
private String albumArtistItem;
private String resultSelector;
private String resultArtistSelector;
private String resultTitleSelector;
private String resultLinkSelector;
private String resultAlbumArtistSelector;
public WebScraperSearcher(String name)
{
super(name);
}
public WebScraperSearcher(YamlMapping yaml) throws MalformedURLException
{
super(yaml);
searchUrl = yaml.string("search_url");
rootURL = fillURL("");
rootURL = new URL(rootURL.getProtocol(), rootURL.getHost(), "");
resultSelector = yaml.string("result_selector");
YamlMapping fields = yaml.yamlMapping("result_field_selectors");
if (fields != null)
{
resultArtistSelector = fields.string("artist");
resultTitleSelector = fields.string("title");
resultLinkSelector = fields.string("link");
resultAlbumArtistSelector = fields.string("album_artist");
}
}
@Override
protected List<Result> doSearch(String query)
{
Jsoup.
try
{
URL url = fillURL(query);
Document doc = Jsoup.parse(url, 10000);
System.out.println("Document from " + name + ":" + doc.html());
Elements resultEles = doc.select(resultSelector);
return resultEles.stream()
.map((ele)->parseResultElement(ele))
.filter((res)->res!=null)
.collect(Collectors.toList());
} catch (IOException ex)
{
Logger.getLogger(WebScraperSearcher.class.getName()).log(Level.SEVERE, null, ex);
return List.of();
}
}
protected URL fillURL(String query) throws MalformedURLException
{
URL url = new URL(searchUrl.replaceAll("\\$QUERY", URLEncoder.encode(query, Charset.forName("utf-8"))));
return url;
}
protected Result parseResultElement(Element ele)
{
try
{
Result res = new Result();
// Artist
if (resultArtistSelector != null)
{
Element artistEle = ele.selectFirst(resultArtistSelector);
if (artistEle != null)
res.setArtist(artistEle.text());
}
// Title
if (resultTitleSelector != null)
{
Element titleEle = ele.selectFirst(resultTitleSelector);
if (titleEle != null)
res.setTitle(titleEle.text());
}
// Link
if (resultLinkSelector != null)
{
Element linkEle = ele.selectFirst(resultLinkSelector);
if (linkEle != null)
{
String link;
if (linkEle.hasAttr("href"))
link = linkEle.attr("href");
else
link = linkEle.text();
URL url;
if(!link.startsWith("http"))
url = new URL(rootURL, link);
else
url = new URL(link);
res.setLink(url);
}
}
// Artist + Album
if (resultAlbumArtistSelector != null)
{
Element alArtEle = ele.selectFirst(resultAlbumArtistSelector);
if (alArtEle != null)
{
}
}
// Artist + Title
return res;
}
catch (Exception ex)
{
Logger.getLogger(WebScraperSearcher.class.getName()).log(Level.SEVERE, null, ex);
return null;
}
}
}

Loading…
Cancel
Save