From 68869a149ece5e4cff861ed7ce6fdec86477e453 Mon Sep 17 00:00:00 2001 From: Nekojimi Date: Wed, 29 Sep 2021 17:19:09 +0100 Subject: [PATCH] Implemented web scraper search. --- searchproviders.yml | 68 +++++++--- .../java/moe/nekojimi/musicsearcher/Main.java | 56 +++++++- .../moe/nekojimi/musicsearcher/Result.java | 47 ++++++- .../musicsearcher/providers/MetaSearcher.java | 28 +++- .../musicsearcher/providers/Searcher.java | 35 +++-- .../providers/WebScraperSearcher.java | 125 +++++++++++++++++- 6 files changed, 318 insertions(+), 41 deletions(-) diff --git a/searchproviders.yml b/searchproviders.yml index d962bc2..28f9394 100644 --- a/searchproviders.yml +++ b/searchproviders.yml @@ -1,23 +1,53 @@ -- name: Soundcloud - type: WebScraperProvider - search_url: https://soundcloud.com/search/sounds?q=$QUERY - result_item: li.searchList__item - result_fields: - artist: .soundTitle__username - title: .soundTitle__title - link_href: a.soundTitle__title +#- name: Soundcloud + #abbr: SC + #type: WebScraperSearcher + #search_url: https://soundcloud.com/search/sounds?q=$QUERY + #result_selector: li.searchList__item + #result_field_selectors: + #artist: .soundTitle__username + #title: .soundTitle__title + #link: a.soundTitle__title +#- name: Soundcloud + #abbr:SC + #type: ApiSearcher + #search_url: https://api.soundcloud.com/tracks?q=$QUERY&access=playable&limit=10&linked_partitioning=true + #format: json + #result_array: collection + #result_field_names: + #title: title + #link: uri - name: Bandcamp - type: WebScraperProvider + abbr: BC + type: WebScraperSearcher search_url: https://bandcamp.com/search?q=$QUERY&item_type - result_item: li.searchresult - result_fields: + result_selector: li.searchresult + result_field_selectors: title: .heading - link_href: .heading a + link: .heading a album_artist: .subhead -# - name: Youtube -# search_url: -# result_item: -# result_fields: -# title: -# link_href: -# album_artist: + #- name: Youtube + #abbr: YT + #search_url: https://www.youtube.com/results?search_query=$QUERY + #result_item: ytd-video-renderer.ytd-item-section-renderer + #result_fields: + #title: + #link_href: + #album_artist: +#- name: Jamendo + #abbr: JM + #type: WebScraperSearcher + #search_url: https://www.jamendo.com/search/tracks?q=$QUERY + #result_selector: li.active-result + #result_field_selectors: + #link: a.js-search-item-link + #title_artist: a.js-search-item-link +#- Name: Jamendo + #abbr: JM + #type: ApiSearcher + #search_url: https://api.jamendo.com/v3.0/tracks/?client_id=$SECRET&format=jsonpretty&limit=10&include=musicinfo&groupby=artist_id&search=$QUERY + #format: json + #result_array: results + #result_field_names: + #title: name + #artist: artist_name + #album: album_name diff --git a/src/main/java/moe/nekojimi/musicsearcher/Main.java b/src/main/java/moe/nekojimi/musicsearcher/Main.java index c438b64..f1fa57a 100644 --- a/src/main/java/moe/nekojimi/musicsearcher/Main.java +++ b/src/main/java/moe/nekojimi/musicsearcher/Main.java @@ -5,17 +5,69 @@ */ package moe.nekojimi.musicsearcher; +import com.amihaiemil.eoyaml.Yaml; +import com.amihaiemil.eoyaml.YamlInput; +import com.amihaiemil.eoyaml.YamlMapping; +import com.amihaiemil.eoyaml.YamlSequence; +import java.io.File; +import java.io.IOException; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.logging.Level; +import java.util.logging.Logger; +import moe.nekojimi.musicsearcher.providers.Searcher; + /** * * @author jim */ public class Main { + private static final Map searchers = new HashMap<>(); + /** * @param args the command line arguments */ - public static void main(String[] args) + public static void main(String[] args) throws IOException { - System.out.println("Hello world!"); +// System.out.println("Hello world!"); + YamlInput input = Yaml.createYamlInput(new File("searchproviders.yml")); + YamlSequence seq = input.readYamlSequence(); + for (int i = 0; i < seq.size(); i++) + { + try + { + YamlMapping map = seq.yamlMapping(i); + String type = map.string("type"); + Class clazz = (Class) Class.forName("moe.nekojimi.musicsearcher.providers." + type); + Constructor constructor = clazz.getConstructor(YamlMapping.class); + Searcher searcher = constructor.newInstance(map); + searchers.put(searcher.getName(), searcher); + + } catch (ClassNotFoundException | NoSuchMethodException | SecurityException | InstantiationException | IllegalAccessException | IllegalArgumentException | InvocationTargetException ex) { + Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex); + } + } + + System.out.println(searchers); + String query = "Test"; + for (Searcher searcher: searchers.values()) + { + System.out.println("Searching " + searcher.getName() + " for " + query); + try + { + List results = searcher.searchAndWait(query); + for (Result result: results) + { + System.out.println("\t" + result); + } + } catch (InterruptedException | ExecutionException ex) { + Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex); + } + } } } diff --git a/src/main/java/moe/nekojimi/musicsearcher/Result.java b/src/main/java/moe/nekojimi/musicsearcher/Result.java index 9e1becf..d0d47f1 100644 --- a/src/main/java/moe/nekojimi/musicsearcher/Result.java +++ b/src/main/java/moe/nekojimi/musicsearcher/Result.java @@ -5,10 +5,55 @@ */ package moe.nekojimi.musicsearcher; +import java.net.URL; + /** * * @author jim */ -public class Result { +public class Result +{ + private URL link; + private String artist; + private String album; + private String title; + + public URL getLink() { + return link; + } + + public void setLink(URL link) { + this.link = link; + } + + public String getArtist() { + return artist; + } + + public void setArtist(String artist) { + this.artist = artist; + } + + public String getAlbum() { + return album; + } + + public void setAlbum(String album) { + this.album = album; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + @Override + public String toString() { + return "Result{" + "link=" + link + ", artist=" + artist + ", album=" + album + ", title=" + title + '}'; + } + } diff --git a/src/main/java/moe/nekojimi/musicsearcher/providers/MetaSearcher.java b/src/main/java/moe/nekojimi/musicsearcher/providers/MetaSearcher.java index 8952adc..e5b0297 100644 --- a/src/main/java/moe/nekojimi/musicsearcher/providers/MetaSearcher.java +++ b/src/main/java/moe/nekojimi/musicsearcher/providers/MetaSearcher.java @@ -5,8 +5,11 @@ */ package moe.nekojimi.musicsearcher.providers; +import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.concurrent.CompletableFuture; import moe.nekojimi.musicsearcher.Result; /** @@ -16,7 +19,9 @@ import moe.nekojimi.musicsearcher.Result; public class MetaSearcher extends Searcher { - private final Set searchers; + private final Set searchers = new HashSet<>(); + private int minSearchTime = 10000; // ms + private int maxSearchTime = 30000; // ms public MetaSearcher() { @@ -24,9 +29,26 @@ public class MetaSearcher extends Searcher } @Override - public List search(String query) + protected List doSearch(String query) { - throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. + List results = new ArrayList<>(); + List>> searches = new ArrayList<>(); + for (Searcher searcher: searchers) + { + CompletableFuture> search = searcher.search(query); + searches.add(search); + search.whenComplete((t, u) -> + { + if (u == null) + { + results.addAll(t); +// searches.remove(search); + } + }); + } + CompletableFuture.allOf((CompletableFuture[]) searches.toArray()); + return results; } + } diff --git a/src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java b/src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java index 41055fa..179f42f 100644 --- a/src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java +++ b/src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java @@ -5,11 +5,11 @@ */ package moe.nekojimi.musicsearcher.providers; +import com.amihaiemil.eoyaml.YamlMapping; import java.util.List; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.Future; -import java.util.concurrent.FutureTask; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import moe.nekojimi.musicsearcher.Result; @@ -20,7 +20,7 @@ import moe.nekojimi.musicsearcher.Result; */ public abstract class Searcher { - private final String name; + final String name; private final ForkJoinPool executor; public Searcher(String name) @@ -29,6 +29,17 @@ public abstract class Searcher this.executor = new ForkJoinPool(); } + public Searcher(YamlMapping yaml) + { + this(yaml.string("name")); + assert yaml.string("type").equals(this.getClass().getSimpleName()); + } + + public String getName() + { + return name; + } + public List searchAndWait(String query) throws InterruptedException, ExecutionException { try @@ -45,15 +56,19 @@ public abstract class Searcher return search(query).get(limit, unit); } - public Future> search(String query) + public CompletableFuture> search(String query) { - FutureTask> task = new FutureTask<>(() -> - { - return doSearch(query); - }); - executor.execute(task); - return task; + CompletableFuture> future = new CompletableFuture<>(); + future.completeAsync(()->doSearch(query), executor); + return future; } protected abstract List doSearch(String query); + + @Override + public String toString() { + return this.getClass().getSimpleName() + "{" + "name=" + name + '}'; + } + + } diff --git a/src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java b/src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java index 4b94682..79b736b 100644 --- a/src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java +++ b/src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java @@ -5,9 +5,21 @@ */ package moe.nekojimi.musicsearcher.providers; +import com.amihaiemil.eoyaml.YamlMapping; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLEncoder; +import java.nio.charset.Charset; import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.stream.Collectors; import moe.nekojimi.musicsearcher.Result; import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; /** * @@ -16,22 +28,123 @@ import org.jsoup.Jsoup; public class WebScraperSearcher extends Searcher { private String searchUrl; + private URL rootURL; - private String resultItem; - private String artistItem; - private String titleItem; - private String linkHrefItem; - private String albumArtistItem; + private String resultSelector; + private String resultArtistSelector; + private String resultTitleSelector; + private String resultLinkSelector; + private String resultAlbumArtistSelector; public WebScraperSearcher(String name) { super(name); } + public WebScraperSearcher(YamlMapping yaml) throws MalformedURLException + { + super(yaml); + searchUrl = yaml.string("search_url"); + rootURL = fillURL(""); + rootURL = new URL(rootURL.getProtocol(), rootURL.getHost(), ""); + resultSelector = yaml.string("result_selector"); + + YamlMapping fields = yaml.yamlMapping("result_field_selectors"); + if (fields != null) + { + resultArtistSelector = fields.string("artist"); + resultTitleSelector = fields.string("title"); + resultLinkSelector = fields.string("link"); + resultAlbumArtistSelector = fields.string("album_artist"); + } + } + @Override protected List doSearch(String query) { - Jsoup. + try + { + URL url = fillURL(query); + Document doc = Jsoup.parse(url, 10000); + System.out.println("Document from " + name + ":" + doc.html()); + Elements resultEles = doc.select(resultSelector); + return resultEles.stream() + .map((ele)->parseResultElement(ele)) + .filter((res)->res!=null) + .collect(Collectors.toList()); + } catch (IOException ex) + { + Logger.getLogger(WebScraperSearcher.class.getName()).log(Level.SEVERE, null, ex); + return List.of(); + } + } + + protected URL fillURL(String query) throws MalformedURLException + { + URL url = new URL(searchUrl.replaceAll("\\$QUERY", URLEncoder.encode(query, Charset.forName("utf-8")))); + return url; + } + + protected Result parseResultElement(Element ele) + { + try + { + Result res = new Result(); + // Artist + if (resultArtistSelector != null) + { + Element artistEle = ele.selectFirst(resultArtistSelector); + if (artistEle != null) + res.setArtist(artistEle.text()); + } + // Title + if (resultTitleSelector != null) + { + Element titleEle = ele.selectFirst(resultTitleSelector); + if (titleEle != null) + res.setTitle(titleEle.text()); + } + // Link + if (resultLinkSelector != null) + { + Element linkEle = ele.selectFirst(resultLinkSelector); + if (linkEle != null) + { + String link; + if (linkEle.hasAttr("href")) + link = linkEle.attr("href"); + else + link = linkEle.text(); + + URL url; + if(!link.startsWith("http")) + url = new URL(rootURL, link); + else + url = new URL(link); + + res.setLink(url); + } + } + + // Artist + Album + if (resultAlbumArtistSelector != null) + { + Element alArtEle = ele.selectFirst(resultAlbumArtistSelector); + if (alArtEle != null) + { + + } + } + + // Artist + Title + + return res; + } + catch (Exception ex) + { + Logger.getLogger(WebScraperSearcher.class.getName()).log(Level.SEVERE, null, ex); + return null; + } } }