Skip to content

Commit 0013139

Browse files
committed
ARD Grouping Structure test
1 parent 98a537b commit 0013139

9 files changed

Lines changed: 144 additions & 39 deletions

src/main/java/de/mediathekview/mserver/crawler/ard/ArdCrawler.java

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,13 @@ private Queue<CrawlerUrlDTO> createDayUrlsToCrawl() {
5757

5858
@Override
5959
protected RecursiveTask<Set<Film>> createCrawlerTask() {
60-
60+
ConcurrentLinkedQueue<CrawlerUrlDTO> test = new ConcurrentLinkedQueue<>();
6161
try {
6262
final ForkJoinTask<Set<ArdFilmInfoDto>> dayTask =
6363
forkJoinPool.submit(new ArdDayPageTask(this, createDayUrlsToCrawl()));
6464

6565
final Set<ArdFilmInfoDto> shows = dayTask.get();
66+
shows.clear();
6667
printMessage(
6768
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), shows.size());
6869

@@ -74,8 +75,18 @@ protected RecursiveTask<Set<Film>> createCrawlerTask() {
7475
senderTopicUrls.addAll(senderTopicTask.get());
7576
}
7677
LOG.debug("sender topic tasks: {}", senderTopicUrls.size());
78+
final ArdTopicGroupsTask groupsToAsset = new ArdTopicGroupsTask(this, new ConcurrentLinkedQueue<>(senderTopicUrls));
79+
final Set<CrawlerUrlDTO> assitUrls = new HashSet<>();
80+
assitUrls.addAll(forkJoinPool.submit(groupsToAsset).get());
81+
LOG.debug("sender group assit tasks: {}", assitUrls.size());
82+
83+
//test.add(new CrawlerUrlDTO("https://api.ardmediathek.de/page-gateway/widgets/swr/asset/Y3JpZDovL3N3ci5kZS8yNDEwMzY1MA?pageNumber=0&pageSize=48&embedded=true&seasoned=false&seasonNumber=&withAudiodescription=false&withOriginalWithSubtitle=false&withOriginalversion=false&single=false"));
84+
test.add(new CrawlerUrlDTO("https://api.ardmediathek.de/page-gateway/widgets/wdr/asset/Y3JpZDovL3dkci5kZS93ZXN0cG9s?pageNumber=0&pageSize=48&embedded=true&seasoned=false&seasonNumber=&withAudiodescription=false&withOriginalWithSubtitle=false&withOriginalversion=false&single=false"));
85+
7786
final ArdTopicPageTask topicTask =
78-
new ArdTopicPageTask(this, new ConcurrentLinkedQueue<>(senderTopicUrls));
87+
new ArdTopicPageTask(this, new ConcurrentLinkedQueue<>(assitUrls));
88+
//new ArdTopicPageTask(this, new ConcurrentLinkedQueue<>(test));
89+
7990
final int showsCountBefore = shows.size();
8091
shows.addAll(forkJoinPool.submit(topicTask).get());
8192
LOG.debug(
@@ -113,19 +124,19 @@ private Set<ForkJoinTask<Set<CrawlerUrlDTO>>> createSenderTopicTasks() {
113124
}
114125

115126
private ForkJoinTask<Set<CrawlerUrlDTO>> getTopicEntriesBySender(final String sender) throws ExecutionException, InterruptedException {
116-
Set<CrawlerUrlDTO> senderTopics = forkJoinPool.submit(
117-
new ArdTopicsTask(this, sender, createTopicsOverviewUrl(sender))).get();
127+
Set<CrawlerUrlDTO> senderSingleLetterUrls = forkJoinPool.submit(
128+
new ArdTopicsTask(this, sender, CreateLetterUrlQuery(sender))).get();
118129

119-
LOG.debug("topics task result {}", senderTopics.size());
120-
return forkJoinPool.submit(new ArdTopicsLetterTask(this, sender, new ConcurrentLinkedQueue<>(senderTopics)));
130+
LOG.debug("topics task result {}", senderSingleLetterUrls.size());
131+
return forkJoinPool.submit(new ArdTopicsLetterTask(this, sender, new ConcurrentLinkedQueue<>(senderSingleLetterUrls)));
121132
}
122133

123-
private Queue<CrawlerUrlDTO> createTopicsOverviewUrl(final String client) {
134+
private Queue<CrawlerUrlDTO> CreateLetterUrlQuery(final String client) {
124135
final Queue<CrawlerUrlDTO> urls = new ConcurrentLinkedQueue<>();
125136

126137
final String url = String.format(ArdConstants.TOPICS_URL, client);
127138
urls.add(new CrawlerUrlDTO(url));
128139

129140
return urls;
130141
}
131-
}
142+
}

src/main/java/de/mediathekview/mserver/crawler/ard/json/ArdTeasersDeserializer.java

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -56,21 +56,31 @@ private Optional<String> toId(final JsonObject teaserObject) {
5656
}
5757

5858
private ArdFilmInfoDto createFilmInfo(final String id, final int numberOfClips) {
59-
final String url = String.format(ArdConstants.ITEM_URL, id);
59+
String refId = id;
60+
if(id.contains(":")) {
61+
refId = id.replace(":", "%3A");
62+
}
63+
64+
final String url = String.format(ArdConstants.ITEM_URL, refId);
65+
66+
67+
68+
if (id.contains("a04c5a47-0801-40e5-b530-b7f9a4312be9:6898178275329995836")
69+
|| id.contains("Y3JpZDovL25kci5kZS9wcm9wbGFuXzE5NjM4MTA5N19nYW56ZVNlbmR1bmc")
70+
|| id.contains("1TDLUvc8cVEtcSb9GGsOnt:6898178275329995836")
71+
|| id.contains("6b64fc2c-4bd7-47ae-af6c-680e65b53b89")
72+
) {
73+
System.out.println("stop");
74+
}
75+
6076
return new ArdFilmInfoDto(id, url, numberOfClips);
6177
}
6278

6379
private boolean isRelevant(final JsonObject teaserObject) {
64-
if (teaserObject.has(ELEMENT_PUBLICATION_SERVICE)) {
65-
final JsonObject publicationService =
66-
teaserObject.get(ELEMENT_PUBLICATION_SERVICE).getAsJsonObject();
67-
final Optional<String> attributeAsString =
68-
JsonUtils.getAttributeAsString(publicationService, ATTRIBUTE_PARTNER);
69-
if (attributeAsString.isPresent()) {
70-
return ArdConstants.PARTNER_TO_SENDER.get(attributeAsString.get()) != null;
71-
}
80+
Optional<String> partner = JsonUtils.getElementValueAsString(teaserObject, ELEMENT_PUBLICATION_SERVICE, ATTRIBUTE_PARTNER);
81+
if (partner.isPresent()) {
82+
return ArdConstants.PARTNER_TO_SENDER.get(partner.get()) != null;
7283
}
73-
7484
return true;
7585
}
7686
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
package de.mediathekview.mserver.crawler.ard.json;
2+
3+
import com.google.gson.JsonArray;
4+
import com.google.gson.JsonDeserializationContext;
5+
import com.google.gson.JsonDeserializer;
6+
import com.google.gson.JsonElement;
7+
import de.mediathekview.mserver.base.utils.JsonUtils;
8+
import de.mediathekview.mserver.crawler.ard.ArdConstants;
9+
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
10+
import java.lang.reflect.Type;
11+
import java.util.*;
12+
13+
public class ArdTopicGroupsDeserializer implements JsonDeserializer<Set<CrawlerUrlDTO>> {
14+
private static final String ELEMENT_WIDGETS = "widgets";
15+
private static final String ELEMENT_LINKS = "links";
16+
private static final String ELEMENT_TARGET = "self";
17+
private static final String ELEMENT_HREF = "href";
18+
private final int maxPageSize = ArdConstants.TOPICS_COMPILATION_PAGE_SIZE;
19+
20+
21+
@Override
22+
public Set<CrawlerUrlDTO> deserialize(
23+
JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) {
24+
final Set<CrawlerUrlDTO> result = new HashSet<>();
25+
26+
if (JsonUtils.hasElements(jsonElement, ELEMENT_WIDGETS)) {
27+
final JsonArray widgets = jsonElement.getAsJsonObject().getAsJsonArray(ELEMENT_WIDGETS);
28+
widgets.forEach(widget -> parseWidget(widget.getAsJsonObject()).ifPresent(result::add));
29+
}
30+
31+
return result;
32+
}
33+
34+
private Optional<CrawlerUrlDTO> parseWidget(final JsonElement compilation) {
35+
Optional<String> totalElements = JsonUtils.getElementValueAsString(compilation, "pagination", "totalElements");
36+
if (totalElements.isEmpty() || totalElements.get() == null || totalElements.get().trim().length() == 0 || totalElements.get().trim().equalsIgnoreCase("0")) {
37+
return Optional.empty();
38+
}
39+
if (JsonUtils.hasElements(compilation, ELEMENT_LINKS)) {
40+
final JsonElement selfLink =
41+
compilation.getAsJsonObject().get(ELEMENT_LINKS).getAsJsonObject().get(ELEMENT_TARGET);
42+
final Optional<String> url = JsonUtils.getElementValueAsString(selfLink, ELEMENT_HREF);
43+
44+
if (url.isPresent()) {
45+
String x = url.get().replaceAll("pageSize=\\d+", "pageSize="+this.maxPageSize);
46+
return Optional.of(new CrawlerUrlDTO(x));
47+
}
48+
}
49+
50+
return Optional.empty();
51+
}
52+
}

src/main/java/de/mediathekview/mserver/crawler/ard/json/ArdTopicsLetterDeserializer.java

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -71,24 +71,10 @@ private int getJsonElementAsIntOrNullIfNotExist(final JsonElement element) {
7171
private Set<CrawlerUrlDTO> parseTeaser(final JsonObject teaserObject) {
7272
final Set<CrawlerUrlDTO> results = new HashSet<>();
7373

74-
final Optional<String> id;
75-
76-
if (JsonUtils.checkTreePath(teaserObject, null, ELEMENT_LINKS, ELEMENT_TARGET)) {
77-
final JsonObject targetObject =
78-
teaserObject.get(ELEMENT_LINKS).getAsJsonObject().get(ELEMENT_TARGET).getAsJsonObject();
79-
id = JsonUtils.getAttributeAsString(targetObject, ATTRIBUTE_ID);
80-
} else {
81-
id = JsonUtils.getAttributeAsString(teaserObject, ATTRIBUTE_ID);
82-
}
83-
if (isRelevant(teaserObject)) {
84-
id.ifPresent(
85-
nonNullId ->
86-
results.add(
87-
new CrawlerUrlDTO(
88-
String.format(
89-
ArdConstants.TOPIC_URL, nonNullId, ArdConstants.TOPIC_PAGE_SIZE))));
74+
final Optional<String> urlToGroup = JsonUtils.getElementValueAsString(teaserObject, ELEMENT_LINKS, ELEMENT_TARGET, "href");
75+
if (isRelevant(teaserObject) && urlToGroup.isPresent()) {
76+
results.add(new CrawlerUrlDTO(urlToGroup.get()));
9077
}
91-
9278
return results;
9379
}
9480

src/main/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ public Map<Resolution, URL> loadM3U8(URL m3u8File) {
121121
if (UrlUtils.getProtocol(videoUrl).isEmpty()) {
122122
videoUrl = baseUrl + videoUrl;
123123
}
124-
urls.put(resolution.get(), URI.create(videoUrl).toURL());
124+
urls.put(resolution.get(), new URL(videoUrl));
125125
} catch (final MalformedURLException malformedURLException) {
126126
LOG.error(
127127
"ArdVideoInfoJsonDeserializer: invalid url {}",

src/main/java/de/mediathekview/mserver/crawler/ard/tasks/ArdFilmDetailTask.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ protected void processRestTarget(final ArdFilmInfoDto aDTO, final WebTarget aTar
5757
taskResults.add(result);
5858

5959
if (aDTO.getNumberOfClips() > 1) {
60-
processRelatedFilms(filmDto.getRelatedFilms());
60+
//processRelatedFilms(filmDto.getRelatedFilms());
6161
}
6262
}
6363
crawler.incrementAndGetActualCount();
@@ -83,7 +83,7 @@ private void processRelatedFilms(final Set<ArdFilmInfoDto> relatedFilms) {
8383
private Optional<URL> getWebsiteUrl(final ArdFilmInfoDto aDTO) {
8484
final String url = String.format(ArdConstants.WEBSITE_URL, aDTO.getId());
8585
try {
86-
return Optional.of(URI.create(url).toURL());
86+
return Optional.of(new URL(url));
8787
} catch (final MalformedURLException e) {
8888
LOG.error(e);
8989
}

src/main/java/de/mediathekview/mserver/crawler/ard/tasks/ArdTaskBase.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ private Response executeRequest(final WebTarget aTarget) {
101101
}
102102

103103
return request
104+
.header("Accept-Encoding", "br, gzip, deflate, zstd")
104105
.header(HEADER_ACCEPT, APPLICATION_JSON)
105106
.header(HEADER_CONTENT_TYPE, APPLICATION_JSON)
106107
.get();
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
package de.mediathekview.mserver.crawler.ard.tasks;
2+
3+
import com.google.gson.reflect.TypeToken;
4+
import de.mediathekview.mserver.crawler.ard.PaginationUrlDto;
5+
import de.mediathekview.mserver.crawler.ard.json.ArdTopicGroupsDeserializer;
6+
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
7+
import de.mediathekview.mserver.crawler.basic.AbstractRecursiveConverterTask;
8+
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
9+
import jakarta.ws.rs.client.WebTarget;
10+
import java.lang.reflect.Type;
11+
import java.util.HashSet;
12+
import java.util.Queue;
13+
import org.apache.logging.log4j.LogManager;
14+
import org.apache.logging.log4j.Logger;
15+
16+
public class ArdTopicGroupsTask extends ArdTaskBase<CrawlerUrlDTO, CrawlerUrlDTO> {
17+
private static final Logger LOG = LogManager.getLogger(ArdTopicGroupsTask.class);
18+
private static final String PAGE_NUMBER = "pageNumber";
19+
private static final String URL_PAGE_NUMBER_REPLACE_REGEX = PAGE_NUMBER + "=\\d+";
20+
private static final String PAGE_NUMBER_URL_ENCODED = PAGE_NUMBER + "=";
21+
private static final Type DTO_TYPE_TOKEN =
22+
new TypeToken<HashSet<CrawlerUrlDTO>>() {}.getType();
23+
24+
public ArdTopicGroupsTask(
25+
final AbstractCrawler crawler,
26+
final Queue<CrawlerUrlDTO> urlToCrawlDtos) {
27+
super(crawler, urlToCrawlDtos);
28+
registerJsonDeserializer(DTO_TYPE_TOKEN, new ArdTopicGroupsDeserializer());
29+
}
30+
31+
@Override
32+
protected AbstractRecursiveConverterTask<CrawlerUrlDTO, CrawlerUrlDTO> createNewOwnInstance(
33+
final Queue<CrawlerUrlDTO> aElementsToProcess) {
34+
return new ArdTopicGroupsTask(crawler, aElementsToProcess);
35+
}
36+
37+
@Override
38+
protected void processRestTarget(final CrawlerUrlDTO aDTO, final WebTarget aTarget) {
39+
taskResults.addAll(deserialize(aTarget, DTO_TYPE_TOKEN, aDTO));
40+
}
41+
42+
43+
}

src/main/java/de/mediathekview/mserver/crawler/ard/tasks/ArdTopicPageTask.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,9 @@ private String changePageNumber(final WebTarget aTarget, final int newPageNumber
7878
.getUri()
7979
.getRawQuery()
8080
.replaceAll(
81-
URL_PAGE_NUMBER_REPLACE_REGEX, PAGE_NUMBER_URL_ENCODED + newPageNumber))
81+
URL_PAGE_NUMBER_REPLACE_REGEX, PAGE_NUMBER_URL_ENCODED + newPageNumber)
82+
.replaceAll(
83+
"pageNumber=\\d+", "pageNumber=" + newPageNumber))
8284
.build()
8385
.toString()
8486
: aTarget.queryParam(PAGE_NUMBER, newPageNumber).getUri().toString();

0 commit comments

Comments
 (0)