Skip to content

Commit b0d7936

Browse files
committed
Merge branch 'feature/ArdGroupStructure' into develop3
2 parents 582ca85 + 0013139 commit b0d7936

9 files changed

Lines changed: 144 additions & 39 deletions

src/main/java/de/mediathekview/mserver/crawler/ard/ArdCrawler.java

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,12 +51,13 @@ private Queue<CrawlerUrlDTO> createDayUrlsToCrawl() {
5151

5252
@Override
5353
protected RecursiveTask<Set<Film>> createCrawlerTask() {
54-
54+
ConcurrentLinkedQueue<CrawlerUrlDTO> test = new ConcurrentLinkedQueue<>();
5555
try {
5656
final ForkJoinTask<Set<ArdFilmInfoDto>> dayTask =
5757
forkJoinPool.submit(new ArdDayPageTask(this, createDayUrlsToCrawl()));
5858

5959
final Set<ArdFilmInfoDto> shows = dayTask.get();
60+
shows.clear();
6061
printMessage(
6162
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), shows.size());
6263

@@ -68,8 +69,18 @@ protected RecursiveTask<Set<Film>> createCrawlerTask() {
6869
senderTopicUrls.addAll(senderTopicTask.get());
6970
}
7071
LOG.debug("sender topic tasks: {}", senderTopicUrls.size());
72+
final ArdTopicGroupsTask groupsToAsset = new ArdTopicGroupsTask(this, new ConcurrentLinkedQueue<>(senderTopicUrls));
73+
final Set<CrawlerUrlDTO> assitUrls = new HashSet<>();
74+
assitUrls.addAll(forkJoinPool.submit(groupsToAsset).get());
75+
LOG.debug("sender group assit tasks: {}", assitUrls.size());
76+
77+
//test.add(new CrawlerUrlDTO("https://api.ardmediathek.de/page-gateway/widgets/swr/asset/Y3JpZDovL3N3ci5kZS8yNDEwMzY1MA?pageNumber=0&pageSize=48&embedded=true&seasoned=false&seasonNumber=&withAudiodescription=false&withOriginalWithSubtitle=false&withOriginalversion=false&single=false"));
78+
test.add(new CrawlerUrlDTO("https://api.ardmediathek.de/page-gateway/widgets/wdr/asset/Y3JpZDovL3dkci5kZS93ZXN0cG9s?pageNumber=0&pageSize=48&embedded=true&seasoned=false&seasonNumber=&withAudiodescription=false&withOriginalWithSubtitle=false&withOriginalversion=false&single=false"));
79+
7180
final ArdTopicPageTask topicTask =
72-
new ArdTopicPageTask(this, new ConcurrentLinkedQueue<>(senderTopicUrls));
81+
new ArdTopicPageTask(this, new ConcurrentLinkedQueue<>(assitUrls));
82+
//new ArdTopicPageTask(this, new ConcurrentLinkedQueue<>(test));
83+
7384
final int showsCountBefore = shows.size();
7485
shows.addAll(forkJoinPool.submit(topicTask).get());
7586
LOG.debug(
@@ -109,19 +120,19 @@ private Set<ForkJoinTask<Set<CrawlerUrlDTO>>> createSenderTopicTasks() {
109120
}
110121

111122
private ForkJoinTask<Set<CrawlerUrlDTO>> getTopicEntriesBySender(final String sender) throws ExecutionException, InterruptedException {
112-
Set<CrawlerUrlDTO> senderTopics = forkJoinPool.submit(
113-
new ArdTopicsTask(this, sender, createTopicsOverviewUrl(sender))).get();
123+
Set<CrawlerUrlDTO> senderSingleLetterUrls = forkJoinPool.submit(
124+
new ArdTopicsTask(this, sender, CreateLetterUrlQuery(sender))).get();
114125

115-
LOG.debug("topics task result {}", senderTopics.size());
116-
return forkJoinPool.submit(new ArdTopicsLetterTask(this, sender, new ConcurrentLinkedQueue<>(senderTopics)));
126+
LOG.debug("topics task result {}", senderSingleLetterUrls.size());
127+
return forkJoinPool.submit(new ArdTopicsLetterTask(this, sender, new ConcurrentLinkedQueue<>(senderSingleLetterUrls)));
117128
}
118129

119-
private Queue<CrawlerUrlDTO> createTopicsOverviewUrl(final String client) {
130+
private Queue<CrawlerUrlDTO> CreateLetterUrlQuery(final String client) {
120131
final Queue<CrawlerUrlDTO> urls = new ConcurrentLinkedQueue<>();
121132

122133
final String url = String.format(ArdConstants.TOPICS_URL, client);
123134
urls.add(new CrawlerUrlDTO(url));
124135

125136
return urls;
126137
}
127-
}
138+
}

src/main/java/de/mediathekview/mserver/crawler/ard/json/ArdTeasersDeserializer.java

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -56,21 +56,31 @@ private Optional<String> toId(final JsonObject teaserObject) {
5656
}
5757

5858
private ArdFilmInfoDto createFilmInfo(final String id, final int numberOfClips) {
59-
final String url = String.format(ArdConstants.ITEM_URL, id);
59+
String refId = id;
60+
if(id.contains(":")) {
61+
refId = id.replace(":", "%3A");
62+
}
63+
64+
final String url = String.format(ArdConstants.ITEM_URL, refId);
65+
66+
67+
68+
if (id.contains("a04c5a47-0801-40e5-b530-b7f9a4312be9:6898178275329995836")
69+
|| id.contains("Y3JpZDovL25kci5kZS9wcm9wbGFuXzE5NjM4MTA5N19nYW56ZVNlbmR1bmc")
70+
|| id.contains("1TDLUvc8cVEtcSb9GGsOnt:6898178275329995836")
71+
|| id.contains("6b64fc2c-4bd7-47ae-af6c-680e65b53b89")
72+
) {
73+
System.out.println("stop");
74+
}
75+
6076
return new ArdFilmInfoDto(id, url, numberOfClips);
6177
}
6278

6379
private boolean isRelevant(final JsonObject teaserObject) {
64-
if (teaserObject.has(ELEMENT_PUBLICATION_SERVICE)) {
65-
final JsonObject publicationService =
66-
teaserObject.get(ELEMENT_PUBLICATION_SERVICE).getAsJsonObject();
67-
final Optional<String> attributeAsString =
68-
JsonUtils.getAttributeAsString(publicationService, ATTRIBUTE_PARTNER);
69-
if (attributeAsString.isPresent()) {
70-
return ArdConstants.PARTNER_TO_SENDER.get(attributeAsString.get()) != null;
71-
}
80+
Optional<String> partner = JsonUtils.getElementValueAsString(teaserObject, ELEMENT_PUBLICATION_SERVICE, ATTRIBUTE_PARTNER);
81+
if (partner.isPresent()) {
82+
return ArdConstants.PARTNER_TO_SENDER.get(partner.get()) != null;
7283
}
73-
7484
return true;
7585
}
7686
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
package de.mediathekview.mserver.crawler.ard.json;
2+
3+
import com.google.gson.JsonArray;
4+
import com.google.gson.JsonDeserializationContext;
5+
import com.google.gson.JsonDeserializer;
6+
import com.google.gson.JsonElement;
7+
import de.mediathekview.mserver.base.utils.JsonUtils;
8+
import de.mediathekview.mserver.crawler.ard.ArdConstants;
9+
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
10+
import java.lang.reflect.Type;
11+
import java.util.*;
12+
13+
public class ArdTopicGroupsDeserializer implements JsonDeserializer<Set<CrawlerUrlDTO>> {
14+
private static final String ELEMENT_WIDGETS = "widgets";
15+
private static final String ELEMENT_LINKS = "links";
16+
private static final String ELEMENT_TARGET = "self";
17+
private static final String ELEMENT_HREF = "href";
18+
private final int maxPageSize = ArdConstants.TOPICS_COMPILATION_PAGE_SIZE;
19+
20+
21+
@Override
22+
public Set<CrawlerUrlDTO> deserialize(
23+
JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) {
24+
final Set<CrawlerUrlDTO> result = new HashSet<>();
25+
26+
if (JsonUtils.hasElements(jsonElement, ELEMENT_WIDGETS)) {
27+
final JsonArray widgets = jsonElement.getAsJsonObject().getAsJsonArray(ELEMENT_WIDGETS);
28+
widgets.forEach(widget -> parseWidget(widget.getAsJsonObject()).ifPresent(result::add));
29+
}
30+
31+
return result;
32+
}
33+
34+
private Optional<CrawlerUrlDTO> parseWidget(final JsonElement compilation) {
35+
Optional<String> totalElements = JsonUtils.getElementValueAsString(compilation, "pagination", "totalElements");
36+
if (totalElements.isEmpty() || totalElements.get() == null || totalElements.get().trim().length() == 0 || totalElements.get().trim().equalsIgnoreCase("0")) {
37+
return Optional.empty();
38+
}
39+
if (JsonUtils.hasElements(compilation, ELEMENT_LINKS)) {
40+
final JsonElement selfLink =
41+
compilation.getAsJsonObject().get(ELEMENT_LINKS).getAsJsonObject().get(ELEMENT_TARGET);
42+
final Optional<String> url = JsonUtils.getElementValueAsString(selfLink, ELEMENT_HREF);
43+
44+
if (url.isPresent()) {
45+
String x = url.get().replaceAll("pageSize=\\d+", "pageSize="+this.maxPageSize);
46+
return Optional.of(new CrawlerUrlDTO(x));
47+
}
48+
}
49+
50+
return Optional.empty();
51+
}
52+
}

src/main/java/de/mediathekview/mserver/crawler/ard/json/ArdTopicsLetterDeserializer.java

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -71,24 +71,10 @@ private int getJsonElementAsIntOrNullIfNotExist(final JsonElement element) {
7171
private Set<CrawlerUrlDTO> parseTeaser(final JsonObject teaserObject) {
7272
final Set<CrawlerUrlDTO> results = new HashSet<>();
7373

74-
final Optional<String> id;
75-
76-
if (JsonUtils.checkTreePath(teaserObject, null, ELEMENT_LINKS, ELEMENT_TARGET)) {
77-
final JsonObject targetObject =
78-
teaserObject.get(ELEMENT_LINKS).getAsJsonObject().get(ELEMENT_TARGET).getAsJsonObject();
79-
id = JsonUtils.getAttributeAsString(targetObject, ATTRIBUTE_ID);
80-
} else {
81-
id = JsonUtils.getAttributeAsString(teaserObject, ATTRIBUTE_ID);
82-
}
83-
if (isRelevant(teaserObject)) {
84-
id.ifPresent(
85-
nonNullId ->
86-
results.add(
87-
new CrawlerUrlDTO(
88-
String.format(
89-
ArdConstants.TOPIC_URL, nonNullId, ArdConstants.TOPIC_PAGE_SIZE))));
74+
final Optional<String> urlToGroup = JsonUtils.getElementValueAsString(teaserObject, ELEMENT_LINKS, ELEMENT_TARGET, "href");
75+
if (isRelevant(teaserObject) && urlToGroup.isPresent()) {
76+
results.add(new CrawlerUrlDTO(urlToGroup.get()));
9077
}
91-
9278
return results;
9379
}
9480

src/main/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ public Map<Resolution, URL> loadM3U8(URL m3u8File) {
121121
if (UrlUtils.getProtocol(videoUrl).isEmpty()) {
122122
videoUrl = baseUrl + videoUrl;
123123
}
124-
urls.put(resolution.get(), URI.create(videoUrl).toURL());
124+
urls.put(resolution.get(), new URL(videoUrl));
125125
} catch (final MalformedURLException malformedURLException) {
126126
LOG.error(
127127
"ArdVideoInfoJsonDeserializer: invalid url {}",

src/main/java/de/mediathekview/mserver/crawler/ard/tasks/ArdFilmDetailTask.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ protected void processRestTarget(final ArdFilmInfoDto aDTO, final WebTarget aTar
5757
taskResults.add(result);
5858

5959
if (aDTO.getNumberOfClips() > 1) {
60-
processRelatedFilms(filmDto.getRelatedFilms());
60+
//processRelatedFilms(filmDto.getRelatedFilms());
6161
}
6262
}
6363
crawler.incrementAndGetActualCount();
@@ -83,7 +83,7 @@ private void processRelatedFilms(final Set<ArdFilmInfoDto> relatedFilms) {
8383
private Optional<URL> getWebsiteUrl(final ArdFilmInfoDto aDTO) {
8484
final String url = String.format(ArdConstants.WEBSITE_URL, aDTO.getId());
8585
try {
86-
return Optional.of(URI.create(url).toURL());
86+
return Optional.of(new URL(url));
8787
} catch (final MalformedURLException e) {
8888
LOG.error(e);
8989
}

src/main/java/de/mediathekview/mserver/crawler/ard/tasks/ArdTaskBase.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ private Response executeRequest(final WebTarget aTarget) {
101101
}
102102

103103
return request
104+
.header("Accept-Encoding", "br, gzip, deflate, zstd")
104105
.header(HEADER_ACCEPT, APPLICATION_JSON)
105106
.header(HEADER_CONTENT_TYPE, APPLICATION_JSON)
106107
.get();
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
package de.mediathekview.mserver.crawler.ard.tasks;
2+
3+
import com.google.gson.reflect.TypeToken;
4+
import de.mediathekview.mserver.crawler.ard.PaginationUrlDto;
5+
import de.mediathekview.mserver.crawler.ard.json.ArdTopicGroupsDeserializer;
6+
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
7+
import de.mediathekview.mserver.crawler.basic.AbstractRecursiveConverterTask;
8+
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
9+
import jakarta.ws.rs.client.WebTarget;
10+
import java.lang.reflect.Type;
11+
import java.util.HashSet;
12+
import java.util.Queue;
13+
import org.apache.logging.log4j.LogManager;
14+
import org.apache.logging.log4j.Logger;
15+
16+
public class ArdTopicGroupsTask extends ArdTaskBase<CrawlerUrlDTO, CrawlerUrlDTO> {
17+
private static final Logger LOG = LogManager.getLogger(ArdTopicGroupsTask.class);
18+
private static final String PAGE_NUMBER = "pageNumber";
19+
private static final String URL_PAGE_NUMBER_REPLACE_REGEX = PAGE_NUMBER + "=\\d+";
20+
private static final String PAGE_NUMBER_URL_ENCODED = PAGE_NUMBER + "=";
21+
private static final Type DTO_TYPE_TOKEN =
22+
new TypeToken<HashSet<CrawlerUrlDTO>>() {}.getType();
23+
24+
public ArdTopicGroupsTask(
25+
final AbstractCrawler crawler,
26+
final Queue<CrawlerUrlDTO> urlToCrawlDtos) {
27+
super(crawler, urlToCrawlDtos);
28+
registerJsonDeserializer(DTO_TYPE_TOKEN, new ArdTopicGroupsDeserializer());
29+
}
30+
31+
@Override
32+
protected AbstractRecursiveConverterTask<CrawlerUrlDTO, CrawlerUrlDTO> createNewOwnInstance(
33+
final Queue<CrawlerUrlDTO> aElementsToProcess) {
34+
return new ArdTopicGroupsTask(crawler, aElementsToProcess);
35+
}
36+
37+
@Override
38+
protected void processRestTarget(final CrawlerUrlDTO aDTO, final WebTarget aTarget) {
39+
taskResults.addAll(deserialize(aTarget, DTO_TYPE_TOKEN, aDTO));
40+
}
41+
42+
43+
}

src/main/java/de/mediathekview/mserver/crawler/ard/tasks/ArdTopicPageTask.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,9 @@ private String changePageNumber(final WebTarget aTarget, final int newPageNumber
7878
.getUri()
7979
.getRawQuery()
8080
.replaceAll(
81-
URL_PAGE_NUMBER_REPLACE_REGEX, PAGE_NUMBER_URL_ENCODED + newPageNumber))
81+
URL_PAGE_NUMBER_REPLACE_REGEX, PAGE_NUMBER_URL_ENCODED + newPageNumber)
82+
.replaceAll(
83+
"pageNumber=\\d+", "pageNumber=" + newPageNumber))
8284
.build()
8385
.toString()
8486
: aTarget.queryParam(PAGE_NUMBER, newPageNumber).getUri().toString();

0 commit comments

Comments
 (0)