Skip to content

Commit 582ca85

Browse files
committed
zdf partner2sender, ard generate urls, checkUrlAv
1 parent 45c8d35 commit 582ca85

15 files changed

Lines changed: 796 additions & 45 deletions

File tree

MServer-Config.yaml

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ databaseConfig:
66
url: "jdbc:postgresql://localhost:55432/crawler"
77
username: "crawler"
88
password: "secret"
9+
refreshIntervalInDays: 7
10+
checkUrlIntervalInDays: 3
11+
batchSize: 2000
912

1013
# The maximum number of CPU threads to be used.
1114
maximumCpuThreads: 10
@@ -24,7 +27,7 @@ maximumRequestsPerSecond: 999.0
2427

2528
# If set, only these senders will be crawled; all others will be ignored.
2629
senderIncluded:
27-
#- ARD
30+
- ARD
2831
#- ARTE_DE
2932
#- ARTE_FR
3033
#- ARTE_PL
@@ -37,7 +40,7 @@ senderIncluded:
3740
#- ORF
3841
#- PHOENIX
3942
#- SRF
40-
- SR
43+
#- SR
4144
#- ZDF
4245

4346
#SRF,SR,PHOENIX,ORF,KIKA,DW,3SAT
@@ -105,8 +108,8 @@ filmlistIdFilePath: target/filmlists/filmlist.id.xx
105108

106109
# import additional filmlist sources
107110
importFilmlistConfigurations :
108-
- active: false
109-
path: "someCrawlerlist.json"
111+
- active: true
112+
path: "Filmliste-akt"
110113
format: OLD_JSON
111114
createDiff: false
112115
checkImportListUrl: false
@@ -157,7 +160,7 @@ maximumSubpages: 5
157160
maximumDaysForSendungVerpasstSection: 7
158161

159162
# The maximum number of days into the future that will be crawled for the "Sendung Verpasst?" section.
160-
maximumDaysForSendungVerpasstSectionFuture: 0
163+
maximumDaysForSendungVerpasstSectionFuture: 3
161164

162165
# The time in seconds before a socket connection should time out.
163166
socketTimeoutInSeconds: 60
@@ -167,11 +170,12 @@ socketTimeoutInSeconds: 60
167170
senderConfigurations:
168171
ARD:
169172
# Actually the ARD has a maximum of 6 days in the past
170-
maximumDaysForSendungVerpasstSection: 1
173+
maximumDaysForSendungVerpasstSection: 6
174+
maximumDaysForSendungVerpasstSectionFuture: 6
171175
#2,4,8 ok
172176
maximumUrlsPerTask: 32
173177
#10,20,40 ok
174-
maximumSubpages: 0
178+
maximumSubpages: 40
175179
ORF:
176180
maximumRequestsPerSecond: 10.0
177181
ARTE_DE:
@@ -195,12 +199,13 @@ senderConfigurations:
195199
FUNK:
196200
maximumUrlsPerTask: 99
197201
DREISAT:
198-
maximumSubpages: 5
199-
maximumDaysForSendungVerpasstSection: 60
202+
maximumSubpages: 15
203+
maximumDaysForSendungVerpasstSection: 30
204+
maximumDaysForSendungVerpasstSectionFuture: 20
200205
PHOENIX:
201206
maximumSubpages: 500
202207
SRF:
203-
maximumSubpages: 1
208+
maximumSubpages: 25
204209

205210
#### COPY ####
206211
copySettings:
@@ -258,4 +263,4 @@ logSettings:
258263

259264
# The pattern of the file name of the archived log files.
260265
# See: https://logging.apache.org/log4j/2.0/manual/appenders.html#RollingFileAppender
261-
logFileRollingPattern: logs/${date:yyyy-MM}/server-%d{MM-dd-yyyy}-%i.log
266+
logFileRollingPattern: logs/${date:yyyy-MM}/server-%d{MM-dd-yyyy-HH}-%i.log

src/main/docker/runDocker

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,5 @@ docker compose run -d --rm -e MSERVER_OPTS="--config /config/MServer-Config-R2.y
1515
# on demand - do not run this unless you know what you are doing!
1616
docker compose run -d --rm -e MSERVER_OPTS="--config /config/MServer-Config-R2.yaml --flow importFilmlistIntoDB" mserver-r3
1717

18-
18+
## docker save -o mserver.tar mediathekview/mserver:4.0.1-SNAPSHOT
19+
## docker load -i mserver.tar

src/main/java/de/mediathekview/mserver/base/utils/FilmDBService.java

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,49 @@ public void update(String sql) {
6666
}
6767
}
6868

69+
/////////////////////////////////////////////////////////////////////////////////////////
70+
/////////////////////////////////////////////////////////////////////////////////////////
71+
72+
public void updateLastUrlCheck(List<Film> checked) {
73+
try {
74+
AtomicInteger updateCounter = new AtomicInteger(0);
75+
List<Future<?>> futures = new ArrayList<>();
76+
List<Film> allVideos = checked.stream()
77+
.sorted(Comparator.comparing(Film::getId))
78+
.toList();
79+
for (int i = 0; i < allVideos.size(); i += batchSize) {
80+
int from = i;
81+
int to = Math.min(i + batchSize, allVideos.size());
82+
List<Film> batch = allVideos.subList(from, to);
83+
futures.add(executorService.submit(() -> {
84+
String sql = "UPDATE filme SET last_url_check = NOW() WHERE id = ?";
85+
try (Connection con = dataSource.getConnection(); PreparedStatement ps = con.prepareStatement(sql)) {
86+
for (Film video : batch) {
87+
ps.setString(1, video.getId());
88+
ps.addBatch();
89+
}
90+
int [] rs = ps.executeBatch();
91+
for (int rsCode : rs) {
92+
updateCounter.addAndGet(rsCode);
93+
}
94+
} catch (SQLException e) {
95+
LOG.error(e);
96+
}
97+
}));
98+
}
99+
futures.forEach( f -> {try { f.get(); } catch(Exception e) { LOG.error("{}",e); }});
100+
LOG.debug("updated lastUrlCheck {}", updateCounter.get());
101+
} catch (Exception e) {
102+
LOG.error(e);
103+
}
104+
}
105+
69106
/////////////////////////////////////////////////////////////////////////////////////////
70107
/////////////////////////////////////////////////////////////////////////////////////////
71108

72109
public void deleteFilms(Collection<Film> abandonedFilmlist) {
73110
try {
74-
List<Future<List<Film>>> futures = new ArrayList<>();
111+
List<Future<?>> futures = new ArrayList<>();
75112
List<Film> allVideos = abandonedFilmlist.stream()
76113
.sorted(Comparator.comparing(Film::getId))
77114
.toList();
@@ -80,7 +117,6 @@ public void deleteFilms(Collection<Film> abandonedFilmlist) {
80117
int to = Math.min(i + batchSize, allVideos.size());
81118
List<Film> batch = allVideos.subList(from, to);
82119
futures.add(executorService.submit(() -> {
83-
List<Film> newVideos = new ArrayList<>();
84120
String sql = "DELETE FROM filme WHERE id = ?";
85121
try (Connection con = dataSource.getConnection(); PreparedStatement ps = con.prepareStatement(sql)) {
86122
for (Film video : batch) {
@@ -91,13 +127,9 @@ public void deleteFilms(Collection<Film> abandonedFilmlist) {
91127
} catch (SQLException e) {
92128
LOG.error(e);
93129
}
94-
return newVideos;
95130
}));
96131
}
97-
List<Film> result = new ArrayList<>();
98-
for (Future<List<Film>> f : futures) {
99-
result.addAll(f.get());
100-
}
132+
futures.forEach( f -> {try { f.get(); } catch(Exception e) { LOG.error("{}",e); }});
101133
LOG.debug("deleted {}", abandonedFilmlist.size());
102134

103135
} catch (Exception e) {
@@ -157,13 +189,10 @@ public <T> List<T> filterNewVideos(Sender sender, List<T> videos, Function<T, St
157189
List<T> newVideos = new ArrayList<>();
158190
StringBuffer sql = new StringBuffer();
159191
sql.append("UPDATE filme SET last_seen = now() ")
160-
.append("WHERE id = ? AND (")
161-
.append("( cast(created_at as date) = cast(last_update as date) and cast(created_at as date) <> cast(now() as date) )")
162-
.append(" OR ")
163-
.append("(last_seen - last_update <= interval '").append(refreshIntervalInDays).append("' DAY)")
164-
.append(")");
192+
.append("WHERE id = ? ")
193+
.append("AND NOT( created_at::date = last_update::date and last_update::date <> CURRENT_DATE ) ")
194+
.append("AND NOT( last_seen - last_update >= interval '").append(refreshIntervalInDays).append("' DAY)");
165195
try (Connection con = dataSource.getConnection(); PreparedStatement ps = con.prepareStatement(sql.toString())) {
166-
167196
for (T video : batch) {
168197
String id = idExtractor.apply(video);
169198
if (id != null) {

src/main/java/de/mediathekview/mserver/crawler/ard/ArdConstants.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ public class ArdConstants {
1919
public static final String DAY_PAGE_URL = "https://programm-api.ard.de/program/api/program?day=%s&channelIds=%s&mode=channel";
2020

2121
public static final int TOPICS_COMPILATION_PAGE_SIZE = 200;
22-
public static final int TOPIC_PAGE_SIZE = 50;
22+
public static final int TOPIC_PAGE_SIZE = 200;
2323

2424
public static final String DEFAULT_CLIENT = "ard";
2525

0 commit comments

Comments
 (0)