-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy patharxiv.py
More file actions
executable file
·67 lines (57 loc) · 1.95 KB
/
arxiv.py
File metadata and controls
executable file
·67 lines (57 loc) · 1.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
###############################################################################
# Filter the arXiv daily title/abstract distribution based on keywords of #
# interest. #
###############################################################################
from sys import stdin
# Phrases of interest.  A paper is kept when any of them occurs in its text;
# matching is a case-insensitive substring test (see ``ArxivMail.filter``).
# NOTE: every entry needs its own trailing comma -- adjacent string literals
# without one are silently concatenated into a single, never-matching keyword.
KEYWORDS = [
    "code generation",
    "document understanding",
    "graphics program",
    "inverse graphics",
    "procedural material",
    "program synthesis",
    "LaTeX",
    "vector graphics",
    "LMM",
    "MLLM",
    "multimodal",
    "optical character recognition",
    "perceptual similarity",
    "poetry",
    "scientific document",
    "scientific figure",
    "TikZ",
    "vectorization",
    "vision language model",
    "VLLM",
    "VLM",
]
class ArxivMail:
    """An arXiv digest mail split into a header and a list of paper entries.

    The raw text is cut at the *last* double rule (``header_sep``): everything
    before it is the header, everything after it is the body.  A trailing
    ``footer`` line is removed from the body, which is then split into
    individual entries on the single rule ``sep``.

    Attributes:
        header: Text preceding the last double rule (``""`` if there is none).
        papers: List of entry strings, in their original order.
        filtered: Number of entries removed so far by ``filter``.
    """

    # Single rule between entries, double rule between header and body, and
    # the marker line that terminates the body.
    sep = 78 * "-" + "\n"
    header_sep = 2 * sep
    footer = 13 * "%%%---"

    def __init__(self, mail=None, header=None, papers=None, filtered=0):
        """Build a digest from raw text and/or already-parsed parts.

        Args:
            mail: Raw digest text to parse, or ``None`` to skip parsing.
            header: Header text; when truthy it overrides the parsed header.
            papers: List of entries; when non-empty it overrides the parsed
                entries.
            filtered: Initial count of entries that were already removed.
        """
        self.filtered = filtered
        # Start from empty defaults so an instance built without ``mail``
        # (e.g. the result of a filter that matched nothing) is still fully
        # initialised instead of missing its attributes.
        self.header = ""
        self.papers = []
        if mail is not None:
            self.header, _, body = mail.rpartition(self.header_sep)
            self.papers = self._strip_footer(body).split(self.sep)
        self.header = header or self.header
        self.papers = papers or self.papers

    def _strip_footer(self, body):
        """Return *body* without its trailing footer line, if it has one.

        Only the exact footer string is removed (``str.rstrip`` would treat it
        as a set of characters and also eat trailing ``-``/``%`` of the last
        entry).  Whitespace after the footer, such as a final newline, is
        tolerated and removed together with it; a body that does not end with
        the footer is returned unchanged.
        """
        trimmed = body.rstrip()
        if trimmed.endswith(self.footer):
            return trimmed[: -len(self.footer)]
        return body

    def filter(self, keywords):
        """Return a new digest holding only entries that mention a keyword.

        Matching is a case-insensitive substring test performed after every
        run of whitespace in the entry has been collapsed to a single space,
        so a multi-word keyword still matches across a line break.

        Args:
            keywords: Iterable of keyword strings.

        Returns:
            A new ``ArxivMail`` with the same header, the matching entries,
            and ``filtered`` increased by the number of entries dropped.
        """
        # Lowercase the keywords once; this also makes one-shot iterables
        # safe to reuse for every entry.
        needles = [key.lower() for key in keywords]
        kept = []
        for paper in self.papers:
            haystack = " ".join(paper.split()).lower()
            if any(needle in haystack for needle in needles):
                kept.append(paper)
        return ArxivMail(
            header=self.header,
            papers=kept,
            filtered=self.filtered + len(self.papers) - len(kept),
        )

    def __repr__(self):
        """Return the digest as text.

        Layout: header, a rule, a ``Filtered N papers.`` line, the double
        rule, the entries joined by single rules, and finally the footer.
        """
        papers = self.sep.join(self.papers)
        header = self.sep.join([self.header, f"Filtered {self.filtered} papers.\n"])
        return "".join([header, self.header_sep, papers, self.footer])
if __name__ == "__main__":
    # Read the complete digest from standard input, keep only the entries
    # that mention one of KEYWORDS, and write the result to standard output.
    digest = ArxivMail(stdin.read())
    print(digest.filter(KEYWORDS))