Skip to content

Commit 45a3677

Browse files
authored
[API] QuantumEvolutionKernel.__call__ now computes a matrix rather than a single distance (#31)
1 parent 405b50b commit 45a3677

4 files changed

Lines changed: 188 additions & 489 deletions

File tree

.pre-commit-config.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ repos:
77
- id: check-yaml
88
- id: check-added-large-files
99

10+
# Reformat source code.
1011
- repo: https://github.com/ambv/black
1112
rev: 24.8.0
1213
hooks:
@@ -23,3 +24,9 @@ repos:
2324
hooks:
2425
- id: mypy
2526
exclude: examples|docs
27+
28+
# Cleanup jupyter notebooks
29+
- repo: https://github.com/kynan/nbstripout
30+
rev: 0.5.0
31+
hooks:
32+
- id: nbstripout

examples/tutorial.ipynb

Lines changed: 42 additions & 443 deletions
Large diffs are not rendered by default.

qek/kernel/kernel.py

Lines changed: 105 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from collections.abc import Sequence
77

88
import numpy as np
9+
from numpy.typing import NDArray
910
from scipy.spatial.distance import jensenshannon
1011

1112
from qek.data.dataset import ProcessedData
@@ -15,26 +16,108 @@ class QuantumEvolutionKernel:
1516
"""QuantumEvolutionKernel class.
1617
1718
Attributes:
18-
- params (dict): Dictionary of training parameters.
19-
- X (Sequence[ProcessedData]): Training data used for fitting the kernel
19+
- params (dict): Dictionary of training parameters. As of this writing, the only
20+
training parameter is "mu", the scaling factor for the Jensen-Shannon divergence.
21+
- X (Sequence[ProcessedData]): Training data used for fitting the kernel.
2022
- kernel_matrix (np.ndarray): Kernel matrix. This is assigned in the `fit()` method
2123
2224
2325
"""
2426

25-
def __init__(self, mu: float, size_max: int | None = None):
    """Create a new QuantumEvolutionKernel.

    Args:
        mu (float): Scaling factor applied to the Jensen-Shannon divergence.
        size_max (int, optional): When given, restrict bitstrings to their
            first `size_max` qubits; when `None`, every qubit is taken into
            account. Restricting trades precision for speed.
    """
    # Both training parameters live in a single dict so that get_params /
    # set_params can expose them uniformly.
    self.params: dict[str, Any] = {"mu": mu, "size_max": size_max}
    # Declared here, assigned by fit(): the training data and its kernel matrix.
    self.X: Sequence[ProcessedData]
    self.kernel_matrix: np.ndarray

3543
def __call__(
36-
self, graph_1: ProcessedData, graph_2: ProcessedData, size_max: int | None = None
37-
) -> float:
44+
self,
45+
X1: Sequence[ProcessedData],
46+
X2: Sequence[ProcessedData] | None = None,
47+
) -> NDArray[np.floating]:
48+
"""Compute a kernel matrix from two sequences of processed data.
49+
50+
This method computes a M x N kernel matrix from the Jensen-Shannon divergences
51+
between all pairs of graphs in the two datasets. The resulting matrix can be used
52+
as a similarity metric for machine learning algorithms.
53+
54+
If `X1` and `X2` are two sequences representing the processed data for a
55+
single graph each, the resulting matrix can be used as a measure of similarity
56+
between both graphs.
57+
58+
Args:
59+
X1: processed data to be used as rows.
60+
X2 (optional): processed data to be used as columns. If unspecified, use X1
61+
as both rows and columns.
62+
Returns:
63+
np.ndarray: A len(X1) x len(X2) matrix where entry[i, j] represents the
64+
similarity between rows[i] and columns[j], scaled by a factor that depends
65+
on mu.
66+
Notes:
67+
The JSD is computed using the jensenshannon function from
68+
`scipy.spatial.distance`, and it is squared because scipy function
69+
`jensenshannon` outputs the distance instead of the divergence.
70+
"""
71+
# If size is not specified, set it to the length of the largest bitstring.
72+
size_max = self.params["size_max"]
73+
if size_max is None:
74+
if X2 is None:
75+
# No need to walk the same source twice.
76+
sources = [X1]
77+
else:
78+
sources = [X1, X2]
79+
for source in sources:
80+
for data in source:
81+
length = len(data.sequence.qubit_info)
82+
if size_max is None or size_max <= length:
83+
size_max = length
84+
85+
# Note: At this stage, size_max could theoretically still be `None``, if both `X1` and `X2`
86+
# are empty. In such cases, `dist_excitation` will never be called, so we're ok.
87+
88+
mu = float(self.params["mu"])
89+
feat_rows = [row.dist_excitation(size_max) for row in X1]
90+
91+
if X2 is None:
92+
# Fast path:
93+
# - rows and columns are identical, so no need to compute a `feat_cols`;
94+
# - the matrix is symmetric, we only need to compute half of it.
95+
#
96+
# We could avoid computing kernel[i, i], as we know that it's always 1,
97+
# but we do not perform this specific optimization, as it is a useful
98+
# canary to detect some bugs.
99+
kernel = np.zeros([len(X1), len(X1)])
100+
for i, dist_row in enumerate(feat_rows):
101+
for j in range(i, len(feat_rows)):
102+
dist_col = feat_rows[j]
103+
js = jensenshannon(dist_row, dist_col) ** 2
104+
similarity = np.exp(-mu * js)
105+
kernel[i, j] = similarity
106+
if j != i:
107+
kernel[j, i] = similarity
108+
else:
109+
# Slow path:
110+
# - we need to compute a `feat_columns`
111+
# - the matrix is generally not symmetric and diagonal entries are generally not 1.
112+
kernel = np.zeros([len(X1), len(X2)])
113+
feat_columns = [col.dist_excitation(size_max) for col in X2]
114+
for i, dist_row in enumerate(feat_rows):
115+
for j, dist_col in enumerate(feat_columns):
116+
js = jensenshannon(dist_row, dist_col) ** 2
117+
kernel[i, j] = np.exp(-mu * js)
118+
return kernel
119+
120+
def similarity(self, graph_1: ProcessedData, graph_2: ProcessedData) -> float:
38121
"""Compute the similarity between two graphs using Jensen-Shannon
39122
divergence.
40123
@@ -63,32 +146,17 @@ def __call__(
63146
`scipy.spatial.distance`, and it is squared because scipy function
64147
`jensenshannon` outputs the distance instead of the divergence.
65148
"""
66-
if len(graph_1.state_dict) == 0 or len(graph_2.state_dict) == 0:
67-
raise ValueError("An input counter is empty")
68-
69-
if size_max is None:
70-
# If size is not specified, it's the length of bitstrings.
71-
bitstring_1 = next(iter(graph_1.state_dict.keys()))
72-
bitstring_2 = next(iter(graph_1.state_dict.keys()))
73-
size_max = max(len(bitstring_1), len(bitstring_2))
74-
75-
dist_graph_1 = dist_excitation_and_vec(
76-
count_bitstring=graph_1.state_dict, size_max=size_max
77-
)
78-
dist_graph_2 = dist_excitation_and_vec(
79-
count_bitstring=graph_2.state_dict, size_max=size_max
80-
)
81-
js = (
82-
jensenshannon(p=dist_graph_1, q=dist_graph_2) ** 2
83-
) # Because the divergence is the square root of the distance
84-
return float(np.exp(-self.params["mu"] * js))
149+
matrix = self([graph_1], [graph_2])
150+
return float(matrix[0, 0])
85151

86152
def fit(self, X: Sequence[ProcessedData], y: list | None = None) -> None:
    """Fit the kernel by storing the training dataset and its kernel matrix.

    Args:
        X (Sequence[ProcessedData]): The training dataset.
        y: list: Target variable for the dataset sequence.
            Ignored; accepted only for compatibility with
            machine-learning libraries.
    """
    self.X = X
    # Precompute the symmetric train/train kernel matrix once, up front.
    self.kernel_matrix = self.create_train_kernel_matrix(self.X)
@@ -98,8 +166,9 @@ def transform(self, X_test: Sequence[ProcessedData], y_test: list | None = None)
98166
99167
Args:
100168
X_test (Sequence[ProcessedData]): The dataset to transform.
101-
y_test: list: Target variable for the dataset sequence. defaults to None.
102-
169+
y_test: list: Target variable for the dataset sequence.
170+
This argument is ignored, provided only for compatibility
171+
with machine-learning libraries.
103172
Returns:
104173
np.ndarray: Kernel matrix where each entry represents the similarity between
105174
the given dataset and the training dataset.
@@ -114,8 +183,9 @@ def fit_transform(self, X: Sequence[ProcessedData], y: list | None = None) -> np
114183
115184
Args:
116185
X (Sequence[ProcessedData]): The dataset to fit and transform.
117-
y: list: Target variable for the dataset sequence. defaults to None.
118-
186+
y: list: Target variable for the dataset sequence.
187+
This argument is ignored, provided only for compatibility
188+
with machine-learning libraries.
119189
Returns:
120190
np.ndarray: Kernel matrix for the training dataset.
121191
"""
@@ -137,13 +207,7 @@ def create_train_kernel_matrix(self, train_dataset: Sequence[ProcessedData]) ->
137207
column j represents the similarity between the graphs in positions
138208
i and j of the input dataset.
139209
"""
140-
N = len(train_dataset)
141-
kernel_mat = np.zeros((N, N))
142-
for i in range(N):
143-
for j in range(i + 1, N):
144-
kernel_mat[i][j] = self(train_dataset[i], train_dataset[j])
145-
kernel_mat[j][i] = kernel_mat[i][j]
146-
return kernel_mat
210+
return self(train_dataset)
147211

148212
def create_test_kernel_matrix(
149213
self,
@@ -171,13 +235,7 @@ def create_test_kernel_matrix(
171235
represents the similarity between the graph in position i of the
172236
test dataset and the graph in position j of the training set.
173237
"""
174-
N_train = len(train_dataset)
175-
N_test = len(test_dataset)
176-
kernel_mat = np.zeros((N_test, N_train))
177-
for i in range(N_test):
178-
for j in range(N_train):
179-
kernel_mat[i][j] = self(test_dataset[i], train_dataset[j])
180-
return kernel_mat
238+
return self(test_dataset, train_dataset)
181239

182240
def set_params(self, **kwargs: dict[str, Any]) -> None:
183241
"""Set multiple parameters for the kernel.
@@ -193,11 +251,12 @@ def get_params(self, deep: bool = True) -> dict:
193251
"""Retrieve the value of all parameters.
194252
195253
Args:
196-
deep (bool): Ignored. Added for compatibility with various machine learning libraries,
197-
such as scikit-learn.
254+
deep (bool): Ignored for the time being. Added for compatibility with
255+
various machine learning libraries, such as scikit-learn.
198256
199257
Returns
200258
dict: A dictionary of parameters and their respective values.
259+
Note that this method always performs a copy of the dictionary.
201260
"""
202261
return copy.deepcopy(self.params)
203262

tests/test_kernel.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import qek.data.dataset as qek_dataset
2+
from qek.kernel import QuantumEvolutionKernel
3+
4+
5+
def test_kernel() -> None:
    """A few basic checks on QuantumEvolutionKernel."""

    # Load the pre-processed example dataset.
    dataset = qek_dataset.load_dataset(file_path="examples/ptcfm_processed_dataset.json")

    # Exercise several qubit lengths. A size_max of 5000 qubits is expected
    # to exceed the number of qubits any entry of this dataset needs, and
    # None means "use all qubits".
    for size_max in (None, 0, 5000):
        kernel = QuantumEvolutionKernel(mu=2.0, size_max=size_max)
        matrix = kernel(dataset, dataset)
        count = len(dataset)
        assert len(matrix) == count
        assert len(matrix[0]) == count
        for i in range(count):
            # Diagonal entries should be 1, modulo rounding errors.
            assert matrix[i, i] >= 0.999
            for j in range(count):
                print(f"similarities[{i}, {j}] == {matrix[i, j]}")
                # Symmetry (difference should be 0, modulo rounding) and
                # the [0, 1] similarity range (allowing rounding above 1).
                assert abs(matrix[i, j] - matrix[j, i]) < 0.001
                assert matrix[i, j] >= 0
                assert matrix[i, j] <= 1.001

0 commit comments

Comments
 (0)