Skip to content

Commit 2ecf5a1

Browse files
Codexclaude
andcommitted
[fix] Reconcile stuck "modified" config status on checksum request #1330
When a device applies a new configuration but fails to report its status back (e.g. due to a transient HTTP 502 from the controller), the config remains in "modified" state on the server forever. The device's agent will compare its local checksum with the remote checksum on the next polling cycle and find them identical, so it will not re-download or re-report. Without manual intervention the status stays "modified" indefinitely. Detect this condition on the DeviceChecksumView: if the config has been in "modified" state longer than a 5 minute grace period and the device is actively polling (proven by the very checksum request we are handling), set the status to "applied". Implementation notes: - Use the cached device object's config status as a fast path: if the cached status is not "modified" we return immediately with zero extra database queries, preserving the existing zero-query guarantee of the cached checksum path. - Only when the cached status says "modified" do we re-query Config fresh from the database. This covers the edge case where the cache has been populated with a "modified" status that was already reconciled a moment ago by a concurrent request. - The re-query uses .only() on the fields we need to keep it cheap. - A 5 minute grace period avoids fighting an in-flight apply that has not had time to report yet. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 0d17acd commit 2ecf5a1

2 files changed

Lines changed: 132 additions & 0 deletions

File tree

openwisp_controller/config/controller/views.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,13 @@ class DeviceChecksumView(UpdateLastIpMixin, GetDeviceView):
144144
returns device's configuration checksum
145145
"""
146146

147+
# Grace period before reconciling a "modified" status.
148+
# If the config has been in "modified" state for longer than this,
149+
# and the device is actively requesting its checksum (proving it's
150+
# online and polling), we assume a previous report_status call was
151+
# lost due to a transient error and reconcile the status to "applied".
152+
_STATUS_RECONCILE_GRACE_SECONDS = 300 # 5 minutes
153+
147154
def get(self, request, pk):
148155
device = self.get_device()
149156
bad_request = forbid_unallowed(request, "GET", "key", device.key)
@@ -156,10 +163,78 @@ def get(self, request, pk):
156163
checksum_requested.send(
157164
sender=device.__class__, instance=device, request=request
158165
)
166+
self._reconcile_modified_status(device)
159167
return ControllerResponse(
160168
device.config.get_cached_checksum(), content_type="text/plain"
161169
)
162170

171+
@staticmethod
172+
def _reconcile_modified_status(device):
173+
"""
174+
Reconciles config status for devices stuck in "modified" state.
175+
176+
When a device applies a new configuration but fails to report its
177+
status back (e.g., due to a transient HTTP error like 502), the
178+
config remains in "modified" state on the controller even though
179+
the device already has the current configuration.
180+
181+
The device's agent will compare its local checksum with the
182+
remote checksum on the next polling cycle and find they match,
183+
so it won't re-download or re-report. The status stays "modified"
184+
indefinitely.
185+
186+
This method detects this condition: if the config has been in
187+
"modified" state for longer than the grace period and the device
188+
is actively polling (proven by this very checksum request), we
189+
set the status to "applied".
190+
191+
Fast path: the cached device object (from cache_memoize) is used
192+
first to check the status. Only when the cached status indicates
193+
"modified" do we re-query the database for the fresh state. This
194+
keeps the checksum endpoint zero-query for the common case where
195+
the status is already "applied".
196+
"""
197+
from django.utils import timezone
198+
199+
# Fast path: if the cached device's config is not "modified",
200+
# there is nothing to reconcile. This avoids one database query
201+
# per checksum request in the common case.
202+
try:
203+
cached_config = device.config
204+
except Config.DoesNotExist:
205+
return
206+
if cached_config is None or cached_config.status != "modified":
207+
return
208+
209+
# The cached state says "modified"; re-read config fresh from the
210+
# database because cache_memoize has a 30-day TTL and may be
211+
# serving a stale "modified" status that was already reconciled
212+
# earlier.
213+
try:
214+
config = Config.objects.only("id", "status", "modified").get(device=device)
215+
except Config.DoesNotExist:
216+
return
217+
if config.status != "modified":
218+
return
219+
grace = DeviceChecksumView._STATUS_RECONCILE_GRACE_SECONDS
220+
elapsed = (timezone.now() - config.modified).total_seconds()
221+
if elapsed < grace:
222+
return
223+
try:
224+
config.set_status_applied()
225+
logger.info(
226+
"Reconciled config status for device %s: was 'modified' "
227+
"for %d seconds with device actively polling, "
228+
"setting to 'applied'.",
229+
device,
230+
int(elapsed),
231+
)
232+
except Exception:
233+
logger.exception(
234+
"Failed to reconcile config status for device %s",
235+
device,
236+
)
237+
163238
@cache_memoize(
164239
timeout=Config._CHECKSUM_CACHE_TIMEOUT, args_rewrite=get_device_args_rewrite
165240
)

openwisp_controller/config/tests/test_controller.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,63 @@ def test_device_checksum_requested_signal_is_emitted(self):
270270
request=response.wsgi_request,
271271
)
272272

273+
def test_device_checksum_reconciles_modified_status(self):
274+
"""
275+
When a device with status "modified" requests its checksum,
276+
and enough time has passed (grace period), the status should
277+
be automatically reconciled to "applied".
278+
"""
279+
from datetime import timedelta
280+
281+
from django.utils import timezone
282+
283+
d = self._create_device_config()
284+
c = d.config
285+
c.set_status_modified()
286+
self.assertEqual(c.status, "modified")
287+
url = reverse("controller:device_checksum", args=[d.pk])
288+
289+
# First request within grace period: status should stay "modified"
290+
response = self.client.get(url, {"key": d.key})
291+
self.assertEqual(response.status_code, 200)
292+
c.refresh_from_db()
293+
self.assertEqual(c.status, "modified")
294+
295+
# Simulate that grace period has elapsed by backdating the
296+
# modified timestamp.
297+
Config.objects.filter(pk=c.pk).update(
298+
modified=timezone.now() - timedelta(seconds=600)
299+
)
300+
301+
# Second request after grace period: status should be reconciled
302+
# to "applied"
303+
response = self.client.get(url, {"key": d.key})
304+
self.assertEqual(response.status_code, 200)
305+
c.refresh_from_db()
306+
self.assertEqual(c.status, "applied")
307+
308+
def test_device_checksum_no_reconcile_for_applied(self):
309+
"""Status "applied" should not be changed."""
310+
d = self._create_device_config()
311+
c = d.config
312+
c.set_status_applied()
313+
url = reverse("controller:device_checksum", args=[d.pk])
314+
response = self.client.get(url, {"key": d.key})
315+
self.assertEqual(response.status_code, 200)
316+
c.refresh_from_db()
317+
self.assertEqual(c.status, "applied")
318+
319+
def test_device_checksum_no_reconcile_within_grace_period(self):
320+
"""Status should not be reconciled if within the grace period."""
321+
d = self._create_device_config()
322+
c = d.config
323+
c.set_status_modified()
324+
url = reverse("controller:device_checksum", args=[d.pk])
325+
response = self.client.get(url, {"key": d.key})
326+
self.assertEqual(response.status_code, 200)
327+
c.refresh_from_db()
328+
self.assertEqual(c.status, "modified")
329+
273330
def test_device_checksum_bad_uuid(self):
274331
d = self._create_device_config()
275332
pk = "{}-wrong".format(d.pk)

0 commit comments

Comments
 (0)