Notebook 01 — Multi-dataset event detection¶
Lance EventDetectorStage sur deux datasets du catalog Telemachus 0.7 et compare les patterns d’événements détectés.
fr_clermont_proto_2025-09 — livraison urbaine, seuils calibrés ; us_greensboro_fmc880_2026-04 — trips routiers courts, Teltonika commercial
Les seuils actuels (harsh_accel, harsh_brake, harsh_cornering, harsh_swerving, over_speed, crash, idling, rapid_stop) ont été calibrés sur Clermont. Ce notebook vérifie que l’ordre de grandeur des counts transfère raisonnablement au hardware commercial.
import sys
from pathlib import Path
import yaml
import duckdb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Resolve repository roots relative to the notebook's working directory:
# this notebook lives four directory levels below the TeleForge repo root,
# and the sibling `nostos` checkout provides the pipeline stages.
NB_DIR = Path.cwd()
TELEFORGE_ROOT = NB_DIR.parent.parent.parent.parent
NOSTOS_ROOT = TELEFORGE_ROOT.parent / "nostos"
# Make the (uninstalled) nostos package importable from its src/ layout.
sys.path.insert(0, str(NOSTOS_ROOT / "src"))
# Pipeline stages: D0 trip detection -> D1 cleaning/calibration -> D2 event detection.
from nostos.context import TelematicsContext
from nostos.stages.d0_trip_detector import TripDetectorStage
from nostos.stages.d1_gps_cleaner import GPSCleanerStage
from nostos.stages.d1_imu_calibrator import IMUCalibratorStage
from nostos.stages.d2_event_detector import EventDetectorStage
print("Stages loaded")Stages loaded
1. Helper : pipeline D0→D2 events¶
def run_events(df, country='FR'):
    """Run the D0→D2 pipeline on a copy of *df* and return detected events.

    Parameters: df — raw telemetry frame; country — ISO country code fed to
    the context meta (default 'FR').
    Returns (events, processed_df); *events* is always a list (the 'events'
    artifact is coerced to [] when absent or not a list).
    """
    ctx = TelematicsContext(
        cfg={},
        df=df.copy(),
        meta={'device_id': '?', 'country': country, 'hz': 10},
    )
    # Stages mutate the context in place, in pipeline order.
    pipeline = (
        TripDetectorStage(),
        GPSCleanerStage(),
        IMUCalibratorStage(),
        EventDetectorStage(),
    )
    for st in pipeline:
        st.run(ctx)
    detected = ctx.artifacts.get('events', [])
    if not isinstance(detected, list):
        detected = []
    return detected, ctx.df

print("Helper ready")
2. Dataset 1 — Clermont¶
# --- Dataset 1: Clermont prototype (urban delivery) ---
clermont_dir = TELEFORGE_ROOT / "datasets" / "fr_clermont_proto_2025-09"

# The manifest carries the volume metadata (distance_km) and data file list.
with open(clermont_dir / "manifest.yaml") as f:
    mf_clermont = yaml.safe_load(f)

# First manifest data file is the telemetry parquet.
parquet = (clermont_dir / mf_clermont['data_files'][0]['path']).resolve()
df_clermont = pd.read_parquet(parquet)
print(f"Clermont : {len(df_clermont):,} samples, distance {mf_clermont['volume']['distance_km']} km")

events_clermont, _ = run_events(df_clermont, country='FR')
print(f"\nEvents détectés : {len(events_clermont)}")

# Histogram of detected events per type (non-dict events fall back to str()).
clermont_types = {}
for ev in events_clermont:
    label = ev.get('type', 'unknown') if isinstance(ev, dict) else str(ev)
    clermont_types[label] = clermont_types.get(label, 0) + 1

print("\nPar type:")
for t, c in sorted(clermont_types.items(), key=lambda kv: kv[1], reverse=True):
    print(f" {t:<20} {c}")
Burst sampling: 50 frames @ 50 Hz, effective 25 Hz (gap 1020 ms)
Events détectés : 0
Par type:
/Users/sebastien.edet/projects/pro/deeptech/nostos/src/nostos/stages/d2_event_detector.py:208: RuntimeWarning: divide by zero encountered in divide
speed_factor = np.where(v > 2.0, np.clip(5.5 / v, 0.3, 3.0), 1.0)
3. Dataset 2 — us_greensboro (concat trips)¶
# --- Dataset 2: us_greensboro commercial Teltonika trips ---
# Trip metadata lives in the flespi DuckDB storage (opened read-only).
conn = duckdb.connect(str(NOSTOS_ROOT / "data" / "flespi" / "storage" / "telemetry.duckdb"), read_only=True)
us_meta = conn.execute("""
SELECT trip_id, device_id, distance_km, parquet_path
FROM trips
WHERE carrier_state = 'mounted_driving'
AND distance_km > 1.0
AND ts_start >= '2026-04-10 11:06:00'
ORDER BY distance_km DESC
""").fetchdf()
conn.close()

# Gather per-trip parquet files. When the recorded path is stale, fall back
# to the conventional data/flespi/trips/<device_id>/<trip_id>.parquet layout.
us_dfs = []
us_total_dist = 0
for _, row in us_meta.iterrows():
    primary = Path(row['parquet_path'])
    fallback = NOSTOS_ROOT / "data" / "flespi" / "trips" / str(row['device_id']) / f"{row['trip_id']}.parquet"
    pq = primary if primary.exists() else fallback
    if pq.exists():
        us_dfs.append(pd.read_parquet(pq))
        us_total_dist += row['distance_km']

if us_dfs:
    # Concatenate all trips and run the same D0→D2 pipeline as Clermont.
    df_us = pd.concat(us_dfs, ignore_index=True)
    print(f"us_greensboro : {len(df_us):,} samples, distance ~{us_total_dist:.2f} km")
    events_us, _ = run_events(df_us, country='US')

    # Histogram of detected events per type.
    us_types = {}
    for ev in events_us:
        label = ev.get('type', 'unknown') if isinstance(ev, dict) else str(ev)
        us_types[label] = us_types.get(label, 0) + 1

    print(f"\n{len(events_us)} events détectés, par type :")
    for t, c in sorted(us_types.items(), key=lambda kv: kv[1], reverse=True):
        print(f" {t:<20} {c}")
else:
    # No trip files found: keep downstream comparison cells runnable.
    events_us = []
    us_types = {}
    us_total_dist = 0
Burst sampling: 6 frames @ 0 Hz, effective 0 Hz (gap 28000 ms)
us_greensboro : 1,944 samples, distance ~85.14 km
0 events détectés, par type :
/Users/sebastien.edet/projects/pro/deeptech/nostos/src/nostos/stages/d2_event_detector.py:208: RuntimeWarning: divide by zero encountered in divide
speed_factor = np.where(v > 2.0, np.clip(5.5 / v, 0.3, 3.0), 1.0)
4. Comparaison cross-dataset : events/km¶
# Union of event types seen in either dataset, in stable alphabetical order.
all_types = sorted(set(clermont_types) | set(us_types))
clermont_dist = mf_clermont['volume']['distance_km']

# One row per event type: raw counts plus distance-normalized rates
# (events per 100 km) so datasets of different volumes are comparable.
rows = []
for t in all_types:
    n_clermont = clermont_types.get(t, 0)
    n_us = us_types.get(t, 0)
    rows.append({
        'type': t,
        'clermont_count': n_clermont,
        'us_count': n_us,
        'clermont_per_100km': (n_clermont / clermont_dist) * 100,
        'us_per_100km': (n_us / us_total_dist) * 100 if us_total_dist > 0 else 0,
    })
comparison = pd.DataFrame(rows)
# NOTE(review): the notebook export fused the display expression `comparison`
# with the next cell's `fig, axes = ...`, accidentally naming the figure
# handle `comparisonfig`. The intended name `fig` is restored here.
fig, axes = plt.subplots(1, 2, figsize=(13, 5))

# Left panel: raw event counts per type, side-by-side bars per dataset.
ax = axes[0]
x = np.arange(len(all_types))
width = 0.35
if all_types:
    ax.bar(x - width/2, comparison['clermont_count'], width,
           label=f'Clermont ({clermont_dist} km)', color='#0066CC', alpha=0.85)
    ax.bar(x + width/2, comparison['us_count'], width,
           label=f'us_greensboro (~{us_total_dist:.1f} km)', color='#E87700', alpha=0.85)
    ax.set_xticks(x)
    ax.set_xticklabels(all_types, rotation=30, ha='right')
    ax.legend()
    ax.set_ylabel('Event count')
    ax.set_title('Events détectés par type')
    ax.grid(True, alpha=0.3, axis='y')

# Right panel: distance-normalized rates (events / 100 km) — comparable
# across datasets of very different volumes.
ax = axes[1]
if all_types:
    ax.bar(x - width/2, comparison['clermont_per_100km'], width,
           label='Clermont', color='#0066CC', alpha=0.85)
    ax.bar(x + width/2, comparison['us_per_100km'], width,
           label='us_greensboro', color='#E87700', alpha=0.85)
    ax.set_xticks(x)
    ax.set_xticklabels(all_types, rotation=30, ha='right')
    ax.legend()
    ax.set_ylabel('Events / 100 km')
    ax.set_title('Taux d\'événements normalisé (/100 km)')
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('p017_cross_dataset_events.png', dpi=120, bbox_inches='tight')
plt.show()
Conclusion¶
La normalisation par distance (events / 100 km) permet de comparer des datasets de volumes très différents (Clermont 67 km vs us_greensboro ~85 km d'après la sortie ci-dessus).
Attendu : les seuils P017 calibrés sur Clermont devraient produire des ordres de grandeur cohérents sur us_greensboro, sans explosion de fausses alarmes. Si on observe un pic anormal sur un type d’événement pour us_greensboro, cela indiquerait que les seuils doivent être ajustés au hardware commercial (la cadence 1 Hz vs 25 Hz effective de Clermont peut biaiser les détections basées sur la variance instantanée).
Datasets consommés : fr_clermont_proto_2025-09 + us_greensboro_fmc880_2026-04
Extension planifiée (cf requirements.yaml) : cross-validation sur es_uah_driveset_2016 (labels driver_behavior normal/drowsy/aggressive) pour calibrer formellement les seuils quand l’adapter telemachus-cli aura été lancé.