From 55a45315de7cc017519e04589bcfb83681f65f21 Mon Sep 17 00:00:00 2001 From: ricknie Date: Tue, 19 May 2026 20:37:18 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=94=AF=E6=8C=81Prompt=E8=87=AA?= =?UTF-8?q?=E4=BC=98=E5=8C=96AgentOptimizer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TAPD: --story=134314588 --- .../assets/imgs/optimization_quickstart.png | Bin 0 -> 307225 bytes docs/mkdocs/en/optimization.md | 2030 ++++++++++++++++ docs/mkdocs/zh/optimization.md | 2038 +++++++++++++++++ .../advanced_strategies/README.md | 206 ++ .../advanced_strategies/agent/__init__.py | 0 .../advanced_strategies/agent/agent.py | 134 ++ .../advanced_strategies/agent/config.py | 33 + .../agent/prompts/system.md | 1 + .../advanced_strategies/compare.py | 100 + .../data/train.evalset.json | 73 + .../advanced_strategies/data/val.evalset.json | 73 + .../optimizer_advanced.json | 48 + .../optimizer_baseline.json | 45 + .../advanced_strategies/run_advanced.py | 70 + .../advanced_strategies/run_baseline.py | 65 + examples/optimization/blackbox_cli/README.md | 205 ++ .../blackbox_cli/agent/__init__.py | 10 + .../blackbox_cli/agent/call_agent.py | 141 ++ .../optimization/blackbox_cli/optimizer.json | 45 + .../blackbox_cli/run_optimization.py | 89 + .../blackbox_cli/train.evalset.json | 62 + .../blackbox_cli/val.evalset.json | 40 + .../.claude/skills/city-info/SKILL.md | 6 + .../blackbox_cli/workspace/CLAUDE.md | 1 + .../optimization/ci_integration/README.md | 243 ++ .../ci_integration/agent/__init__.py | 0 .../ci_integration/agent/agent.py | 156 ++ .../ci_integration/agent/config.py | 33 + .../ci_integration/agent/prompts/skill.md | 1 + .../ci_integration/agent/prompts/system.md | 10 + .../ci_integration/ci/run_nightly_optimize.sh | 20 + .../ci_integration/ci/run_pr_check.sh | 12 + .../ci_integration/data/test_config.json | 14 + .../ci_integration/data/train.evalset.json | 40 + .../ci_integration/data/val.evalset.json | 40 + 
.../ci_integration/optimizer.json | 45 + .../ci_integration/run_optimization.py | 97 + .../ci_integration/tests/__init__.py | 0 .../tests/test_agent_quality.py | 62 + examples/optimization/http_service/README.md | 197 ++ .../optimization/http_service/optimizer.json | 45 + .../http_service/run_optimization.py | 123 + .../http_service/service/__init__.py | 5 + .../http_service/service/prompts/system.md | 1 + .../http_service/service/server.py | 157 ++ .../http_service/train.evalset.json | 112 + .../http_service/val.evalset.json | 70 + .../multi_agent_pipeline/README.md | 191 ++ .../multi_agent_pipeline/optimizer.json | 46 + .../multi_agent_pipeline/pipeline/__init__.py | 5 + .../multi_agent_pipeline/pipeline/config.py | 33 + .../pipeline/orchestrator.py | 131 ++ .../pipeline/prompts/fact_agent.md | 1 + .../pipeline/prompts/math_agent.md | 1 + .../pipeline/prompts/router.md | 1 + .../pipeline/prompts/summarizer.md | 1 + .../multi_agent_pipeline/run_optimization.py | 105 + .../multi_agent_pipeline/train.evalset.json | 92 + .../multi_agent_pipeline/val.evalset.json | 58 + .../multi_metric_with_judges/README.md | 241 ++ .../agent/__init__.py | 5 + .../multi_metric_with_judges/agent/agent.py | 48 + .../multi_metric_with_judges/agent/config.py | 33 + .../agent/prompts/system.md | 1 + .../multi_metric_with_judges/optimizer.json | 100 + .../run_optimization.py | 123 + .../train.evalset.json | 112 + .../multi_metric_with_judges/val.evalset.json | 70 + examples/optimization/quickstart/README.md | 213 ++ .../optimization/quickstart/agent/__init__.py | 5 + .../optimization/quickstart/agent/agent.py | 103 + .../optimization/quickstart/agent/config.py | 33 + .../quickstart/agent/prompts/skill.md | 1 + .../quickstart/agent/prompts/system.md | 1 + .../optimization/quickstart/optimizer.json | 88 + .../quickstart/run_optimization.py | 167 ++ .../quickstart/train.evalset.json | 112 + .../optimization/quickstart/val.evalset.json | 70 + .../remote_prompt_store/README.md | 208 ++ 
.../remote_prompt_store/agent/__init__.py | 5 + .../remote_prompt_store/agent/agent.py | 48 + .../remote_prompt_store/agent/config.py | 33 + .../remote_prompt_store/optimizer.json | 45 + .../remote_prompt_store/run_optimization.py | 161 ++ .../remote_prompt_store/store/__init__.py | 5 + .../store/fake_kv_store.py | 53 + .../store/prompt_client.py | 85 + .../remote_prompt_store/store/store.json | 4 + .../remote_prompt_store/train.evalset.json | 112 + .../remote_prompt_store/val.evalset.json | 70 + .../slo_runtime_control/README.md | 218 ++ .../slo_runtime_control/agent/__init__.py | 10 + .../slo_runtime_control/agent/agent.py | 47 + .../slo_runtime_control/agent/config.py | 33 + .../agent/prompts/system.md | 1 + .../slo_runtime_control/optimizer.json | 48 + .../slo_runtime_control/run_optimization.py | 143 ++ .../slo_runtime_control/train.evalset.json | 239 ++ .../slo_runtime_control/val.evalset.json | 123 + pyproject.toml | 7 + requirements-test.txt | 2 + requirements.txt | 2 + tests/evaluation/test_agent_optimizer.py | 1285 +++++++++++ tests/evaluation/test_base_optimizer.py | 240 ++ tests/evaluation/test_optimize_config.py | 629 +++++ .../test_optimize_evaluator_call.py | 613 +++++ .../evaluation/test_optimize_gepa_adapter.py | 1748 ++++++++++++++ .../evaluation/test_optimize_gepa_callback.py | 667 ++++++ tests/evaluation/test_optimize_gepa_e2e.py | 210 ++ .../test_optimize_gepa_reflective.py | 1628 +++++++++++++ tests/evaluation/test_optimize_metric_info.py | 630 +++++ .../test_optimize_model_callable.py | 261 +++ .../evaluation/test_optimize_model_options.py | 113 + .../test_optimize_quickstart_example.py | 489 ++++ tests/evaluation/test_optimize_registry.py | 109 + tests/evaluation/test_optimize_reporter.py | 611 +++++ tests/evaluation/test_optimize_result.py | 456 ++++ tests/evaluation/test_remote_eval_service.py | 30 + tests/evaluation/test_target_prompt.py | 539 +++++ trpc_agent_sdk/evaluation/__init__.py | 62 + trpc_agent_sdk/evaluation/_agent_evaluator.py 
| 33 +- trpc_agent_sdk/evaluation/_agent_optimizer.py | 614 +++++ trpc_agent_sdk/evaluation/_base_optimizer.py | 123 + trpc_agent_sdk/evaluation/_optimize_config.py | 257 +++ .../evaluation/_optimize_evaluator_call.py | 136 ++ .../evaluation/_optimize_gepa_adapter.py | 794 +++++++ .../evaluation/_optimize_gepa_callback.py | 381 +++ .../evaluation/_optimize_gepa_reflective.py | 612 +++++ .../evaluation/_optimize_metric_info.py | 534 +++++ .../evaluation/_optimize_model_callable.py | 309 +++ .../evaluation/_optimize_model_options.py | 45 + .../evaluation/_optimize_registrations.py | 22 + .../evaluation/_optimize_registry.py | 41 + .../evaluation/_optimize_reporter.py | 1001 ++++++++ trpc_agent_sdk/evaluation/_optimize_result.py | 361 +++ .../evaluation/_remote_eval_service.py | 9 + trpc_agent_sdk/evaluation/_target_prompt.py | 243 ++ 137 files changed, 26724 insertions(+), 2 deletions(-) create mode 100644 docs/mkdocs/assets/imgs/optimization_quickstart.png create mode 100644 docs/mkdocs/en/optimization.md create mode 100644 docs/mkdocs/zh/optimization.md create mode 100644 examples/optimization/advanced_strategies/README.md create mode 100644 examples/optimization/advanced_strategies/agent/__init__.py create mode 100644 examples/optimization/advanced_strategies/agent/agent.py create mode 100644 examples/optimization/advanced_strategies/agent/config.py create mode 100644 examples/optimization/advanced_strategies/agent/prompts/system.md create mode 100644 examples/optimization/advanced_strategies/compare.py create mode 100644 examples/optimization/advanced_strategies/data/train.evalset.json create mode 100644 examples/optimization/advanced_strategies/data/val.evalset.json create mode 100644 examples/optimization/advanced_strategies/optimizer_advanced.json create mode 100644 examples/optimization/advanced_strategies/optimizer_baseline.json create mode 100644 examples/optimization/advanced_strategies/run_advanced.py create mode 100644 
examples/optimization/advanced_strategies/run_baseline.py create mode 100644 examples/optimization/blackbox_cli/README.md create mode 100644 examples/optimization/blackbox_cli/agent/__init__.py create mode 100644 examples/optimization/blackbox_cli/agent/call_agent.py create mode 100644 examples/optimization/blackbox_cli/optimizer.json create mode 100644 examples/optimization/blackbox_cli/run_optimization.py create mode 100644 examples/optimization/blackbox_cli/train.evalset.json create mode 100644 examples/optimization/blackbox_cli/val.evalset.json create mode 100644 examples/optimization/blackbox_cli/workspace/.claude/skills/city-info/SKILL.md create mode 100644 examples/optimization/blackbox_cli/workspace/CLAUDE.md create mode 100644 examples/optimization/ci_integration/README.md create mode 100644 examples/optimization/ci_integration/agent/__init__.py create mode 100644 examples/optimization/ci_integration/agent/agent.py create mode 100644 examples/optimization/ci_integration/agent/config.py create mode 100644 examples/optimization/ci_integration/agent/prompts/skill.md create mode 100644 examples/optimization/ci_integration/agent/prompts/system.md create mode 100755 examples/optimization/ci_integration/ci/run_nightly_optimize.sh create mode 100755 examples/optimization/ci_integration/ci/run_pr_check.sh create mode 100644 examples/optimization/ci_integration/data/test_config.json create mode 100644 examples/optimization/ci_integration/data/train.evalset.json create mode 100644 examples/optimization/ci_integration/data/val.evalset.json create mode 100644 examples/optimization/ci_integration/optimizer.json create mode 100644 examples/optimization/ci_integration/run_optimization.py create mode 100644 examples/optimization/ci_integration/tests/__init__.py create mode 100644 examples/optimization/ci_integration/tests/test_agent_quality.py create mode 100644 examples/optimization/http_service/README.md create mode 100644 
examples/optimization/http_service/optimizer.json create mode 100644 examples/optimization/http_service/run_optimization.py create mode 100644 examples/optimization/http_service/service/__init__.py create mode 100644 examples/optimization/http_service/service/prompts/system.md create mode 100644 examples/optimization/http_service/service/server.py create mode 100644 examples/optimization/http_service/train.evalset.json create mode 100644 examples/optimization/http_service/val.evalset.json create mode 100644 examples/optimization/multi_agent_pipeline/README.md create mode 100644 examples/optimization/multi_agent_pipeline/optimizer.json create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/__init__.py create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/config.py create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/orchestrator.py create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/prompts/fact_agent.md create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/prompts/math_agent.md create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/prompts/router.md create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/prompts/summarizer.md create mode 100644 examples/optimization/multi_agent_pipeline/run_optimization.py create mode 100644 examples/optimization/multi_agent_pipeline/train.evalset.json create mode 100644 examples/optimization/multi_agent_pipeline/val.evalset.json create mode 100644 examples/optimization/multi_metric_with_judges/README.md create mode 100644 examples/optimization/multi_metric_with_judges/agent/__init__.py create mode 100644 examples/optimization/multi_metric_with_judges/agent/agent.py create mode 100644 examples/optimization/multi_metric_with_judges/agent/config.py create mode 100644 examples/optimization/multi_metric_with_judges/agent/prompts/system.md create mode 100644 examples/optimization/multi_metric_with_judges/optimizer.json 
create mode 100644 examples/optimization/multi_metric_with_judges/run_optimization.py create mode 100644 examples/optimization/multi_metric_with_judges/train.evalset.json create mode 100644 examples/optimization/multi_metric_with_judges/val.evalset.json create mode 100644 examples/optimization/quickstart/README.md create mode 100644 examples/optimization/quickstart/agent/__init__.py create mode 100644 examples/optimization/quickstart/agent/agent.py create mode 100644 examples/optimization/quickstart/agent/config.py create mode 100644 examples/optimization/quickstart/agent/prompts/skill.md create mode 100644 examples/optimization/quickstart/agent/prompts/system.md create mode 100644 examples/optimization/quickstart/optimizer.json create mode 100644 examples/optimization/quickstart/run_optimization.py create mode 100644 examples/optimization/quickstart/train.evalset.json create mode 100644 examples/optimization/quickstart/val.evalset.json create mode 100644 examples/optimization/remote_prompt_store/README.md create mode 100644 examples/optimization/remote_prompt_store/agent/__init__.py create mode 100644 examples/optimization/remote_prompt_store/agent/agent.py create mode 100644 examples/optimization/remote_prompt_store/agent/config.py create mode 100644 examples/optimization/remote_prompt_store/optimizer.json create mode 100644 examples/optimization/remote_prompt_store/run_optimization.py create mode 100644 examples/optimization/remote_prompt_store/store/__init__.py create mode 100644 examples/optimization/remote_prompt_store/store/fake_kv_store.py create mode 100644 examples/optimization/remote_prompt_store/store/prompt_client.py create mode 100644 examples/optimization/remote_prompt_store/store/store.json create mode 100644 examples/optimization/remote_prompt_store/train.evalset.json create mode 100644 examples/optimization/remote_prompt_store/val.evalset.json create mode 100644 examples/optimization/slo_runtime_control/README.md create mode 100644 
examples/optimization/slo_runtime_control/agent/__init__.py create mode 100644 examples/optimization/slo_runtime_control/agent/agent.py create mode 100644 examples/optimization/slo_runtime_control/agent/config.py create mode 100644 examples/optimization/slo_runtime_control/agent/prompts/system.md create mode 100644 examples/optimization/slo_runtime_control/optimizer.json create mode 100644 examples/optimization/slo_runtime_control/run_optimization.py create mode 100644 examples/optimization/slo_runtime_control/train.evalset.json create mode 100644 examples/optimization/slo_runtime_control/val.evalset.json create mode 100644 tests/evaluation/test_agent_optimizer.py create mode 100644 tests/evaluation/test_base_optimizer.py create mode 100644 tests/evaluation/test_optimize_config.py create mode 100644 tests/evaluation/test_optimize_evaluator_call.py create mode 100644 tests/evaluation/test_optimize_gepa_adapter.py create mode 100644 tests/evaluation/test_optimize_gepa_callback.py create mode 100644 tests/evaluation/test_optimize_gepa_e2e.py create mode 100644 tests/evaluation/test_optimize_gepa_reflective.py create mode 100644 tests/evaluation/test_optimize_metric_info.py create mode 100644 tests/evaluation/test_optimize_model_callable.py create mode 100644 tests/evaluation/test_optimize_model_options.py create mode 100644 tests/evaluation/test_optimize_quickstart_example.py create mode 100644 tests/evaluation/test_optimize_registry.py create mode 100644 tests/evaluation/test_optimize_reporter.py create mode 100644 tests/evaluation/test_optimize_result.py create mode 100644 tests/evaluation/test_target_prompt.py create mode 100644 trpc_agent_sdk/evaluation/_agent_optimizer.py create mode 100644 trpc_agent_sdk/evaluation/_base_optimizer.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_config.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_evaluator_call.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_gepa_adapter.py create mode 100644 
trpc_agent_sdk/evaluation/_optimize_gepa_callback.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_gepa_reflective.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_metric_info.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_model_callable.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_model_options.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_registrations.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_registry.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_reporter.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_result.py create mode 100644 trpc_agent_sdk/evaluation/_target_prompt.py diff --git a/docs/mkdocs/assets/imgs/optimization_quickstart.png b/docs/mkdocs/assets/imgs/optimization_quickstart.png new file mode 100644 index 0000000000000000000000000000000000000000..d68dd4dd6e0a4cf1001b4a505ea71cc808f74fff GIT binary patch literal 307225 zcma%j1zeNu-aoB~2#SKBgd(Ao%IGwZ?vh4gbdF8|3sFJ3K~kh7Bu0sdbR#gv0I4xz zlkV}q_MG$n-}7*u^FGJV2Vta86qMgYDI;IT0}&Y+(bm= zYNsi{clUxFfIA}L?yjfG2nrZ^7%~QvnjbVN&n|-@;_cl zKl-x~yo%CQ!2m`?M9)F^AXe16@tcT9mPqm8Jslt771XIz-PXO|YcD6`b6W-_ir{Rg zU!6XE=DMzI*tIjZkac-Fx+~XygraP}XKH3QGRZyrIL3WlmF(tqwg$e277}IL7Rtu; z9J&gHteQYg1xSwy+VrL-iA}-0>3*#x1yE8_GCT_)KV2`|3$2nXYw&HmW zE4dqtS{mtl#XZa8`HVU@`ebn3bygi07$to^!&p|8^!gZ9w**;8%J};$9b9Tlj%7O% z1TxyPa~ur?j{R5E)sR9>7_R*_{<@Lk_`dXj_Bt{7RQ*rLuSM+EE#pz6+T{UqwdkJ6 zoC?Vvon?uL$!P!OKep=A7rXgrzQO+Woq_i_cP_*k1y?jSPHk&3hV=-s zpvwZ*S1S6H3iXOp<$W-@30C`?nBLl^&X1beITo$2SrX1%x^+XyemE+@EXV-A*B9Iw zdsADoMBeX0uQ+>J%)xnCHevGd3AaePRFA{o*Tic4U;0MzDF?6lWKvTtC=Jv^t0Ch= zP%9H?$U>$P!d__3TcG}l9t78=dDo@8{c5Fn{J&iRM@rhvi$=JJpV$B8x)71nnr6yH z&`b9{uC&!GFl~T4BCC4PhR!ZNxQ3m8(3tuRg9qQ~p3v%7Z}Wz@8@-ym$3Dd2pP@yI^a`mDA2;Lqp3{X)nm-NOB}eB}iG ztY&fF0^5AH0WW}Cj zp`Ii0kt;676QA0SZmT9k-k)O@S7`9t%F!v%ijI!HzC2o*AkcKE<%OGE#o8~?NbhD2 zRe?p0`1~Yn6N@rR7INUsS4)dF3)+*L4BRovkJQ`W!3oTF#tAx6GeLgWjAYBx)UJ2( zq*d`5RkiDz1zPtW|9&YWx@tI3i{H$bNvc}tNr`Z#v5An_h!vmknSHw*t&({A-a5L^ 
zWdu>Ud zzfYoCw;eCuQ}SLb?p9Yw8qx~YS7F_{Go`pZh2q;=N5{+|WL`m+*Z5nN;^GSjvBk#6 zcf(5)+1hBBqhhHY=Fz2it~MI%4wfJ8Y4FQ}!)ke5t~|I+Z@{hZt# zlsV##TM~+W#0*#3;we!{Ij-~_x#`lhB&hq5e{>Tg(5f7FXI4kG{u^VdTRwJhdVITg zuXir496GYo>zfm&F^WkSXuQucu%-KHNLDGg!9S4orIuG`mP=E-6A!(F2_4ca>K3(t zw07C9mYRP9{it}KL=uVW`yIT=r;fj8u1d;D`M zhSwA`wgX?;sTY6qGrieZDlC01krK5n7IausoBuHULN6aSli4X?8)2i$c|AX@Fl)n( zC{p58e$dhW`Zm%1fqOwF{jQ z%x~Rke3qpD1)(tJsirJ!vPgIpT@)~#5M#Y-^@_50f0p%_8|)ePahslG4ku)lXRi~7 z;qvereycQCQA;bDd3^R`h%uZxt{C{#~*raP9=+bek#uM zm-JoFvHhJ`_iR6~Z08-L;3K63ekXO+Ow`v9(lnUSS`cehT6X?doT;Dot)C@mM50%f zZ^!+DpK~*~2y#oA7_sAn{%rM@k=5Lxz{p*c*2)Az@GhoD-wIW4z_N4byZMlA!rOf3 zT)2SQ0NS*{WrgQ~^>Vo!%MNP0`!-j5>+H8uuq2sd)iVNao4dv|`G?y1n-7y!pvK<` z@yz|6lyWyfUI}oPNGw;D-tWStZMz!dA@21Td1Kc`9B-zKbRDxC#0VyM&xff0xH4q2 ztk-3uE%JMUkt~?_bOHWmNTS@ zAB!BhxhDIc%4qy&*ZXb9x;vS#Mubc`k_twT^)esht|`LtoG@|hPaCV*Zy75b;on;s zpPl)0cPAn-V4FUk>Z~xg$orRm`A0SVYV-%UQwdFnKyXG zSj~Xo1C`9xwL@|)%n`iz!-JRhLh_e-lIPE<4Ypzq7LnC1RmFl#Qw()J!Oe=>Ol_Ow ze)v@}E>T6nX-AW?hmgzig4&4b8^P?uE}m%~zSAyhPX`){Q><3&S>;G=D=%_CR)FK@ zaa!>5V?NHV8#+RP?^qP#sO_4s9e6u?W0$w8(W`ofWmkf=$?$%*z zVl02My^WKQXG^ioUOQ0}XiJ_-bg6)*YWsq%E;s320b>-TFq+SM!tLe7_0E|O2tP<- zEAxug0WzCHT*z^(KRtOC?z!BpXUX(fr_4eQo$Ooztu}@+nZ#wwN4_r6t;Uxl2Qc$k zR9#o97{jqU{v~Tr)cSl^N}Wenc{GQzIJ@mgYJ>ldI2zuN$|v{wVwr9Xb*I?gmn`Im z+6XA@jE-79xzge`EP1|ND!5RO;dsUfN9CR7hkL^)*iG+kIJ%0oA1x{Vnb{R7gSV}c zfZ4oZk_oE!MFVIfGz)PW*O5Nj)Yhi)Zp1Yn^m3)d1(`Y5{Gy2bVo$+kl(iRFX1Sy} zbx>kb_eTGV{m+Sn@-HPa5+jyVje#r%-^*c7;3&Of9s#I=IBY5Bjn5*DyT`F~#SN6& zYE}MAU&r$pQ-?PHV+%DQy3sNH^yr=l2GIluv!7ai_$@}5%g)P;Sak{-_mWy3iIJk% zJL)ns5YMZ-nKw}V2(m_x%exGF_=eFmhbzHJTwOy|sYc>?k)yRzPM?n&zpfAF zC<_>@qeQl~hTq8zlH@?@hQ9w2NPc=t!u5HwXO8o@o^{o;<*eE9!F}(h1bQg}DCpW| zeezN~vAuLy@tzSI6hB2$x`ekLCw`fnhKAXwf+tR`X0vVsEg&BrRJphvKx!J;H>|Ic zabx4h8_3os{=?B`cCMA-NU{Iqp=bm4?s1Cxe5V?NAeRtkqKhjcp|Qe(W47n=WD@+F zr=s^ggkSE z{zE?qtzhs&7*1HiPqkVW#z5`2HysvXgj=L{HXL2KZwMh$s@@+oXAI8u#!cq!!Iw*7 zxH19;Fu+l3iF%FSjP^|(t8z2FpJp-2Xi#qBAZs>5%3qHq 
za8uKu);%{pIqRMqpJC}8j=!{)R4ZhB^@1TA0Us?+@0#bmAYG`d#=_PtKJS=7JU@#2wfczgsSFWvF^ z4)M}uk^YX5Wg{O*shWH-s(b)H7!?|s5E6z#us$I5uT!CMMm|8ujDtt6MmiyrZwXY3g* zF_DPm7iP0XRyk=k1qH%X1vhGx3$$`4^6xrNh@y>`Lr7vBh1rn?6;xZ8p|(`}ukWC8 z?g@TdOSS50aEY#WP$Kt)&8h_04pmi2l39I(cfVG1DE?R|7@%!6*XVb597cFCV`bTb zsnkE*naXP{GFJOPJm~Fa%FJm!;=eoFcLuWCE^&JiP=qLenMEDKM|N7z?cxt`{$yNs z2kBD&a{#D(F%jKA&#k{@(VuC$kJ&gcEj7}OZ+3aA6-SXE7AVBFD5BnwWXVQNF>6)dFOdLaU)?u7+mc>Sd=4rnjZh$#w=AO8TF8 zlZiWfZAy_}Z~C(oXG0SQ&5JgbAjX4NWDG~I8`$hIed)ft>D}FZ9{12eQIpkX{>{z7 zPwK&RD_*__YUQ_9C*loN1t|E_CB1aCgp0oPTnLX4REWBS!p5E;fwU4N`!?~(m;BMa zX`CDWY6i8(GPT_^Kkb+{07joe(~vbB_j6`jBV_vLq}6X4OG>RU^6P!ucomWVQmUm@ z$%n;lZS%8D&11?wcK&ph?XwI#zkI*94z;8MOf4B%|B>{+h1jCzX?jhrz0g1NnaiU!~L<1K?=-k5{^>^Ti37Y9*)b-k;788SE(%jNBl z4E5C;F!7fL11tIf9j@2-@BaS9f;_#{Y8z;_@(?H#mAdb)osQ)-Yu>DCnyPn^`s`5W z9U~^7oeLi_ZS>>IXZfgAW$&3mluIYiPu^FHlYoZCFYS}D=oV?4IkOyAevhk-RqP4O zC`R{XOzCC&NxF?4`=MeUwzNu_LVvOw*cc<$p$0bU8J$J*a@NKuEo}1Crr=aIj|&_@ zsWE6?`>lzhsW?NX&9X$nSd!w*Oiu7IUV3WnjdO)e{s+9vwVl0$R&s`861>q~8-MdRsmhC__O$^6&# zJS@(}L?BfiDc)yx({N^R$G?>(=E!r+cBpu!z6()Vj6IBlH9_|W9ju3{ULu>0EEbBT z)sF=E3biz05gh>)ZpDjwrZ&N=Q#F?QFg`O*a&D8esdu%>4&@=nB4j7LaTu+x!yy7P286ZDPso;?mv%6vT?M*73shm9-H3sTpo$b z&j=cyeGhgv4^VE><8TosE;((0yzN`Pf9=t2I}r@w`KOqUH_w%T7JWU0P^Z!nZi>xY4504)N1wvb)|Hvm3|{0Nr!sl@$ROr6x_Z-@~fKx zhL39BfckVdnVo5r-2mOeE}x@OdtGaH^l6=K5m&{J$KuwZ9=k213kRr5fLsnyB3e=f z;YrxV5FO1^-heeO=#%ytbsbo!cO)U?v5k--AQ~$niWE_ zn8pKaOu2Pe=SZ>9{I8d+YQee2C*Vc<7m{MbFY7*4u^0QAUh(Odq>l0RLMoJ7Utf|7 zvwz1Tbqi~rqLwD!@#;K#_hXSspX|0oEL{)xTrbn+bQld>z{VccV4b9<1NXo@-M};n zh}aLe*G@M0dvnPuhP|8UD@lX2|1h%~oSQLoM>;r-SC;9*D({(Ke`t(g8U^x4xB9M4 z`pi;CPJf}kBHRvI4Jp5rM3Vq9L)H@~`IYWmy4GzV}(G{0KLQa)yUGwRE{4aY41?5IpeCUu_TYGnTznR~Cx{z3mB}~#XMen1&p1AmFKOJj$Bcg| zoLbp5AdZsZ;iQDTJZiBx3cGiL3d&Ai`Uq6SjowJ?*8fuT8ow%T1RUeat84Ee;6{5afmh0Zq<@8gm zYIWySetKG*ZOubCURypg@tP7vdr&R|*cm_MWF(YVJf^bMP<2}M9tJp0SdckdIKRQz z@ToR50ku4mc(YTl_?mvc^-MX?>Lib=Ws&S|WwY)l@O1kX5I28St;|ngO3j~U>VM*L_C*t#G?LWct`=&e1it8&gKdR?z1LcekDO}f!-Lz4 
z)bSY?xpf{pU)|Ig7dCA@T5>e|qT-cH9;8!}!EIfrm(VCs&u?kQc#G>Cv&X(j)1I*1 z_?PNbXUvLo?zL4~)0->o zJ#f@5pajN0gHz8{*}UjLKVWr>H~ye!GO*((xmx!d5rY@Rq)EXZouR*V<4dwBpQ4CI z{G}l-u^suk^{!_EL*>sn7$ZBk1O!`fQ?)1@TA5NqBER?*I0jN~(7YtR#*sVoBlO zxzL_uwG2RRR3YZAb+*A3ff^wslLf1qmOMKhdsjvtOnjRu6%5msq|p4$y+84X47~1`p$@n%}(l>cD;`ZnvB1f{4GfJ z9ObE#4TWa4;;U;jf~-Q*&RpWRAj#^MvP=54;$>o^mT{pW{*TJ)`!2l2EDE0SMg*r# zS`B&J9`9BvJ?l6N$}CqSK5G+yKm|RA3c@w3b0v8v%BvBI~E zGB1QHXWp?pi5VJ5BIx;|2|fYnYvmlWH<497JVx6VOIAk@r2oD2+D_DzwY3BvAMLQ9 zx2*uZsk0FPCcajo`g5GZosQUo9_H>2s{R;yb3B!&Eh58akld<>>53Z|{cO z2Y9eTRk}?(U#(-NH6qQ6+Q*#%cPFf2JNS4fm@{*6t|NxUVfdh-&G?Ib9HLyU;+!u- z&-Nz5CZmJk$pv=I25c@4NE=mQB;oV$rj>i!3+AJkRSO)A$8c>!hW5e!l{OR<=T=Rg z+=61k24e4g+iM&I_X9V{YetEF3xx-Rkic_pg zMt8EYu-efOGPQ@bGmn-K#Vq0j%RM@MA`e!=p@oQQ&ls_vj^b4-lb`kQ2V7G{P_u^L ziC1L;Fg7C;Tz0fSKVI9(x3-sXcu_OGCDHos3g*t38@uO#XSxh}zaeqEg4v#N?RP^o z+_I4FHWVjwT=+g+zFaNP8+TmRw4zFNx?FDR1GSoX;7m8Wz_6?E2A z!JD9+xh~ul+GWA%Qfl%u`^UIPLfVOZ;!`u>W=QP}%NzXCVFM?mwy0%tEiJ8XLA_5p zH;vMT%6NMKi+OzD}^R|=qwe_2W%DsD_Le4Fu zxOnw8@GxpI-qIC6fOzx?y8DnUYzFF`ih zJ$(xmY{bHsZ-|K@*nf9o6gX@{9`7}h9m(DqFMEh^=k$?F$QY{%^T5xVZ`?3z*!6Zh z<>w=l?!2nITK*_nfGYxB=al&6O+*jEjWMgg)v!EfDOmNpBvxKpw-e z1-ekD^n;F_YppG9cPA?+a^b4~*%6L9R<>wQJw@bw)v1nx**2@4eIb?*WyF#3`(D;? 
z;je}2GDnb=gW9!d*`fe5{$f$@5lGk@KE=@~ zt3nJ|Ib1z?v%Mc@CoVs+f!M+GqjnpapT?3|E%3a@sUu`^G0O^?cX$7fNGJ^zDk#Uf5A;F%Y(5N zu5Y#>7L2b)g!X`Uw!)aSyC1O);e5BZK2H;zwzvK~Sit+wp7I}m*MJ+ixbzl+mrh*3 zP4dS5B_7O3amsL!fi}Si2aaPb4Bgw&5yQnHB&0j(i)J1m%B|Pg+(Wew( zYThO+R__uS%d3BHJ(nCk*B5-J20c+*VP{n36k8@Bb5pob3zEM4?fHgX%UlALU5G}K zu2-oM-_#36oi8q3dEJKevwv}vnwQvQZ+=QW`{Km;_(>AOse0i^gP+n?BR+`x^qg== z>|2NVreG#>mO>!KUoiU+O#t+2SH%-=M(*Pz;5MB#lVK$X7;KT!evlxTv+0^ScNk@^O8wQV|N47uE>z%x zS5L{PNB;R8o+w2B@=!MKj1`jJC23#h>^oVO^B*6VWdV1cOF2XM+O>Ot7+nW6G&)Nz zBDt1<-4|CdW zlfge3DeBAXQ@D3RGXBelA^qg*51(q)y3fB+vA4HhykEO}&e#GsZ|cALWgR(%*IW8o z$OQ4mpwW_{)d1sj}&(HI~3q9(Y{km_#{q z`W@%a&Nq&#$uf#vTl(w4xf&nt4`#_xB|+?>CqP>+cbk0_jJobzw=?OVcIO6`pDT{G_GZx6iCy5H-g7I99As3-p=0amCO3)TJaxN z?A-MbW5&#!kc)tR+$~ik6?P;9S1$7Cbps;`6!JN@60S?PbfPGznG}Gxp!&6~7KWY! zWIW5j9O9trCrA1K#1VMwym9e8DI&_Cj`chSsU>&(p)aD^C8IlC3TgAIOQ6XZ#7&@8 z1%tI!&Zt`sqa{KioZz{1J&6^;$3YnfmG{uYz8(>UF};W*41d&II9MGL`}V@fxmK>Ec7bU_uqN=pD@n*9p4YhMc8ey+m@Xbdy^D-Ym~D$p**e;tjpQ`vZ6RL#mm=^VPCe)#EVS)$#7az;pM>3-z-b`sk=j~X zRzN3EtL{ycNa3qUePPDK6p%@*rf=-_JOICko=ixCHIxv`F-_zU%LBv1t;9~QZGf+q z&<65Ns&J`gIlH*=6s-$|@(M z-`RZPz)4~yur*@dF-!PuR&Zg+xlyhy)SR?k4RR?I)WX2YBr|USi0UqEK}AC6d)x6L zT1KnXRLbURTA|%gUT?fdFL`Na)U^iz>FUATn>)`a*1kB5suA3s(mK%-TGU@}6&&SU zCwOz6peDZYO?iIil6z=0WT=l|PED>ihnfa&DzpX@H6*heeZa$-`ZuWzYr^KY)4nK)PIh5mb4qgi_ z-a9A;VZ!_j3NpLjAyD2yq8C34^$#}7sF@bQ>PyDRk@7kYOkeZPd_jF*+<&|14U5#c zmmslHa?ec-yPu!pU>M+ph!9_>x+f@NN6*9uH1>)&S44V;+1&=vRdUs>+U(u~Pv6wk zhKIY5=Mid9Ow%F*QE~i>5_x?auvmac_ z=+XR#mn`mZN&N2Y-C%qG+PG;$yfhvp7&-x5T{fnw9xO5tgcT-lEe^CWKB{gXQc(?g zeY7{Y-13sfJAG(-^o^ERiX!sI`TwFhgw)@Pk|~w;$wZsh`;<*h-&mCaZWy}CJd_4f z^Nl$*N5o;|L#}c{vK!>|H+r0TJ`mbo1s%qH|NgKnDIF9msnD3pCJF;)nV^Y@Pewg* z*)?I2^6bw$bfo0H?h~sU?%EWZ2O4&2ysToU;y zq{X82lqw_e379D+?P9RUy9Pr68yVRTlPF|JcSPWG`C%=JK6y1<2P>PJ;L-wIym# zn`we`C~e@xH61sqqzLz9({TM4i~BU2tL9T8j#8IU=jM?Q4^G_#cFyRfKJ6=~=Ma7v zmFhR0A7i;SucP>1(y(s(qnO8Z@>Cd`0w><)D#g7Pm9^_l6$4C%neKKL$j(*X$pB0~ 
zsatVlp$Dd}C>23QkY~6wpdhR_)0-13r~GOhf_-|e&cwto%;6Y#M){*0Cvz|Xm@eF<+YFFBQ| zl~~|@tn_c;a3K=#X;u&mb?Z(avsM_SK;ZYhV6B1%`0o&ow@5GUdVlBRGAi>RwCV6!E* z>hE)BYe}~`Rg>rLR5xW*U*gg8*`(acu^q@-hzAMe-3B3zr zavFW(fU5IO07gwaXvXYQ0UK|Hwf`Df{~52g50Q{@*P1ptGg^Zka&Q;L-&Hq8#DVj| zZOGoE!3MQOvL+CUMvOd)`SY(v*Ud#ubR&QzZ~<$-KmeP8q=e@OuBl^KKC;7(ND;_B zAV9Gx;_4O(4rGWd0+SRJ>0Gy3#hWa)kq45PkvDGKNTl@eukPk&vdTfS?o$N~*75+y z1A768u{XmgsP6_Qi@5NEO?^>n2}esUJ?ucnovYj~_h-iWSA`@*@_Lj^q-4IY3U!WR zsqN=bK744*C;d`q0r~ll$a{zx7so3_PY%XrLE&=L{iIjCG~7~bH8F(9S9Eq z-O8f6Vn{afO;yjj2))w>$_c;a;=xig8CcBt6s%9`19j8`e-}h~s{!JtzS-(e2&_p)S3b(~+8kD5|uL0&k@Y<}lmVQEO_bc{+ytL2T;4;;C7utQ9@s7o_J|XWGfR9tP zSCM(DR|asO2!dxFZti!` zh0qXpn@|E6I1rBX`^@jB4T?U8w6y7Xm6L-sFjNKAu*4l#XGdNIp_q5GvK`E|GP6SN z&`51+WOgM@f}|J#)H@RD8Pb6dWvU#=#?cq>Qo24I_S0fMm*UT3{OzcMTWEu3=ybD-!Wec7;Yp?u9j$Dz#X2eob8iw8J- zkQGG!!~N%}{BV;@3e3^rMxR;=1!Ha7kThTpu#!`sCIWX*&oJUY2P296PUK2hBaDKQmCBw(X5m!6hAlXf3NgoZb zm9g_59loyu;o~k6MVL$>WR@M5>F?YmC8PbQUz!eao){zJcABg=pyz!|n`txGLtsBi z4uF2nOkU@2crUb^1oa!_Rb`^BaYl*LolTmQGCnci@YfBrki1+%jZ58ejlNeI1=#5# zmxS`s{#F`hh)QixE!j;`#y}II@hntU@3gI}xgPoH660E}O**B4Zj1d7|IO}tDr+7h zS=MSa-PVWa0g0R#>A-!3p*&S7`XQiP6zCL0+YRx`UHtp5@z3gOq51Tz65?=#pzMK# z8}sNbQ&QxPJHK%a8y%<0Z|8-pC(87Hd>E2Q3&gn;vmwxC=Elk_34}T_&elBI8S%(9 z`V`fFho?X>xx)#hNa3d2sK!-5>pk=|Kinq=Nt9C9e<`v601)z+zMZuzqVFvT=-m`) z)J>=SkB9xIAGFs2TriLF?u$7Q!THaZCJIC1xE7E3uJVsE_%vTO>8^DSh;-(yEtfWJ z*S9m0*pYy?9t-4rf}~Fvy-57ezwVFULOVoarSTdw+BWltG-;uk3CM|7xs~EgtsDhj z{tO}cdzqlKxX->7Q34g|@SWag^Lrc2U^Bd(T88{hsi9Q{@3D0dX;BR32|mbLSY$1} zjIJiLz%-10(p8?Bo$aAAtbnSP_-TSs6}lshyY&5Ea{!}P1{tKOUgdb+JF86J<6~pz zs%hT-TM7GfFZli%=;meXd5^hBPFrznfZw{e({$`4?LFyh^Wq2-B|&Df+E8sf@a?Nv zjq5!}S|zLx-E~0u1v3ahFkZg13OxpbB!Na*;Ai`*Krdc}quK*aJgi`H0J z24P7M7)Uw*LsqMAhTxtKH^IOUtRxQTYC2$yX?_5z?t%KJ=p;udgFqOI1IL?$t_EvEt z&XgtUZZVi!S&&-vm-&Z2*IF;JBg;gj3owG1``oJ{V2EhE#+9r)8Fb z5bdz)rQphxwu93r1)l)2*}y@Lr%3eIvWQmtr}XqsV#L5p)W|2pqGx$8Stt zh|#_Tf}Waf8Bjg~m6mNc!>=<$nwKgVfAeirn5t_dwf+?@r1t z-F!yMkZCx-IRKkQtR`-QKWwCA 
z7BH?c4`mag?TQDZN_(q?_l%f+e*}VL)Ov(Kl2UWHw6p1YDF|U1fu_L&)>h)jumB|YC3;*i*gazFtvKJq^BHRqbaByry5Oyc7L6s zJDuGuVhEv%s0%pS8tpBB{#es@ZrIYx_x=9ryqZ+%9%S*}aj2OLcKTFLs1C#)b}eE7EtHaOm@LMr+8}Jc)V7zrme?oR2MeOut zK!?i-xU*`>t^RPmD-GuDswKLC@m6Ts*e{#nU^W(%M0oj2`uxrmEY-9B`RR{Qu10^q zV?e@sc8p4wD+tPJVNe81z>eJx4qg2EViWpIY(k9MX+JWUEEb~_+` zM=emd!3BUYjTibdaGU*S1^c8Aio{sOJwEz!qAd*NMoW<2Fjd}OLEiM&Qwj=zU6aK* za0~7<8b%|%cy_0qJyHSi4b|IiEi^JK@85P}HXYx>T?yX zXznCRDf8I-$H1=`1k`1LTy7&>CEL4qcx^->B#A>Z@V2ASatC_}j% zX*dM=zRs2?)nBC{k5~!blSGkfC!6d{wP@l&1>s}*J;(p6JKqS8X`@fMrZl>QpB!7p zX7|kGI86l~Hz4Q-O-D0j4-TdQ6^wVA{ee(k_~^h2rt=DrVnR=Q@kIz>>NBuYPQGU^ zd{Y$NKorUE3!*3KHIYu!Hn){skKYYD;ko~=y=M_I##*cumqE$O6*55;=~cmpQje6F z@Bp^wzCZRFiX6xap=J6$$G?N}&VY#nGMZl!=ZfCAe4{-EsE-nidLW<)w9N;$vhCn; zk8LuhwQkH&1-lwQVWf%Z=5KZmo|g*Bgk6DHm(pAj?(Vpr&G_N&t>nFS@Twyb$FXV0 zQ+dR?gu$T}+v)Hf|Iukqqlu)?jz&5B@1-+LB!!t`t5GRH2B8FaAJXl^qJ3vr2y;-5Z{Z}6IG`oB^KLfvoh!o1alKs`tmZIAt%SR zPOR{#Z1VQZ*laVJhzHZrUu`l1p;dR_!_h&i$Dzj|r=(u5!vE)904xek+7$YpXY2TW z0-+LGP6yT_Us&Bk>BHX-2>E7=H)!V_Aa&`DW=RoySn*mV8PZ`ls#LQ9uJK%WApdH5 zZ0rrDMa$~0@-}d>#InIiyF$Ex88#I;P3hEcGw??HA5cB2T`N;gC|jC;4fO+B5KRE``8wQT& z9Qyg%9SN>UF@>c4NIGG9Y&md$^A2*Qw))Olk6A)o$%(QQpeeWGZhJirzWt+0BgS#J zZ)ts#tepeQZ$2W~?5l(hU^bChr&B`OMnGp0d>&NoIAAl-=&VhxG3e%jY_*NmK*LVy z`kvau_-#trQxs`LAa4x&WB$fGm-QmVD$@45-^!jQnM~5S3kOv36QgIxZ@6ip5W4~)K$s7Ip{m3G?skx=b&VhMs$JMIM5%*xS+tE<8= zzeovguKvs8Q_Pt;&C*q4j(k|*p3EF#wSjJ*8G(q1l4}}uCK*z2!R|3(TsBg!+F?WA zgNKh=W1WHakF(bgBeUPBu1cNon)yy?LtfkiR~vWiV$BasjB zp`@9tJizwu(pHz6CsSIa2Hs9Am>mR&)MPHrd-6TdmVVfzf)W%CRUae@VUhBC<(w^S z{*@>{18T6&|Gpb7rS}NUWNsPZ&B>M0+-f>H3jq5>zD1bJ#9RFM#3+05sWWP#K#CTu zq%f+je&)`fqZ>-OIpFQk1zLgA&sp2iUws`JAx*-O#KBT!vwmh&+k5r6E<1FxZ)dzY z={fm#SM7p=ChkcB>9sj%Yk4$*Ff|VX1S_9=HIWce4MD7}8S32{08AbweTjEk3hH=% z^`cqW6X6mPN#Ll0cb4}BZ#4(?-DTn$TgLRDgQSCJ$@fbRmK`c=F+$!^DNT))f49-K z$$>;%;PS%r+BC=S30R-+7poo&Fk!9xGw(smK_gWL&0j8($tNc%ZZo-wh;VO>4sZhP z%9|$KyZAU|3jhFxUhTO`$1R~=v+?{>nN1%HSKS+D96ePlG-KV=0o>AJ=pwhMqJ*tp 
z{+mv44bu2^Kk1zJT<7`=Iy3pA58`vgp1N26qW!l{g)0I#Fi{H=Tw!%!hD5b9 zlVKWy4pt^t@x7>KJ1nmjet+;pI_E^idn&6&Nb%%1GLAf#J$LajU=xe-e+-lFGoYDr zVga*^d7^Lt^58nj5WDOD)Ho6|Ih{sVl$ybgmjLM|LB~goDul$`+bbf%N2MU|o+K08 z2y;;@ChLS-RE^J8@4>8^ks^#luUh}93b2N;a{orug;s4N! zZZsqPILHHUO*)!fqKAT1h(f8qPO0bWxbB2OIHcbhNSK;w52~9hVT(J46)m-3*v6Sk z$J<6|XjSm$-*#2m{ot_65$fNQ0asKe3q{0Z00D3xx$8PDt368e7I3>^JDREs-2qsq zRY1|+WBajuC4dDqdK@goMi6=-qHtol0PrVDjR9dC+iCA6^W_Kz0WbKHaL{Bfu#43) z5!%*`y}wfR^nMc7G_#F~U$5>PuWN7)z5grr8*66}qK zG;_bRgco3Xub@hlIv&i0`lZ4NLM=lnE{&KW5F7em)%(ke{bc~>Ky+;=$+X9jK(U;J zR=pmsx$2YxF{DcRH+h4-l(g!`Zr&JA0vlxB)|(n%Tv^XMA`P{S_f8 z`=^P{1&;xd^scb|od#ttZ=Dhj@KY5O29UGUOHh7C7{%S1G1bbHEk+H%wWjOh5s{WBm0h_X1Cf@EGUVWmYA&kfow&+ZD#Wy-RqwkdStFj zx+G$D;$RM}k(ep`cD33DkdtSI8+Zm1T?mY>|9(Z5zJ#D{VFDpt>8f>&ygPi@>UXLtLxv_GU1#FnhUBp;Ng94rF~ ze(avdffP0LwHG+0_kD2#rZDbVyJ;mr_j`7D{&3GOF9vOmtZx1x-NT<}9QYUPSjQST z0tpC8^+-bAk2rK7QGIj>aB)EsfUtmI-<>SfH7gL)3Awzw3)FdK^Sf)agp%S9*5Z$V(k48da3c+O;1Le2&Vj1nq#Ma1 zH3v9=7u+uDg~aKH2ZTsMtuxR5Vbz$gp=MDzKI)R3odJVNy;)eS4{~6 z2icUcpJr6&RotB>A=1B^hhiSiVhkMY%Dnh?4>T5=s;D9JALgCF8fb0MHOC+BNrn1& z7*zmpoAF_#FV)m_v_WFOZUMCrh?RP=a}~4YZ8qcyx)IVK18_}g?9M`Co0K=uKnQo{ zfYiR72>iD@lbk5>v^*qx7cN{FJicNt5OY~l3lgX01?oIxksT*HABZ_6m{*ZUSV`md9*C9xn_l0!nl$1KoDojj2Vr#- zG%St_wR;OGs=pIWyH>J{4((o~5Hu6e-m|1lykyLVS-D>kJ1wwLlU$MTa(v-E3{*}K z+_S5qMm7aEk&Tb^__Lv)J|68fc5Ec8GvP7L(qy&96rlNnrQn*E3xGg_JvD_}Os3n|5O_yaZX5acz_En8!RH6FY? 
zN4;$7+=(Db4!u8GJm0W+7ju3Et2Qyl_DZ-wLCo$In*9hZGIdFlS7vY)h0*tmga z8)}h7M{@ev_2~xYfh0j2f<|@5`jzpJes$rQ>N%j&O&HY*&`{ahKk_ETn^jUp2T%|H z-dAqd0>8UFHl>5M-G~GPwS|yhq&B(*pSYZ7ShfHqB^un?>{?M9L1wO1CsjFfk(&*; zY^NPkQ-idfgBw@Y^+&9R%-&$(_MH2R=qJc9Hfb z-eJ$0_r^|$WgWowduosN%EvQ`o68MT|1AmsZ>NSm#}Ve{D!Sl}%>=_^L0M z9Na|i|2*&i`%|_JGO#c=92ptG3)&$ecda`K!^S?`0kJU{)&8~oPC(}avMNIJH99D*!x{2Dc8)8}xljc~Ur zg6eGm_b8Z37oRfmM$N;VKr#j-47oGBkhIvqZTyO46%d%fw!H~bnD6%Qf)v9of~?Kx zJzi;d{_RwilhFnqXv|=$x)t|W+ziCQ14^7wa9skw9df?%t)Xu+LmMOsy&#W3>>?J7 zQSI{Og$l)WM@t7NZaW%heKJLmoRp5Nzt&e89`-}&R*&Pn3BuGjPV zcs%awfxG})`-`N=j8yT^lBY8BKm%P27OdyVsSp=;EgBDW4;i&;hZw;8^8)oC|LEMW zO1UbP@~F7{DZuwAG4Ia)9^qbjQt?4iWM3@%pPyg`W!ah&QEOW^TONm4T*a z_bLi;ke*eLD0~?lmsD{{!CE@6(Bn&>{+NT}l9ogLG9a!6SIw?IBvKod{At3_J|v5l`XJ7VBYNoI{$xyfbYp zHXdrbMG<0YS`BI*k-C%DF5OiqUnM`rs~9hordDM1uIc7+d}L=MOf=1hcU#YA($kz2 zWkE7*rVJ3CL`eJkj9*k`?cX4M91}_G0eXip(8@tBLB-K7@!O|kT1hCz=u>uAkJG`sAo0-Q4z!7{J41`0#Z%uBQ6ui(KrBYSH381BC}7L3K03^b)KU)rZv}Gz zQ@u+UCJq!e2^!6RWyBR&#iH zgg#0~>G$WgVKThdevohNRgqlAl~@zImxh4a?)TZ4m0IOBsJRJ1lu2JOEyq`p6Jf2& z?&)w7K8 znzyIkf9VT2xx;vpt4D~5iqip!FdKSdxZFKCTA4?_em%K+K%YD9YfDR7T&Q4qlb+zK zCdtf=cvrmuD=WK@J3rPyz*t=w>zMRLvK0ixn=8SeuI&GD9I$Mi8gBw*Ry9C&;%Iwj zoiQ2&t-EJI3Ci!WWH#PjIO9G?yf7ZupFATX?n1 zZhZhJj8wLmgf3|zMcVM$=C4QCML*8(dYRX5U>F-E{ z|K#!KVWGFm=)geI^9|^a7&yt~GxPJWVfeKs_;*@Xuzo^n=E4)!GJ7YYjR=V_l4|W{ z5+a7#oXN}pofM*#KEO4vTZ9vCwIyVYf$CH1I2GCd$rV7aD5rp7{ z3a1m2JQY#NAVb_-@-{n{s~X&*xby2Ul0{s*yGEWsD;16hD||Kp1Zxkvh73&iU+u6S z6g+<=fc*hAgOOClyB=^4jV|{Xthy%1B}n}IE5JmMs}uMG$#uKKiLbz4nHXvj_hbZK z3Azu?8D0@sFEQ_Y?*;$Igr5zuybOG*@9vOxOoQQuri-Vwq%z>i%%u$~U5F(8`m5@W zai)xM-O<(7lA239p`&|QVAcUaMMHv7lDqye&`D0k#bg0RfQ}kgWDrfEzjyc_c(mf* z!lUN~>{n)b&j9#ByT9+4lvgh-#bP0;kNPljx_i;fp zmVW^<6D_wn#ciPkC=u{r3c6mi`9sln;D=ShKzB1U2Bch@LK4Px&60oEql&4Oohpd|dk*uq^mLfTa}zEOk9d6yVy8hNF00>gVw898fOq z)AacHB~Z33B<U(k-n4s+0u7e3i7Se=e!zyHBZDav>40hU$>n=3l19VDg zz8b!K4%s5Ke!ydDBFe|eyXUaEoi_e3Jw5$Igi58v2X5%}#>MyV+=BiGw9%&c$8ULR 
zMBa^}*YiPaO0kB0^N?|YWRXgSk>EoOQFY<26}i&p22LZ#GKmqT^Kb5Z!fYs)7G^V< z39)+5*>SIqN|fzMn3QX@E*m(9khk0BE?mtHqlIVL*b>Gdct-(ZHGcndeZ?lh0@1Y_ zr_En5JmT+TYW56Oou7t~j`G?w71xO{n^y-up14_dsl?H&o2yx2jCUozl1EifAf~q% zx-IPJ!yRXjZ*G2@Vm|i~4CIi0_fGe|Qbgvq@E}@KBg_RD<`T_p)l=zD-&FIZ%b%X{ z0?C5FU(jEjKF%Q|B=l_q=A+qu$2a>$M3>F9ucb&?KTzUZ46hZtRIReRx*xeadHUBu z{hx>h$)5s9aUsd3^b>WMFy_(h0DpSw?*9r@!SDa=Jd8&UpQ&z%?frJvuWZN%i`Xj^ z`Bh&Gjr?tb`-{hZ#caLPE26jGQ=3{o2cGI9D5`Js$u5^$eBv~%I&|jB!vgKHm`)WB zX!tr#2=w)!=vwMYzTtN-W3`Zwr8hnws`uQT5!F(Q>sdUiD`&Ib6h^_RZ-h&_InL%# zPnsOfJg=IVYhf0I(lOMCNSK$e_HGD36sZ>y4qwtP)Ym&MY&D_e2)s`Mx7M{w;&l6s zJFMwS!zDQw*?D(>`tM>wX#tW;5_2%;ri5k}i+Qnxq{6>%ulT2#;C*0fj0at4z55k5 z)CYJif6`9vH58oqT2MI`0a3ai`Vm1gIaFIabP9s;&a4;71_(lTW~?F^8lUvhhD|#| zfsO%?O-H?HtCYB!fVc2u%Hk+m!0NU0vaoB;+su71ucXY((w9x5dA8dWTs8)VZs9 zx`QxBskrs4Zd^;gIN3A>!d1f

z1cEv%WkSw>fVC%Dpvh_Ieqhk8cWs~|-a>%k`& z2H!DMC`4k^bK&niedZnKFTkMZnb>r(Ohy)hDGxO_9Lb(oF@opXnYYp##3L-Ae_B3~ zm(gQo-(yXLj^66}aS@xDE}>lY=d5WpDPBJwofLPvS~b2ZwVKR_+o-DlA+^ecZ5{CL zcb>1Avjj11DsiM&xH2j+xiVmM#(BO-@E|ur#fa2egWjS$4`2?(n3|GTcNKk)B;Duw zq{Q)0-49jP9Zs@?d)4p`qYZZ~Ih(II0RgB}W|Kr#nU8&_(Jpz_4~d|y)-q%+LQ!b` z4rq%-T;BDHfzQywzpG}{eIU|#%7PR*DWPYV2flP>pi(5;Uc!0G*#D>CUGr}rp2!Sb zV%xEaeZ><#U@va9Fy?cj>k#Hu;oUo5+i6(G_f7~KR9wFS_C0^yF1Y*yrsT4k=-Tk< zhPxwY6s#T;fpd#^ZfE^GMhOi88im2{kWP`FygN_l%=*)BJ|xY8e_%2;(qZxC=1VU9 z|A2C0Tg4*IWsf!{aed?y-I@4c+u@& zDb*XHFH*0rN^c(|XQg3bIebqhB6ipAa-%)u7aCJpEZ=$-78H!qbm} zUC*gqR;$L#ePc2!Wd_JHPQgORnosbx{UQtgr9w>#cFDDGkz=)C&x1+(ypSPcKkPP0 z8Jk`J_t^8@G#A@I2yR$5=wX>WUa>J(nGaKS=BfdRvkj}@ogn7vyqI|+$5aF@7sagR zpS&j+`~(#)Xjg_~i&%a2BhW&A1~Mji8_SVS2S`>6d4k#c>lWQKC3)!jsL#S#rt4C4 zl#R=H2JrG3Fv9P5s7lTWMiAGh9F6+@B47^-q4or)PCa856h)VBPvqiBQ*A0H&(Y`6 zmYvy2iTjTyTlE{6g6v)OCmO9LGE$=1_%c^ZA|W3eS4_(hn|rT*T$w2X3jZAM8))t5$|O4<$LM?HoKWw5%pX@|4qOgzxE)CBGUtbZTyrIE zfj7U`+WY53T9_`-ApBdRA%Ax6*3N?JTdHo%OZ5KbmMDL8Q@o$Nyp^{=@&3 zRl#50!!Bgq*L6VZr(ssU;U0qnl1ptY*{{o38-yL3F4ULvyvH@0LDZ5K(0`UN&B8_3 zio0b7vi=otNB+FkuSV3%QF(k^=;pz9FoalxD1_iLtGfZAcc<4MsL(n+EtQRdL4yd8 z1ns~~OPKf+yGUR$ca8zz82P~RyR^wr#QAHZ`I1X3@1dt{)q8k``-J9)YKJnF;tscG zK-uMWlW(npA^xG|m&ha6MC+13qTeR@qBgo9xHjc>`;iHN>an<*1=D2rOw3VHKoF;a zWlT^Uvdr9#TdMR1yKB7va*1*>|0M_~Y8dp{u-*W|^>%U*z(AR^XO=y+L7z=5wTv;h zd=2%!4?_2h)&oxn%7Ig+e<%`wiC%zftib3c3?iTtLKd3kmP%XCatrva-Lr+J<^xD6 z>8dg|jzm>8#h58U49b6)SO4YoT-8EvzIiS`IrU}wQrR-DS8`Z+S$cA2D9rE4PoS{s z0}?aOAM#Risv5RKA2`wnCYGs|GvZqwsXmla9c~bOHk8I!=}1lkU={aV>17fHk1N*h znZwNF2iL5Gk6)Hipd`q-!_I_g5}VSXLn${Bv71?Zpma_hfzKhbl{gq5DI_GN%(wbLtfd&8kK8(3=>|w;5*P#HroskP<^(Rv> zz8m3hh1m!914s2IoCuFKgyge|KC&;nAe-ccE;A7!AE`~?70z^NZ!4^#iedIh6Xf$Np? zGjFI#!yL1|FMOvdHnt|trW_2Q{--GRAE)hFeBm$_XBzM0^=LI|-7<%%uzE$e^5@s! 
z03xknmTeV0f|*uU;@#jVD=!G5+1oY5u&kZQN@t>~(H1_~s>oOUjWn?T{?Imfhcyl+ zrdyc+)L#PYDkvN|R5>JO=uZ8L5@t2H3s}Kiz+(}|O??wMn*LW3@qkW{j zYHH{2lb+Z7YT~K30YP4RcO#~BeVy(`aDxjfyHN)JKw@n<`~j6o1qw zx!@r}f78y7WB6Qc5aF<OKk<_k*bMfC%+rYWn<2Sik>#3qNe!1k`}wpPo+>TBLt@(Pw9@L!b`4 zqOKK?D)#~utXT?>eRXUzRRer!>9yQ@{948;Y0NIyD8>U#c*JLRz?{Yiw zl{s~aIeYJd!*o{uKx!}{WKovE%z@Kqnv?wIs3z{qWOiDpL)}S`KRaUO*S4i9n_z@L ze22hcMqf*aaPgXW8hE-mU~ODtBt&2aN`h+^WsunVMA{kBPhPcECzq2;lVXEv0xvU^ z{ooI*)vl9R7p4&%-c`2sbVz*`xQjAKUFft<8QT~eb^ClPki3kBf&Xka8&-|wIxq+Q z^Hu!1vbL?L$jMS?gcvnAWuSFGeE_x8*8@Lryo+^sWEs~ z5B+B))?q_3h)D#R85E}ApPgEGzqPd4sGOZeo>+j~ti{E}H?Odp-0y?#hIqx{`*7Wt zxq?A>Sn$dW#SUCk^Sf~!aeQS9FGK&+YyOwE2J?*cQJS_oW6r&mu$ZUpgK2<*2<`1! zh`Z-B4}APhS8~B@V{y#Ku=zbI3n`GNR4D-!{#5)q-*CMrH#mFs=k&kOA02m3SD-9fV z*`D{~FL0h81ntlC9VlE?07`q#`D`6&c`XPREnbh#*SFjrRP}Y{;)~=We*laFA9D$D z6r?YGe{J9N*tAq+O_2qPOcu3mZ~I4f%Y%m2VXZof0_Rx zPL`vr$@I%d@NzSfv3Wzgz5*_Rdwi7@G^#-_zgr>?b{_mTbLzmZ=L0N}ndYO7(~<9a z4Sh@DwtEU(M{bwKK1WY^i5+;qBL{<8td4^Z`+eCTlLU40wD&6Um%3Oxjwy;hd|k%q zG4ZrU*EkeDS{KXqZmougkxFqQw#jRE^MP8p0exHm^#z~CvhSjCP78}nZo__GII;uD z;JKww?=Kgzt!2-l!T*KcaH+ zLfI+*>Aq)k$)0oeagoNksVo`aPr`}WzwSEo`9gM-W(KQ?>ig9JENAwY`(AgW z?;Mx@$8<%vjbf;HLl9U(Yuj*mAk(@V+_@&c>zyg5`De_C)sTq&2702=WCNX|L9fN; z5FIC=N*|Zru0OB=C7oZ`W(k8*MEidt`^Ei$2^7)|1{6G~irwpYAPa8alzUFqayNEN zfPQk-2(F()Z-i_I`yn^wj&3BM<5<(e^-B2i8Gb%NFZK^AR8&tU>NS3brCXQCL;+RtEC7)$8CkhFGs&oJ3eR5`)~srTYq zUmcV6*-QS#Op!kd4Le&uWql!=ir}_jYai+chk?BRcraJqBEHx_c8BzrOw+b3=XuE05+Ey=!h9{rEOic4+?-UN25 zKVOf;K(E4S-)|*f^dwKSe_im64D}2J8;SGGg0!~oFpvlJBx+${7HTr3RzI)eL*ER8 zIooQa8Lzd5FbHBU$7tBZ^dP5RvPLbiK$}Igu;k8rM2hM&q^9}XL-fB%<@eWA`5+PD zHJpGeNRWKvdbVoa^1hyc1fN_@n41UEtd!T)?wo&ogcmT@TR*!#p;Q4+z*Lpze?b&4 zO?URPxuU26ZQ@bh$rgI10P&pwcT^(X>DcKQa3zl;{p~k}Z{^_EW)xozEVj!r>xhcJ zq98GEK|$5v4V{1RT~0cAe!SID{QdL^N&Tg*_8#w7k^Uhyka93FC!YkN`kbjlfQ6iH zdBZkYaj*WnvzQI=pwE|P+gm4yihcHQee&9!Yj?~CHzo=pPq^srkjl$ZX5jFVb&koq zzM%t`f+` zE#^dkuZ1fiLtO7*%DUXIE!8&tpYxYRMdfQFf&V4I+kNq3RIV+AIe08h99pm`eQ#$M 
zu^w_#jQ{JP(|GW)dq@7{>iNO0nh9E%UqOiBd`5yrkDzr3B=R}0lt3sTJJ zsiUS=%-6P*8mhjQwZuz$5|op~g=({?-}5!_)}0QPM`3cA%+^I&ua22KSJ!%~6a!}7 z9<;znWCl#NrohHnWoBI!9RyqLG(jO37Hx8{Tq8b%IPTN8ektk0dX;?QU&tqCV_BN$ z69zcUWN~6f#g-n_PrY+zttqm0Wy4TY$J8d9d}_!ga)!iovEbt1ZtcV%QuU?CAzAEH4*Lrg z8Aw)bL2iz%Z2dOb-tLHjZB;*wb@|df{p!A@NE@{_&1-%d?IUB(H4_VefP_PRrh|q} zy`6K+yYy2M;eLm(CM}u65jRNp$%C`E0uB9GchvNsVV#)2;NLpjfvchdtstK}&ATI` z39rB=xM*#)=gQ7d1l&6vXH*GG7dcge{{3D4?|uz(9ssRmf(8C82@xH}jH|$Mt1lJ1 zYyQa-`5P;t)P-;HKWZxV$KRpw1hb~Tly}kM0(~|-Ue1#p?VL0#ycRu_T_9VzWXb41Vfm9 z8C<9r?CXu((CGrXgizTJaF?C+^1nnQ*@c)x4+>E2P_4u8*Q!;hZw*9_eB04NB$}Fr zATcU`3piEHiDCl!B7Oe?K>b&YDuNccsR-2^CyDVyMx>u4G2O;eqB$2a{DwSacQLmHWBeJ`L#@P%2}>h1(Bip@SIE zhHx>2Avm184QhwB;!@wGp2D+C*CPp&#Cmof}H-X$e zz{5!vOXR?CB?lDqF3$}RX`RsOxLkwU8sZNg59%b9B=JuGVvq7N+P^)7wxDW{Gymb^ z>eoe%a4UjVE)W(7NcP4r=dd?JS0f1~ve4%&cH zP;vG_uVMxK<%KO!e_2p4dp)=8sj4{%y|2s|^H64p)I!jh`fv!T745+^_tF(`S|CG) z6C|wNq{}j0o%jq)F#v7 z1~Kg`(3KHEQfCe2&VKxZ^Nk{CCsFf1`~1xT zs)8JKI7=Fs4!~BAlvwfrMnKi_CaRwlhhvxk2>-y)*QxmN@Jk1ZAnl6?(ECw)y>so0 z=F>vto*d!Nz|<$UXRj=E^y^2(OgXR%>dgI0qe{-U#u&(0+OX(PqfzaUl}sS5-&q^@ z&pS7V3in}G#`!n!l;32DAdSR5y()|KV}$foo616fC(ri}fGNy~Ey+o}Ewf3np(Onk zqX~`r!FHJ9tdJyqo!zqybfN?95Z39BKWlPS9j{n!)mOsdbzDOm6G{3+6L;b2ZFa6m zA3@i0P<08%Dz;`}o*JWo%zPLrWV9K~_4wO29n^=_%GotEpvO2z>E9tq`7Gl0()#Pw zEbJ;af6^ykfbX5@%Z_7;4!s0rriSA5R8uz0N44)Xe|DFK7h@hBP*p$JEWN!z6WCE^ zYv9@s66WyxAIdoP-T`!QZhaqH;z1S^BuapRd-!JDV}ujC`T&gI2c}d#@An}Lnal>8 z<)FPP`x|<+Q_oC+Fvm*nE8FMDU}@*u>X|&zc+ps{7{KixpXL8QFNg z_`cP^IT}pJp=!-_F^PGmo)I)3QFPl)i99$1iQ#56?e?4@pJ1O3YdzXkY@(=fH@pGBnKJ-{5g<>jtU0!roWoMp2c=EC zCEL$|RkI-ySYITAOo3tmCjBD1uyBO!!{rrwbN~2Ro7{(+*u`qnupq1Rs3Dr?gw?xD7iN6Ab%Q31oFk#@ za>TqO_p!J-f{D)t|Kl&g1nLbyySB`)gz7F6mbaTwyRh%G8xF+A9#N8F-`qF-E-u?BxE?Y~S3 zzi@^d3D1ImbVJN9`b(l*>Z0Ln5i>)kh?8==+X3&~FG(4xF5H$p>r8l)Alf&^=<@lN z%zF+KqmEWsS8W7ic`4LTRsp33Zy%VchMCB(qDxCuB@?c+7?$T2!0Fj2RvOPqd<(zi zZ{u-gXQ|jfKq!m{DXTb!gXvN6IaW>|PZ2!>NF_?Ok@55~7N?~6Ms~#H`PYD=mAQe0 
zAK7pLy#Och4N*76-Yne`vs-XmW`-#B3G+sx^s+iTxN<3{K5B+a%!yip#c}%Ct3tqo z_~M8+nNxTa*w(hLMd;Hq{t0plot+W8>J%V@9&*5ezX0b?d2f`IE{emJn9B1Cm-Ow1y#2HC{!Ju>~SrzsqC~F7|2`) zaPrghzP~(e$}Zoep}1w@aiaF&Bhj73j=ztJLOG`Q@U|C+hY=7aY@qbUtMcL0` zAuDr((_N1U0Kp7Y!BRA!ZCbkQHuyUWfZ#-_4uq%86=of5#TuT6`C>GY3_eBR9s6F| z^fk=+!U7P-O4czCFq8!lK+M0#ne*Q@u>bDQa`w?y?~U*3PH7*6w5NUVEdjq!RVh=4 zog+@R&1D;4sbC86ZV?+NTdvZC@Zj(mqGPoyJT(Qdbu`Lx7T9uu#d#a}u9f#EY`=M^ zR2`5)0=>zW4 z_=}YWNP!-gCqFup-x~u9jykk&hXTD{1{zWFS>)>{wwZHPOr(fAPmyhP8f+U> zGDk>q=mt9eS!=^0G2~y1ZvFEw>&l zVNLLT|D9m#LlpLnBz1uY<_^@xhz*pNh5%**7dc5+AKJ&B0G+*DqcvL^a2x7 z4bYY!fxPNHsSq@@2jC)*mlPv9890YjzEO|47ks1L7fveSEDTJ7JV0Kj?59|BuU_N( zb*Ja;;8Wu{iK9DkEh43uP8l3tG%*Wl2CXC%Ck+FcJk0tR(uQvLU#k^5H-!bRIu|O| zG~M9RBlWUp>SE6Mb_3wntJKlo#$N~~NUAz}>W6RvAzQ^SY+YVa&f@KL` zoj*Df<4lt$2g^ufp)rjfUpV$BqbC}Fto&YwcHRmuyIHf#gn`oZcteI~QB~fqG)@1I zXl@;K_ST%)ZV2QwLpOcd8eK>xKPWT25(51G@DA)i`U-Oo;=4_q@us-j>mOTFL-hZ+ zqBrfsHTSEA-23yEIH^MbZX z#?$^8yM{WdI;<&xGRy3ki}#OHYJ3Gg`X*6+N>Wz+;kQRrN}%a|Ilc)9!whuxO6ze^ zLgvu%7Wk_54k`zu1Uku$nHP1APCytn#EiEZ?fUCa23^3Gze_JLV2t;YDFk4k7zEUx z1XHl83IP%}%FYs_eE!{~@qhi9sz2<@=9J-!l#^lUBt_YwpCDmiKXeyFLpSUDE`8=% zkTNk{qe4|eb9*T2;^Hsw3z)oN8Wo-#P)$m%&0Ra7~THDNg{c7j<-#<}R)(SRYZR2#&nUT= zcXCx_$fF!Rt%!Ido(eqq)rVf{*QVpfxb!dc^=;_PEs-p@@$}IeHooPmDhls}@np{e z&zN-rsJCZpPHeUK!X#`T6E)>JM!_*LBrVOilh;|{1t!kmmxi71tzApNA1nztO{lJF z9J4gM25LqfjNhS;R_>X1)8OSx89gW2z2`H$gG}w2N64wF0m8p9Yw;as;8HY6YiVgI z)oSazzf0Y%hxEzNgoRE{p8&;;G0-!&LuG`frpL7phOUomg9tE$$g4uaD!u*l_T}d> zDHxBV$KriO@u9fdS{C15n~6Iq82(Y*IIy30H9<6!tTx7v zgfg`~%wN$<4HIG8vj9?n?=k)(K>5$}v*r{Qe@|D(xy#kbQ*xsuYb!t{>Si0rX!Rg8 zU?v{tYyuYZ6?k|mQV<8|LM~y#dGL(KTvZ}78g=YC;22tg#LMpmfB#RIX?`ny0x+i} z(~6nblr&L~N^%L-=PZ?#-v;EwpQQ-U3L<0<&Ae!pa`cmx8c|QhAKZ^u5~ftu$jpRr znTCNvm*!P?9(6uFOvOan*>!-msDGpLMGpIk9{8pc_9{Qmep*eQTg;ed(6{7VcB)aS zHlSPkHRoCzERocznmI3ZGBMqCBp?#^#1i1jxz+1KcWJH#Nf^dIo~)rz&ay(_J{4$Q z9#5@m>vX{in<*`@4CFtI8eNuH3>aOVPe}q1W;&BE?ZU=6wAQ1U+>$jHLH?G>lSKRc>5NDpX-f=@hc9R3 
zAZWk9mNJxbM(r1b1DEGFz^1RHVclEWC6D7tul^P^@Nat1W~bq43~V4j+@KWgk3xFI z#1y;N`!ZLbrg97P+;mj7Q*NC_k|DY(njw@h)Vbm=76trP9jI3`@o?|<8v!8UOu~wo zO?2H;7p%=++TXB}&NJhsb~lzVlncoMR7OE%rHSjAKL$I!Yn&vvUZi^{jc;_#2h*%{ z9QBkk{MedrdpVl3A&8b21~$7a*kMp2z4?U(?VEKiu%<^|(g&4y(v#s){zk9o^aLAF z>@Ye1*gx{MY~9tqRNkBX3rXfxo7-Ddg>gIaY8iTXD|3qBwL>8D-N)Xk2q+O(SGP?K z3YOL$Sb%4f;0AGr+4-&T0yNaP8rW$WaZmGGO13f@*2YujNjWsNRaM}-tQ)Mp3CKcM zd0ANo9KY98X(*`H-h*wq7xF{qR7RM*zem<)V=j2iLWO4#Ai{XMf#<0jy#-FR02lI+wc)cu z>v^U!fz;PK+-c(1VZ`3c&-NI&vawZ+H*NMZ=$dk-6wcKMNJodRt!>aJ2k!iKK!|k4 z7S-@t@Z(n?YczpDK*^)Jk8|?D37e}GzG_q1JnvG2{kgpMQC$8h%orxQazkU~n6)wh zyUQ#;>sG}k@Bv<>S5HQLxdUnwv4RY!Ty@SSuhi~_4@EaKM=>@aHP`Es>g1pr65m>W z#%idYz`EWT66#4Kd=6gag&x_cVR)pBw-W;ebpLx)Js7&YWr-cuLp1|S2TDF+s}9>1 zBo2Ctw~k8+`aVlu=uT9?J{uYbt)*EXg%);-!}3$?h1>p<2y@Vm$su$svYxD>KQOh} z6XmYdn)ETE#j|a*&!{=()TIYMEkf&+yi9%k;dxnH+!jh`%KH0eAUW#ae$V_DzBU@1A7iB?zNxtmjf@| zFW0U*CG%CaXiC1ZRL)4ATNShVe4ZYk{?cP*hJlQoV{fx*?mEC$r=wG6X&pH!&owy7 zk4u{E2WHVlQ4OzJ%(}VVvDLD9^QE`#6n%0bzJ|V5YA2@$?kovEdsKSa#}_y=Kd|zA zhAA$5NdP>~7bk0E^G9GBSJ#aw-o18rKY5+mG{3d=XuTNpbOUa(JQxmryvkBQ;0vky z0u$LOt$5lRwnY1@43GZS3+WM#57oki-wnIIQ3oOkn83=Q=luP^y;BH>SEbPgB!n)J z)9h036TwVA;7}JnJxpV6Ej;t}Ya!dVw`>`kxXSq2kFRlNj4w;aM>lZ!AyGvQF;MTi zw2}IiX6o5bE8RX`KK)+!8C7E!OEVUk1fTx$jBbdBi()OXc^(=Knw=VrgDe4uhjx*^ zyE0ry!OSge$@Aep0ALb(udRaw4p|Bw#SK!hzq`r)?LoEY>R5$8yyv=bC~ity&qrJ6 zz}nX3`JthK?iz2;m`8H;h*xV;sEB6A&!Ia;>!O|32Y4^Dh?{bc zW8?F5sY#sXP{5cfOF}U_YeP1jE#sH*=4n^0gSEa3$$r8+3?d3?nD)N;Y{43mWC#=a zakdSr@$fd>h7ciaWG7kHfTU>jwac0|TU1}yp*#ne!&rN)t1}0M>AJK^u9i2+OL>M6 ze5p=6s8=pANv4UVq;R`eMuZJx&Tg^zx!uXQQBG+E{?-`zR^#MTRtYVuIk3CM3!3K7 z0MIfU%ddkK;db;gijyr6ISVsel063MHN=O{F5KHJ{vd_>&e+_uS1ISRI%`+6XbWa#|Zc& zPptiD6lmXV{B;9D&j}}Lz#RavXiXJdckuHdQ};yI28@PX-T>9|*66a-U!IqVREZbl zxbX(8@%|o8tXy1NW}FlAdHC9xVVRuYaN2+VFV)v*kh#T&tCzu->;qRdo7*=eOFw-i zm~gpMSoLEx1e}oT8F%&4w7%ue0K25H(c8V9@o!8% zgr8c07{%ZHvW-hC-GPhoVvcv-E`R^{%5V1>C810{AZeDbcWT*6RX+BQMWJ=WeZV(b ziwrnv*lhy^2k)9@hkH$xTjq;yC9f2ZvYHZ$PYu_#!Q`26Iuxz=ffl=^6gf7SJZCOI 
z$NG{Z_Y;VC5<6Th4(%SdL)WyOZS`{Wnvy=ux594Uhac*c%)zfK>!tXQ&-^gSD@gkI z&ukEk4h4I?gH~xp33_XMhJw=&; zw6D_g_yHR_K&YeaaXhik!|Imp5=3wa$# zuFj>_lD}wHFlV6oIn*&_17fG19pRsWMvYxK&Ud=By?>y(a^ow@MRV#cb z*!Rkg_r4&y_e(??7FohoNCC^Ix-tmwlWLguK8ohIl}t9duw~P3s(ybf-A(H2UeZHO zFuV}~0N*kP$-ZF-$i=$hikvPO6)q6hb03W&Kwic$lXN!on-tTpu}vO=t$?iML2kLQ z`;Tviy5w8;l0KP&F1yb8dI_VJsF}RX&6)}KD`8C5Xw9i@kQk?D2+cW43WnwsX=$U} z3%+Umsz{lBX#TWU3O~sJbhR*REzJvv&#iI9Za$I>i_R+jX+!L=SZ;)Potur=_ z-$>7U`4a(L$cuL<&v#a89!%x~Wy0#Gi;SJ#H1V24H_B`VtUz8`cYLY&=RYJ&QXr!q zj^~{8F+CKp=@CI(9owK^TKzh{lbIVMf0$jPP`^6y%>Eyf=25p~rTb2^S1GGXDc3)T z$B~i@fj59t7D2Q~xzbhExzHG^$$j-#futc0ecj3= zrzj7`!NR8j7oxSz3{oYLOUP0Le-kVsL^n1x6hp_~@*0~o>ZBsx1C#rNnEG-^jdbtg z>vgH+1GL6z4uCnwl4fesa%YBU#c{fO)94~bWgyHytEBXcKRK0yp1zE?sn03$)IadX!TVi;G9Q z;Qb!i!90(VhutMPXRBnB0ciW7hv~J2ki^;yp;izOS;N``b6C%OpG)=R5Mlsr%laLH z8&9u*@}J@^N^ESb%GhJluFetKI)1c8`oG^R@ag5DJ6G$s0Ty#>9}c1}W^+pQj%T=0 z($=r)gRwgovJB# z{Qjbig@vMEYP)2fZwS|x`GEEf5hRp>3&xbvieolIv}P4hCg>`s^WYYlTVC5GMt+V* zx8%e2i1hXuxBPbtYyR(cR9yQRodJls$o)DuEeRPV*08_a{(kj6(zse6TFCiT?BmkQ zt5;#P(E%QW(H2Z-pd0hZ#H3%&D3>%y9mnq&<`z@{su}H=s|v*{SNF{9PE4k zuU9#I6?2Oc$J6vN?)Msx9bwdekaGn1VXPZlKb&^_*Z&5^X`k)FR?%@vN1jnFAxBs} zoWNk6Z~uehuI`^49G~aQsYj9hXniUKW)NU&c$q5GBA922A$9FD>eHYoa^Zx|bL>!} z(CeCnIIkS1YTM+aez@P40z!0GApnQ1epiOmd~jU$J)uNc@P^E>xnR(JO2+;P5?Wy| z^jA$#vMAnF-?vfYc5;WF?jQ!A4QRfm<`DJtU--Ry|B3KMB_f1ZGE1RibyVi z-|X}W!)Ne~6CS^U%TDMADTKZPTtVG|=>gjxkg@DXQG*xU^Y-*1_BB%Y)LsCz_nF?n z>sCWo7bNsxVFxm7?{(Ay1yU8iKI<0$CEAL z2|%DPvD&ljUyYt40}(D*n0?v)Muu%3OK&YjA`#gG8fKdl!dBn)j6yv!p~;C&F8aSP z;cb*c#t0^X6&Q|)Fo(?gQ}NmjeSqGif5UUJS;QJ>jti@64`A+s*n$C23E$DmQD+^` z4$-o$T6i#j*uDx`CtWk8JPhESE%J;($Tt!SR|f#_E!TfUN`86m^=jq_Lruy~fCGf? 
z9A#ge_HCM@5uw#Yi|#U0ATqTg#XmmA9ckJ8P}fC1$R0ZeLywXN-gs`&JC$O9V4Z*@ z*9Sz0?+!=!UxT-dE^)jREy+W|D$oaZ`fM)9Nz9gX8a>b)ezbij=VvLW-DK_dbgj`k ze5t+wL9@bUZTR7eT9w;K0e+{l3c-hi3kzKg!6f8ihV#8nN4@-- zitlYn;0fd()d&X&*W`7xj2_7ifL^Iv7Io#v*Z%f^8gZ9K!yMQL3ni>PXxdqy1kY#j zd`G)Qe5XRuin10SAZ1tqW@B-X3uL?=QnS97F_D9!FQaBo*C$u~r&=ShD&YC@&1#RY}j^#=fF@x^P9|NCjb2B4qD zfr>c>#jQPn;l?QEdae1g?!F01J&uhht<)KK0di_^2%k6FMxCz=mXfmb1#tbQyw5+F z-u%(ZXg^}h)^IjMEGma%b6apI1=d5yVreIUU5*zp4fTdcHq%x~c76jAvmU=}6=5HV zsB!n^tFQ&Mc{`$U7hU3}mi+2k$8tz=%FcX0i|$>X!BkmLJ^TGd*LKvVPhWv(0~m8t zK5Jt22UE5wn4Zwk&7hY{nD2OwwXmQb65sU0S5;f@f z4aZjP*puw?yG$CVxG%zN5Aufl(URfFIxk$}6hy7}-4Emb$U6y=;LBdgeo-pu@Kh$7u59{H`HHu`0tPou_OZ z=|hzLlsGA;z#^KX9QPQqsI#6S;+}C!x=+6~#_oBzEv#@)_6GISw&0u^cHw?7>4VL( z9Hq-O?BNB7qSfrpN*$-NB>{Amew}zawBZLW{8qBovtx>|0nz&tV<(zWuPC+bG)BTj zB?s83y{g$?cy>x;wb6pSrSzCg>)h829uYM~;S#V|`^*hLI60S;$^onN&de_X8OPD3 zMon4lhsR8k;u>#QPnrVFcZlRhO&Z}GDlg~~6L}{&wklrwE}jn@ISZdiCC)8+!G%xR z-AS;kf(wh*3#gtbAfNI z%K)|Yo@L+d#T1zVZQAfMh>(*2VD|8yfp-a(pvUvsOGR-u=R(r>B2$I*b(MijJC{}m z6gSGo2zxK?kpW&8)nW0&aw8vi^vM-d@~oe!O&2)72%F@n4L@pC7}>bdsEGPmdu#y42s)(U!P-+qqglCv+8P`5VH!XgwkFh%#ZW}E1td>Sq9TP#2(ZFN1 zw4(PPJO646T_0ecw+*1639XU*>II+Jq4Djr`krn{j|Rs9j=BkM#^)cya&Fr&wfhB6 zUNmq^%JF#6qp)KGdhj6|v=|rZ`VO-DlY-9n`R)UvER#{lEeDupP8y8@Wg&+?_NPbY z)B>pzzmSVtA}H`!y^eRmh(ZU#lR~DhG_JMkYrm2}ur1*+MJlE;wuZC+5Od-7`=8Gw z)~OY~ytjLQ%H(VX*wtnM&=fKW{;>r&p#>5!0qxfd(qmf)2-kOeh)-_X&}bGF13m%OLUs zUSM*<6=G$EE=Uecpe(wUK^*@ez|7|27K$<^S{*2$;may*vdE@BU1B;4&BZH%+D_@; z;FijN@NpcyI`7+)CiexFR4Uwm1;}E*<2ui)2=57}c}15^|GDwqX0LTjH;;#kjU6*k zc0)}`za^-=!if59kEOD&)QJNW$U89^8E*@Op-e3VT zo|*TIyzMT*Qict_%oSo)`B;GbMncDTCj+nGgELO%i%ecypbmNT_bBk6+SPYh&^dhF z+P)!4O3wM*d0I=v0B}R~c#(k(_4xE%5*wHZw*wHcVqU<7f?C0q%8@yQIPpHOa3!Tc zQj1tYbM^C`{|D=<$a&TBZxBalM;!oiWL)Oyf_7iXJXpv84)<8D%|`j~w=xBE=Xak= zDgevTy2Ja{Ba#L_?sqmxjzMzVfA8XhGGHot_oKAPjEqvw5VD)<-a<%2yz^XtFSN|x zeEg~GIVT6{O|wt8mw2r?l!XFy`1#j{?=QX^8sMGdj-ytnkXZ$^n!9{8zig0l3gj&` zQ}@-{b-lthU0ewJnJ$VNF+fvV26$aB1bfl9>xZ)m+-Z6aDaw@`(50HSUR+daS^|t5 
zxeH@!KG`wArd?$3bOC`yV8Xy^p7o$toS0)~;ro-YzVFx*J%!ur{6O#<2w5+Wy<;6hU&krc8mEirRN%9YI|vOi#SGMA zI|c)YWHhAg_%6E)Z2EfV!`l_BvefGLuIamdA@>B;c7M<{u~FL2Fg_qA1x-GG%x4eo zK!+JrT>JPZCYVfy*6O9VpAcu|_2wb6h{KFi|0STl%F;F55cC46!SgECuqg%&5*4N9 zmc;;Wea^p*O#$)r`(+}mBxe z0+qp+&e)1U#JwL|+=d!y3VzT>6Td)w^BJx3>BosRVlAj74NCSW_{Z_9{VwwYMRx$I z`R@;jUjz?kPe#O+kGnV-Kd;?rly*G-UJzi47|l#9J`Ao+KI>`VmI#+a^q&MlB1oj7z3U;$@U2nS%P)=Gwo2+KX6B%WkVhbwcZpivWF2wWI-N$@vmKnqx zjP3O3tSgTm(50kco7^}d=}{uN{d45d>OB?Xk$^|9=WW3|(+wD|owm02o+8VFm+wK| zck(9HP!_ie%)6#`&Dxu*kqLqG$EN61TdBvAl~ZV@oYW94r_GnT;5$8jQ-w1CzS*GaIU{Gfif8)By*T*k)nb_zU& zyq`Iu40)`1vTVlyWS}Db1B(>;X-O@_$b06A_KGY3S{JyU8^ zcGSlJVAg(=y@*6XIbN`+*rZW+?Z;^F<-6+8f%M0~@|p*yA<7=&p7L_cd^7WSbusfO z(x3?Fb%gKuHzFet8^+hkJH93J4$PP&L|(3rcr6gvF6w*F-EF zXaM<)a&L=0B129wn1NzpFgTUjC_}+$+|W(OhnbZAwJ1GXwb&9|E?1`v;p!#7ncq5c z`pY-DLfI%kufryWv1mvwDI@~%qIHrv?Q?-nTm(p*kLO9Rds6XUzTQ#IXL0P>AE=Cj zvJpJH^zJz~+2MT`!4hi2bwSSnt^njUMMxiLxU-?7`FP;s_gAr4x{_}JvVe$Lf`i*7 zt_n4Y4Raaz2>2Lupj9xM%Q)pqVxp{$Pf;)*7*uX8m`CB(*FQypy{nSjW&2|L1%L`4 z%SrqJOxliMQnP9rtpH3YDZ0cDCN6;v2cb3x+kf4s*MBIw$_6dPkL#WQ+JVsCe(q*m zpf>0oO$52n|M(-l!9;NU z*aU;O+E^02k_`|PLS)P77BPf^cO@kPyX zOYl7kv{gO>Cp5e*jnyyynQr%^Dp~0O%O#Uv9RgFf^U(VDyYl}t+3sAH5%Th*ryef-7>EqY3B>N~a$fMK0i*jO1rmkh+A_y<3;E&&xVRe0B`2iE=~zaCNjYiF4(fFs z9oPm(d<8yYq9&0(`R<_e+K*rBQ7F7CLjNjcE&A6`zSTVbxV)jFoZ zM}f>!>i#J{GHBlodZQGZRdka)ld%~{D{s;`K%!T%U24HI=qu7R3xG~5`lOEtxy57B zz<+xd(?@I#2&x&RRX-DlUvDyu4TWmPv_Awee(7|=v zprS@<; zEr6sYMFYUo>%;Bih&Qmp#{qvZ)_|qZXbba~(NWO4_p<_mwgNgvt(CYMDQ{QWnI^N2 z*fsun*Ux(bAolkn$~76GG+{Fp13GIG7h1DZn;!vZ16Lef_F~FVD)9R)2=E`06-hFGznlVXws|3qNy;a3&lZ}rZIbit)n2C(6d1j)Md8-4> z8o|hLE81*=yaIW+d-NKO&g{%y0r0gNn3)<_#WU z<6fS$pJS3H=z@-S;z#h31{MjBG{AkaP?KZyET3Fw%fz>D$VEO_B6_~W~i zpB+VLHCnBHzB_Po4!E%%eW_i!cWN#9hK`a`lAj15$XuMe0FxSfanA2BP0ozWl(%N& zIm8Fe4n{37n90eRCga6H9WRmdlt3RRb7t3rd-Ra~dwEYZ8*9$(9;RPNNsa`E;sABL z3{&PLQ^A5<4fjt4dBf|AyM0$XSrzt=tAQB4a}^ zx?&L+=Gc?KctNq~U23Ba=v5C*dyE`mar>0OTlu~uNW0^Qj~C*$*89jHpJ^&vMqhoM 
zmFCv0s~fds1E}`lQYuLv!Ap11C+~^Zo~t~rV8BNp25o(xht>>Gb&qgNX;xyGf%#s# zvN#bLd=D|}0Ze;LBR}jDqo7arE&>0?Bw95TJz3vD(7qE2@XQ;LWM$f_o=@EcdCqI^V`itR7p4*B=Xg1C~J?HE8CmfSNqh+LXaNQve6* zxF5@oi^AAqCjAdI|1@`<;qAd!0l|Liz!otsISY>S5!ep7Xj{1`G7lWxE7KQyyfii1 zS$9_guW;Xk1?hAr&W@KFca=3N>(qE{J>(TB-m<}C17ei{@HeEn_av?uaOc>kD%YR8 z&pCcQqgZCstJ9{_U+$RkH+Z=2%|vNZR{)@;el==yk~5i>0N#XofM{hKEM@a{cCTjB znFV+k)EICp(wLH$r@-0d)L-;1gxQeW>XFvJ5$C=hNE5jYxhxqP%~j6ZsDsk(2TjqT zrgF;kE)f5Wk1(CICiGj3yX@p?KBobWYgct7^MOt1jR%>Sa{>P0PGcap7gHTk5q-3F zXBcbLW4`zR}o4)lmJx=FfI~V^lnIxo-WdU0eDQfUU5OO9%ReqI8>%P%^_yFaEY#yxG%v?N zg%jfmK;**g%;~Gvm7w1LYDYBHPTp5;jk2{o0ptoUbOH-N!d_eX4{-6Q5>Q{|d2TF$ zDoWH7^f(rQlU@_+LxsOWw!kTIPdWMP5$=G}Qrh6ny)u16lOj3_2%wB7^N+Trcz_xR z9XC{txy4zLe)9DFWt0gb5dN2&N$`(X-8c0CxH5x%q#nvByr$ z&Pm^fOv$nd0gV#RkJgW%y6y!XvM$Zh3cuUMJ#71x16CeKpZ&|j@8~hqi5?ZRo+BoA zW#R>`zw%03X;yto2Y|`<$~tVqd+2J(UBV#Wmh|rVn&o!6Q>2#99Y?GVoxthV8h#5X zB?K)+Ex13jxTEBfkXAhk3b%uPZll2hmJ%`%Pm^0f>)t4C&jOr-BEW9O*OE$feebUD znI%9sQo#74>M&@yBqu7@?d{GbK20yw`BvXnL-o=7!t0g}6kJ)pfGE^-ZT14l`#N}zW#n!1{FLgkL~n1!+s*Wy0_2>%LW*y5tb+qI;w zRX;JWKb!$LoIn=>-#USSDz7CC0j&PyH>zKne?;LqIyr`sdV8++3_LAyQr0#!*3cAMkk*iTWLgXd3LQoYJJ zmZ-UoKN0AkI5L@>5TKg+i~_M1h{)40fXtU%DH&uC190q*sX$tFh%}=NL{v`N2Gg_6 z(?!{CY5P$$RW`SpTlz(7!lM?UKg98x4H41eDm;<%Go=tz4O+Da% z-|J4aRn2D$2=d3r4+sm3ZSxTM2cM3ta)0TggI1l+)#)#(9Yljeji)TV5ftR~@c`re z(hCM~xcY##JJOG!&lLs4#$@;%{k6)OqPw$kUceLiab!Die*vP=7uCQ8?ZyGhir1fB z0DWFx0LjFN1d^3fG?pQT5i}X?V%io`M$13@AK6A!ngTVu4H$6g0w5H5Y=&o8s~13E zj-~(2*-vWrk`+i&sLX889`O_KumDw~2u%o{Q8IGD8?+V+sp>X0z!iDOqAR3+%T4>U z_rD<zaU+(e`t+q0we)Dvzz_>VN zy$wu*u>u|5c!@4eLmhx>UmN1Ir2vGy^k$o6SDwm#4If))J&_DI!o52oBM1l*Vonulb5BK@1$`SQHg zKPkjgIS1kngLcLcnhZ#Z{=}!8{oB+bx&|p2%~`II(PjWlU6&Mc{vY9db8jKx$Plx9 z`QDMEr_Xwv4QrRU$Gv~{bSO(I#}#&TpTK$!dmo|n%DNvzzJ4|BD`m~+MzgP;eK?eQ z_};VA24IS>f$XU*qM>)B*!EIlkwm3Mo7nPJ>ol=OLPCNh(bn1;&f4&wPKO6255h1SfX8n(jqY;oPHWmb=JU$?zE&+>|$l#KW6h%eD zLMnK4(YLP7ivq7j* zkoQ)C&<5yeo&uN`-K`!G4Th1Q&Foe5VpD%QQ_rEsCk${ 
z3ZvcdYYNFA4;;cBkw$|crJsLZumx17f%UZE2blOCv_E}&?$7=IFTUvSF9`JmYqZa6 zFv;C-6EF*yrV0kV2cK<%_QyqzqjVpyG`~zmYm#;^Hy(S`|F$k`$Dz5fGaO% zzCrWP*68ma`|T^ij~$;KP46PuUiks?n$G62LGCPrw9> z`-Xpf<)_J4*pjC^Dn}_x`}3&%-!Y$!%Fz=?xss(1z&C4|(B(q_XPdMI?kput zMy~EQb{gnspGRu$Iu7~_s3kUs4~}@#Yp^{R?tSnDX#Y!VFUMKvmQ;aUtsD@og`qEU zP9MFhp&(6?F#GIB^PDsM`0uBTxY9nFG?8{zU`K(P-`OG#zV~#7xyVTY;ciQ`E-y{m z36URHf+~T796d_#>rZOU0YGU-YNL6(*+M$}OwJ8AK+9ixYv*N3f6JS^*2sVo0V*Bt zW#DyMTKMx~n`~M4gX&26EZ^LBUyH)|4E0=2->v6>4qxf>-{;&W{OIF`XXyGOI4W#` zc*wSnL4N^BlP0<0*>=qG5-;C{ezWlVhfym)RJ4Zj(-K|x4BdwI@Avq;F(rxa4x#!7(Y3$*S6gvJF(@Z+|3re#Q2xzI@2Z@8FF@PWS>QQ_{b%R(X9~e;V@e;nnA6%AV)`oXF$3KnOEFvsS&T- zt3ADb#%Q4U{YUU9rLrz!yRL~*Q9#oO>3L!L#b;NrwLK@u-VlGe)hR-3Q%ZF^-B49Ju?0-&(2a0g@E zd~S7K(YyKl{EELFkrY|*s`<>$-f36!F6R8cZabThRe+sb6E>HqIj;&W-j*Bdq?4Pv4XjN0B&S4B_}oFzHC$RH`PtqkQx* z1pf$A~iW#FOR8ATZfL+e8MvD74TcgDC-_B0jk zdQ!8N?^@l|fR3S$=w1fNhBf_i5O4nZkH#_dkpN(4GLFJ?dbS#V1EKFm0bSNKfQ=Tv z-e^*S)YFG&n`IM%8mC2TR^L7lDaI!+9cf5kIs@wnA6f$a&Cqz|n3`O=u`I!+M(2Ky zaF$=sx!VP&Z{Nw%_XzT+XcDMAPJ7|d2qCicZm?0sP9JVpm_u)m=1bZhxk@!SG)i#o z9^La1_D_@%LyE4h%@Ic&)t4ZjRy*HoP5`Q(#`C6IWpSRSe2FuWA*ds{F`H>J_uQvA z`%w0P5_<_!xAw5j`slhNyU@aAjj|cRNRtk{by_vWS=1CRq8IF=8!v$Z*^Ks2? 
z)Z|`9Hu1vfF;z_{E-RVzcqcHXx1zv%(Mr>OeVGZj8Q{(<_7d9$-2}Z)XZX%I6D7H1 zGfmZ8ElA-Mko4?p(sdaQ0RziZwQ-HZpeeAF8m@*>Spr3g%O8VYL;$t8o?J{yBJ29r zbO|@SjxPzY>K%FS(dZGiiy7M6UEcc^MV+GcgLMwVk(85HVxW_Y!l-7c><9IG3bpOR zfxnty)l|VlkCJuE-oZHJN&cQPfwLr8UnA^-x%XBP5H%{o`H*aKjBeBi)4e-vnUjaE zJ4A8oWxw5N10tC%;1#=oYUL^RRAs*1$Ipjbo23S2U(}M)Vrt$Hj0sH~-is-9oZcrg zBwhOvQQS~(&J7Sc%gnr*a+W*{%0DQ^6h3eU31B;+!g9+ z`bYhc@>V41+lk@tWXviEtG#&Djhc;WsKKj}oi&EE^L9loA|3i~fJc$zC=jFF?BarE z!i&BAhd``a>K3gT1aT2vU{(Ao&vOL?PeI#DP+sYYq@J?pv-a*#Q+iL;2W)oDA zIpZT;Q4$46Sk8Ixk6jRLO10k03ZY6(v=b1CaK_JHN#IdgJ;0U{7BZzfZ>0!bWbG~h zU1^soXBYv!hQJ%R(=ToaQj!UivhKUQ&bxaEG9}N>+e%0PdB&ve&RZ4WH@L^N+DW9U zf8W%+zn^nI)TfWB>EaiscP1e0Em3FCfTgrrYwihVZir?q_tuxkZExXM1J^3wYFa7| zlv(3_fN{;cn(n@)Fd)Ono-Hq_BCcS|#2J}oxPNFXH4J)u* zYp_N;O5GG0v@WVIun6D_j^_1X%{t-HYAMa3q7b`m8GPObhu)gr9^KoW>CfBdDQ~&= zDjp`)#6kpd5kkI9>>}@0QA~8C<=i5f;X-|#=Odjij)Jm(*X$!N2edKwiMw(N85xYy zQ!^YM0-^UrN6^SAXW|;Kp^k~Ta3XU1Qgu}iCAClyYA<|B67=~~+;dSJJEJrn$8x8q zKn-?j3fJpmqb_7tf0~Nx>>pGKtPjTLm_`tzc%~Z=_}omRZPlP=j;7u&Xy1;pAiE^~(bWsu|m+kV?Za01!F0s6uTeQmSgG*G)pt3#e{YV^592r=ks=X#~ zYmdZ@tfh0m17?pWHPnIxbatSr-VFL|`Y!6yTV`vOgeLf|13lcaTCcUV3{_G`p?VtG zq-A*R>{>+Yu{q?6@UOT7t^}#OIjVX#}ky^dpIVn@W%QxtSTY;=#s)j~1fgG_-CH zW*u};rTe%puO8oF#S*~zCGB!rl;RZj9z+yadSuA6McAkzNf;%l1<7y3)w!<=TA2xl zws_T{z<_zh#fZoHH#6krAU>((dbG;p+0cEYaWy(#vTTd1R;LCl7fy>fu@(e}`l`0Z zLX2kv(k2^*fUIr^u;3&<>697w-DtuVM`sUFXY9=elosToqq>zw8dWF>3yzS^O{83- z_6Rcn0u$t5gVM<`cwjmn0k55{5vAcQrjFIHeU-ocfbNN)@a}7MFZ2f+pS%3ljqy0f zkJ#w+DEXSM+E}v(&^HFieY_Nnl_%37hmMrep9KLxGPcQrrP;E!iL`gcA5$iOh4YetF%)5PVnYf2KJ3rE7CWCV3T4^1bSCDzIRSkaF*!iXKd}6P@(>@eg2l>*AjncS zMx9crOamU899(rOF)JI!_i*#lk@5G~G#U?;Af6Khp> z%1}o_P^#*u9!v;=j=msi_e>5+-@f`@xKU`r_WTV2*??JrV&&gQ^~&>M$q zChF2oKAz7wZi2E>(3-}s0Q?(}uebC*YTEx)HSBM5`I z)9!^>-B$R{YD$p1!v!(AP%Be(irX}WqLH5H6EoE&;lr4hF&@|mi`lM^qO&s18C7I# zlVKbi&$x-3Dx31cwe&gNXT@#`Y*n@_!DZAUN1OoORXVJN4winnPg-W|(*< zI@06^${BO`Dle;p2b)?ks|Pi7kT&(@?Kt}m?<`M&HoxROLR)jdeBEo@zO*Py 
z0AU1uyUZV}wB)m1lvFIVCm^Er6xG$2jalbNgQRUlFi>2!cK0^+c9Y_+Ka!=tnqyZ} z?nq-<*gpq#1OXWET({9BR&45Yt1U|h4UKe18?wVq%$3CMZ%lw>@&tgLx3_v;^M7Lq zWoo)j%i$68SdEZ=LxqxBG55subzO&w@F>r#NhQW49lM6gIf(YAmaWqEKqsN2Fs5q8 z<3hQf$0)<%2AYzLk#!?G^NpXXw&Ro&5uh^CJIYcmGEqcTF-RT@Vtbe#PLhnYzPG;> z<+&Ettgn%c8SNVA+XIz^3pGVfS?JzC4nD2diV)k5tqfXc7|S9wg(QKjLS*~9 z-bUs4%leopORaR&+4#gfs&j<)Xml4Ny~D+K_ca^x14a}hO-$pbBw|hbn=ZAatf_Xy zD^67BZ!^ZBN0-ba$pqzh$= z(8cehF%u^R9T?gBkUK;W(7JEtHA_xd2pT7PT~UDx7jsCmz5c;vsVd z;q`*4$=+6W?p9RCgbZ0XVFaCR^$p^Rih1|iy;{(bi44vVZbU-{D!N$j@jM^-iB9js zmf4Bxj+oj(h<<>PYxWX~?pF*b+4g|H!JUSOCZOspEanKH4$lh31e=WCd#wZ=X}ZOK zaf?pToOCy6UjuX`HXGm z*?CW`b|r~TSWdf#c=YGhNx(Eh$Nj`0;`HvJrn41`nRORT7kpnw}pR-i%Zc5*wurXkOpmD5o#M*7F8jgxpB8`Cz_lfNRbD9F7me&KG;>xm zWJZ+h5ztPENpYC%Jsi{QkqD;Ro3OU3=?@fJLXN!>e0HFBzd^j3!2#mDmORdiR}@kZ zvke>&SgTWe`X(yuu=tlo5foblgV@xf>swryeGfdy9|=++g3_ROe{Y~}F%+s^TphGp zbkALB#C^#%?N~(7JMRV4jxuQ470BCuy_f+X-A4`s1Ev7C4jtVEZ;En(bWaIV3tKc@ zVA9=RrDW*a*dQ1qm6P(!cdv=p!Y3BJtC6TH`CX?EB!~fCQ+U|=Zr9D>#Ca|0u99?j zqs*!osEoY9wJ0a82VIVaYRVH06@f#i5`Mx0s=99^D)rxPMRC~hK5P+9)36JAteG0L z8&Dq{lyAA)ng6%IGml-Wzxj?&yUi>HVDIwR6f!8JZ6fmOj{9Z}v zJll7kfsA+73kOB==9L~PvBHtYvOysUPGlR~AV*B%Yym^23GGdbpkSAg6N(V7{&->7 zhD%RDj8}Fu)O%K?1|_bSO_zOmi;_HR0ux5suUyxsmU zd$mfWKJ;7ulHy7!ErXSaaGgZH+999q#YKUN#jSby%<^0&G8N|HQ(lXB8gsR0s zW+=eV&-UZ)&kPbnTsE0pkxekL2g+*YZu;u?tIK)I07&f^&oLQ~?Rxw%p7HafQw!rl z>ROZ$mYDZoi%|vr6|%SXm^Bk&qDB!Ht0y(5n?lX3=7=IqjdRY8Pk zS4`MDd5wPa+1t?b-b@3&)Zr6VA2XR^lvsG=pimq!VfkxagWziuuX?3p&H8Fu(e&a7 zUFB9j)w^7fJ%a`n<*?2>^4rz-WL~WBbtS*|+4#;0QH^#8oBP0&u}%jYFpzss@}*{@B4t-bbGM^H;z7)>zg+R3wJS zvtx1H08_2aS1uP2f5)fQW;R4U-`^(mX||y^G!xnwCf>X3%<1DU9^p#5RMYbyj>GUW zrKV%~E=0HiuY28EFLCd45Cp&;nBJBh=gSa@*zx;DTl>`e*R$HvtaHTYIQV^ym2iu~ zhn+g9`aAno>SHv;x$n4_=NS$Z+ukDU#Ta@&)YJ8jYshG9FH#{Y!L-7a~2`x0^`5OTkEBs1eaC1aB(anDxyS2mv(w?x!nF*JsI23$J>1N<#Wx{Jjb zmzV@P=jH=vc|{zNjc0)B>{ZF-!dr{C8pYqDt{+b6qGKug$SD?8{6fe=@1_&K&2uI5 zp~X6+fk!~YfpGfAYtCtm{?=^uo_EzdC90!^?_k4aT+P*p 
z@H7rsQ3Y^N5G1OZv~O}CYEr|sP>dNx-`dcW4E`(4Nc$V_id-Q1d`nN*1|~a0cA4r1 z*0l%UUNvTMS4xL?nqWtR*X53?d!Na(cx=MePk=vw7>D|7L_G7tU*UysWZF%Q*5=G? z2*s?!e+)?3Azoh2`3b|0*4YDWJ19FRijAtEpU94@la)Fb-q>!RdRzTWqVAg{`9>8H z$TG!5uXRmOz4rugPb`}edkD=~(~{TQh%IN|2*sNYX3rUU=ZdAUnh$=e@09Iih$u26 zOHL!)BE3VheB8IIX}5CyEm)>n#y2sVIwrCBxQ*D{*Zp+PHg|HRTOHUojm@dfqw{O|TtS2}$LQeEHr6^ag+kxh#GnL`vl>IKljK7~xHL$mfAmGvceIMp= zmUQn$f4&N;`A5f&>e%iB3!GtaWJl{}`U|R6TB_~5zHpl!A*;l)MZxg$Md(|omQE4r z6;-d_KnS-hESmEud2FJPZ-{Sk{-0-SvW^A;{%J=3PORq^J}M=uaF=cY>*gFVd&DI+ zd_0U_AUxoFn66^cy>RF1pcTG+-O1ruN$J(s>R)x+(Q1)hC9Ew*ofRln*YNuy zuS-~oxWmMPIN#Z{oEzWt&#rl&zr0vhXO3?;meFqq1?%#pDt1`o1xCr)XksJ8T zina4QuEA_3y(w98bx#?JR9g|`V^=o{H!2Zfl!_cfA`m1yy)wQ>Gn@&Hy=CTV&W10WSB-uA_BINL~_##-}Yr~W;wEyj&xBOB48Tv03it%32Id7_qcZqHds{O zAgI`@$A?%7#5gy{jYQ3BxM!J45|jFnjy|K}$|EJEye9QKoUkbI>4pZ3VS+wqU*2T8 zu2hf_s4mm-FnuY*i17Hno-HG#33g6#?IG`l!te2RV&BHM+m5$Pi~zh(F)G!}R&Qm& zHT~^Gq=-A8K_X!zA{;ep=R>@damr07YUMlXkGyggn~20u zo(Lyhw|Og&ajLqRT-e%UVE}E9Tme-a+OOiuls!V6}F$O>bOl64cd8@PrJR+_ri|1TyxP+=}Y=%?4H7QT8x3Jy*VJ#tyT$%H~sx zyY5YfBWF4Dl<#B(5eN<)Di^D{KQ7|dgp1-~=h}O1Bcp0SB0dUfqy_}|XU zHmU#=Rw21aggd-ktCV8Jd`shlWuixIb{e}{+ixM<3}WQBjc2j)=o*KFT$QrC4VZ@Z6Xt2fhy~-gn(iM;%JXoo_ z&LNcW6_6JgX@cKPStDEbHt!TPr&qKp~q`t5L($24i#a~BRG$BlsNTENE)ySs7@zHP= zRfIM74P-pkR^#dN)ml62yM6W+%w(Xv%y4frn1r-1stu}F(PLp7`y+46lxC9zAnxS* z^s;uJzOvN9?~HNb0T!?a+33-5MDc_qDgz#crsQM)5OP(Hkn&&6QyUr(?ifNEV@px4 z*CxKC3y$A}VH4$-Hh3mg4L?K&9O}`<H@GdSVsPAWJ+YMp5h1#>5Vl%vmA!^9kF>9Vk|KsMo$I>SLg(yTZqwRDzG6Ll?&i0kF#> zace$X7l0GFx!2*lvde?HRmc9v8Fy5t2&x^WxmtR%t4|r(>QhFJyiMOmAK6LokD(Xq zBc*mzJ^^!67Wv+&DC!)5t{1z0 zDfA9uwxFw{0(PEYsExpVF(k0;DQ=2U4A+*w9t?qn=5QArW%9M32R*!?Y-a!kD>B}5 zLeq$E6FT9^MEHtbVI5;uOCb{qMpu>f+TrkAlSVnUX;kQZ(ndXlT;d=)>Q%H2=n3>N zHB3QlTIyuc2Jv2{o%1lg{|?hB9~OChok=8;4Y;HB5_8_q`p$d^9Jc}qo9O~{9KsIu zsga)?P-mKhcAnzlcwzBmSa#bR_>b9#B262>6v&DBnv5Wy4I=OO^14rH3AIKGbp=qg z3w1PR)%~=ZTeeekPvBgjhe0h*!fB+vTlh1g-kU#HZ+mG6-XqPjJ?NS=fq0a}#0(F^ 
zM-K_bbw%K3qhM?QXS-=6e$%wE5rz}o*Zkmv`9CC8h=*ZXmQ&eftaH@5T4cW2q%~$tggsCaSuJ(X$ROFQ+W#y>UPpv-5t(|<;dpC-C@egfGeC4BX%*6`uHsx z$i>9+#|R~{Ex^wSRwIoC`+OI6ED?^W17%}S*LPu1L5pzuLJT>LI|#tAA@LTSjeX*j z9t;t|{e#TQuQu`#v)IIA0j4p8K(~5LCO!$Q{?X8t=Zq_LmFw}!$(c@YH5ts-(UG=U z)MIFj<>;rhmncuyml-=Gq&z|Y#zdMCJK(Q%r-4zl=Ms{x@RD7XeXb;G2;BfS8RKJ0 zK%J!2VqCW!0~|WlMW39@(va6EuG^-!xKgkUO)l*>zqF?{xq*9e@oDHY)dS{J#%BoD zeZ)53-J7jVj0zLa`XtQ{51$Gjts#f`0s^+UP}wP;75@{{NUq)5pB3}@ag3ZNc@0f8 z1<2?Q#s@8^^5y`mN=X}jHoSu2E7Aoro%+Bv{@H74@#l*?0p&70idSj_?&P~@1?NST zq6!Vfykp{Yyv{Ne*``1lS|Vm~fL8b!W1qA=H2v7ObHW$D9OtsP5-1b@W`IzYS+#kA zG1N-NI1_hICiBhd#1GDNiyKs3pgd`!ADu1PUUV7#E zsk=T9vAwZVzOTM?NjhT-v?nY&wySgFsW1HNGghiydV1gVp(M~wUfHaW{IH=ZckMZs z{P!rc8|iNnNq3%&N8w}mx>^mTz6Y&?()U7R*QA_{l`tH$zR?IKvTrHvuy~ z&aXZpr1j-RJsv`L0(V~6YSoKQW=z2*h4BHr@+Xf$yQWs#i z(5-joo?$0p1%n!6(&)r``7K)J9*`X65y&P8EH|RdPIZgI--D)m?VBCA*O4Gos;vV_ zj*|dJ=Pj3*0C)x-Hv6LC{W*2)^F+T_P&p^ zKxp1qBNCGU5yV-atw^$H2g2JvAehH2a95hLt;K52nGhZ8=hn|J8}7~|W*23|ly%Vl zH^v;`8s5oDzS?x9B$Yr5Ql?I=|ac@1TZ`FbyvNPS0sA^y>qko)XgO1N9z>+Vr(^x#bPj?s!h-Dk9>+~ z;%$_7-pEs#r3yCtKHpE0mJ%P38661Ah4kwaJsuIML@9JRmH6XL2v2&`MxAocp zt#bpY*WY1!`(98T_vCKK05F|{vjkYdWw05+?`{Vv8lYMo;2;OQL^g4cm6&*Qv-uLl zrC&+YeG6b^GtKDmZ_U)~DpJ=8r8NuKQ~jT)LlK^qtmGEz(e4AZ0&Q^jMjJpXFX9ft zu%~^h+RzmAuX0FGW%5WYWkvZOZEN=k7%8Rt!WP+Bi2QhXkSpLMsfbT(2Y^(qLrVqL zxWJOGPG?7)Av2gba?yrpF)bcs$kMm8?rgckNA}|8u+aK;JtrEsM_HaIT$NgOSGOjq zDMx?-G(JO|U{}ZAQBn_G<%D$u`V#XIWNr^p9xCX~87C;&BuU-bx?pD)4R64QG=)8U zHovfbGn}16WEA8^>b}ih3UndnDw|+#nFTOKShXM*Czyw$YVeGdUTgC^4fKXW-*>3A zH+>~T#^Nu2T0O)l@u>!PK$j~Ik@%1dA~pXY+QEEQHpICJ1~8ssF6t&U<@2S?!&62d z4iE@{re?LNG7?TNo{M7Yge0)v@^w77IE45M0fZ zKP*-))RQE}zpw?DF>oVr*Ih0ex{Bb2v&{i;(Al6}&9PA;FVW-qlCN+oHX+x-V7RE( z%-x&{nh4Bi=0y{t;gPZrVtY6dSFNOLj21Jt4No`sntD-{BvhcBXa#hrdg& z7jjpG0G^=ijo>Qi&`VsBqP7TTz@$$TfWMrBA@>hivf@Zh|X zpE~JP8Uj>HgWM;0!-G`cFw+ZN-ph9HK2%tRu6Jvzza78=u2wk^8Ay7pT6?G5s!%Li zayBk#M1ELT_a$qu9njEgFnjy02GcX1K~`gwT;po8YFOk2l$XwQ-B9)qWNEQorSh-( 
z0KD|g{sj0^q$l$nH@Y7e+~{F+5;2zql&2HU5q=2++2YFkJ3F~+I%B%ni^*bd3n=L{RxhXcub6% zUWh2&=qa_e2Q!NL)6&0-59n`fUhu#L*2gq0X+;jHbRNKJon`m41oG1CS?tG$7 zp6`jI)sRA3A084dYIFPw!(I5c_ru0SiZOBdB;Z8 z5tBP+*w@}9UNjQQD5L)1iTiFz9ws&!C&U$C0by^_b(ZP=Ttw@xG!e?c$kV44F9`@2 zXzR2p8@=!ycPoP!Ev=6hMbZMRGU**d%9z9{S-iZvu9PowLsG0DUYD~ESw(z-&&wdt zMifbs?ggLYu|T@xkL3`{$9+$uE0peVHqH8 zbjcPKtZmx6DC}W#T+WWYEUs?nl+Jk2Avf89jK@Mt_nx<+W;%mD75i+G{LjR636&B|O?1%q!g%@Lu_xCsq7JJ{Kb5_Q_GUiOBnB~Cg6 z1Ke-D74!|L@KnjGz5&k%tj6ei2CNp z244w2LnS|6`mq^YiWwln_egL?DdLYeaY^;Jc$+s2FYh%Od241ujqyT}Tc*hc@u^YlzK1=A@D5D}_xl+S@mB`KB=OeyFT(3;y9XO@TRNK|a>v4S1Rf>zmP=*Syv2L%$ zn6-{#o&pE?6S}mEuDmG>zQKJOGdT6eHBSSdmReczFd?Kj$* zi2&$&;{IX7Q&X&qX~+Wt%@9kmDVV;w3)tjmc=7qzPURC0EFP>7-@WqoH`)g9h|rr> zak+902VTP8nN8Es#>K=c-tx2C45r;L4`QERQ^Kq5*G6aS4Ja#$(r zv&r16B2@?Z+fPT zID}d}A~Se8(T6O#i=pIu-cvsAu)S1im%4HdhoYE;@-^XSH*&GV=S@JPR^mQr0y7>S ztR6PI5VCk#CZ}q2KCB3{p*d2MDceejj9@I_ZewpNXS04hkiOQ2&J?B`+EpZ?^4X*}d@a5P*lZ7hVc~MIr%~x4TPh&=$vpG= z?l51UZUYS6P_90YDq^RMAO#LQ=|!zQe`_-seaZUvU07wP?$bP_o)0V9soq+d9Z9`W zsV{Q!_~IyfN1ym0l*}F}^&TQ^>#zzAM81q*Xb-%AJPDcC!Ib1!z?caag34bR1xDTx z{|$U+J2cnZrPNz!M~@0Su^dSRlqB1%OQacr13>&4Vh9vF=z=9sEB!Ezn15*VQ%h)fscyQ_1A8qYQS=+a`_!aZ?P!OA60H15aNdeh=kPRQN%Dn*`;pb^;i6{ug*a ze2-?^dqMGzW&aq9d#vx%Jq|3Q<|S#BdrDtjt_-~`fzQ1(7+0$Hv_U1v?YG1E4=vGw z=W4v%CuV8xbr;?|wC13#cV0HW=6F$ogIs60`IYCu*01LTv$zoEw(yGO-pfyEzT48N z_mnLjDoYQ?82J?~B%ao~ej4 z$^KRb{ZWVCKDr<9nKN~xznRp}{ji~mnwDa%@{eWrf+;xuouNk$oSu<<4$LTe_=}na z^Dli*YwgM^IP71|%Hp&6C}X-AFbdf-)9HEZQh}(@iTjIXRV4*;JbWQv@Fuz;(ICof{a>P?cHA#Ay*>j;#6P{N?o>MbKQ7Pd5=;cNs zcI+td4;dA#3oB_m`l1n*^r&&z$ZbkN{;R;%Ps9{Di2$BEzxGG}JnA04KRXlc7gp1b z#ocL)H2?Ec@BKf42_NzQmR#<(=TRpEMT^;WpScl@~&w*vjnYW*h5`sWw= z_y3B0yN_1cMo#71pI`DD)bi)8{rzA3?E|#R_5a(Y{F85K7ok3w6Z{u`YcBt*U-(nt z$j!}m@ISk_+k;1)RPW3J3B=or#m$0E*gSi)^1Eqo0Ak zL&6VjN~;X!jK8`l`S|Or5TW>g`cnMM#b`fu2&i4p2|I#O2Fys?@AN3(+{+;B7H3m3 zd+!T`{#PpSzx1uKec zuR)Xd__?H4v916Sbjgij*3}RP(*cG+Qa#%XGFVsO#8Ux$Xq}E9qWO=u#KVuL-MZpN zCVhamq=GuS004j;#$Ky+%vJ+hz=;*sS=b5Zbpr9l*O4c 
zuz1kxRK?ZMNr{E_eWqDyk(RgYoY?^jy;C^MPle-@gnw}86Uo}N(7=Sxb@zl=#yac2 zwEld2_2Rpy$jL_*Pj3KozA_=g$D3v6Uw`8;y6FDng3+716K4e8N-i81P(ZI9jZ$E7 zIn2b-;r8PH+Slnt2IIlx<;$+XFsTd}?zj1YK`gX1oK}>Xj+9~ZIGd^72c~c9(P06O zFaBbm{iMi%0p4bx z)sgti^4?B53j|u~5{c&j(OZ;!5*)`pcR=>~*GvBQn`qAfnaEsMNJjrJ*YH={A=4j3 z8E1|3rh=) zptv0jb)*}$vCUDJ@baktf?11j-x~uV-`yQ$dXz%Q)%d0xbG!1(xgS`88m>gbPflzOMnKhl0o0712{Mi z*Z?}l5|}3GsxGhn?@QoNOyy5Qba%j>F(NecO%_w8fm3NL|S6QKeu0< zbH@12|9)dI9HJtNwdR`hiTk;3n)ENtu%iW3QpGvQyqZ&AJ!e%U86U6jrdt(Wk3-Q=m*R=_^zHv4n(}L@RrtaBtk)%wl5uYO(xIBN<9E+NobS0wQLGkg9Ul;&bd1zhr@yK@2b z8RugaNHnDhf~si}v>&QcPf2XYoS~Ab+OI}fbw3+-8^L_(%UBpFoHPM6n~SIqK}N~+ zG}P0!4z$&pK;@|i<)Uc^CIj3Wn8J*jM+A1U^F!vE4BOw|`i@CIC<(IxIutuFavr%} z42WLsz;bqZ5qNtwPO7u!q($MKG!?%W0^EB=C%BI$6v9&Tni`^Q6B46YOF7WDIr4I! z>}&$)-h-aKk-v@mw;%X0hD*TvsM>W-K46D-)YA)QaQiK>xRzxfp_a9aYW)jPro4hg z`hs-*)02pt(CFL@>VU@$fHssVCT0n8%+PT_q#Ka^Bs()KD{DEF!0{9CO%&8a1%Un2 zGjvA4ORF$myi00hD5meY00z$&q)2UJbmEz5k~sCigumN-$r~(z3NnDFQ`0YP*hrFz zWU3M8xf?jmRkBQztZFl%V%Hx_dQ{C)#9M&&!zyglnU9Y6koKu)?ARkQYXez4Q=%`*T4?TWUb2@>H!0 zf4veg@tc5ceb<1js#J7G0)u#e<0vqkR>{G%{Y0vx)V6HkeWK0>fX;NSPPr*D8sA3` znai?}-)5@j*C*!s{wB@~@S57iGFAyEZ^);UPNxD!L8&V5Hs=3MhEKIX2ZmekiRGSxu9~uO8+4ihoH?%KM>=WgZz6sA3iCRe6oiqyEB~UE``>q>jI;Zbc4D&gHRJ$rtT-TP z?>RsJtnG=mHN4z601!nItF{&Ju^GN z$>FiD#!pFI0ENws&tX;*fKf!+;en)PiwCZE3oPJHsW`(0H^3JtWAd7D3~p-krH^`8 zVTn2eP(wH@JG+=Zkp>=!RhIf8E`Es4UgY=5k9+6xPAg=8`wSU*oWJ;f95B^VuD$X~ zD(spMdhQkpH?r=^ZFq%<5aedpdPJks*!F6`C#LOWHZIk1G9ZFh)fU z#Od1ci;7u-r4m+z1PXy}pXDQP{77TUnHBW5Y4qHHb-=A)7nk~^UsFR4atvJA^U%*s zv>vgx8ikhG#7#Na&dl6LhmU&T0QZd|29@6c(HQL^R4ph1dNRtfURg0~XMg$p?T!8L zG3vm`O}6`UVCae1p3rv3pw7xYAXKwETS%i zuky9LTAjRRQG?4`0BGEd)Fn?|3-5?nH`fV`o5z@^#<-GcRxyez9o!cBGc4cHStA87#s3Yew>@r~tDEqOd>udFXxy)3I$e@bKAb~CMGT@pjIhda$D=y1Pe6CC73e5N3ltXLK#wJG<5k%nci z^O&f9%fUR?11-7*66@keHzfc4Pe4Y+MCi7{t z9E)QVgDmX%E;rbPAYMNGH1SqXg?nW8yI+71RH?2THDhfvS9ANby zrkz2^YzO24DmCygRUn8HAUECH1>!4biNKuw3?dyUeY#x)VzehWU1?Uj1R)hv;s&?jB2ja$!NJ`lJDPyttzF2BnYbhsUFK7B?=dynJ9$3rZHfYY1~ 
zYFuA(0sh;JGq1MKLv4=0zmY&EZ9K4EanM{QRrYT$$uHlYDu00dJv5=ztUEx_#wLm(Tw z{mNjK9W0867k~x500={bd0gwH-O*984JHCuWh?GL#UvS*9cB7^Q@BaPG`IWYNDh+j zrr!*_%En$a9`XoddVvrn^>@wS`uD6#i5&(kg@He>Y!MrA{FOeqKF0)G9>ROg8}}Iy z<9Uv?)&jI;@v%xZy>I)kkk-Hdev|E;5{e%x=c(`6*ESCK(H18GorR;iek*l#$aB+N zft1?cB!G?fTAD%YJCrViWwcr-Jtij2V~64Ft8sXXPeq8A`lNH`13s5r3p$!cZivD3 z8sk7!vkrMALgPdI-$qr6#@_Kz6e}3$i6*==Cu@kpo;;a)9a#MC`~Tr^6M39(8dB$T z2a6iF<#rDucl#I{4R}!hHzen%(w)F#HDgPrJB766|BU41yaB7F zUN)MR#-IOh2SIli!Ejxx0(e%1yfF!CPZT9axwd}n zi2-&34Pbj~meLko;Vbm*Z$(S z0G1c{rk5DUpWlxY=-Pa*1)J9`YM@;>Z&g<&0m#iRV}KvNUH|Z}TaUDyhk3F^&S}{C zf@NIN)kOLWdE`>7T0+!Jq!$XYJ9#L`aj6J)~U951lm)wKl>t|XK&h6)&oI94bzjDYUMwiwkec7 zz#<@w#uOLMg8+M#Ut(;t69B`Ay#>n++tEmpsIl_e%=*u@|eX;v_22wCuzHG3W59xzLzhM=R zL&VRA3{mLbx=ANLB)SUN!#YAZB{SqQpSuO$sp z!EF@@Nc|_`@pQIi1++7DJuKZY6rB2UrHLFb8V7!qbaEp{`SjgCLznlQ7E>ql z)kPi}LPv317S4H}nWyghL%pu4*ixh872_6$kw0&Et-AYe_C^_5XF?ts3NrPCd~H#B zYU};1MBOz!-8E(=A}s zA%&MzquOZRnHkj9lp84h^^wz`UaKGN9TW-lH50Kyl*4_rheUVoRCg2LpbcY$lE19| zeh8q2OAcvgpuPZRlik#=wkPt;X;)z}p8~P|lmkYo(1zj#?xYz}{pHvC)B)s%rnM*} zLhaTEc~Nj+F% zlDaN3gDxQ2SM;pg37P;H_$;>0bl`{%G?0_NM9!nTs*1;B{zOsNZkNq zQ9a6pNZ+v=|MtMdDTy2%IRX8Drx4$Q-z zpQynGWXldpo3&af0`l2JgZ19Xxvg1{K4xDx0MNhwBJw9+(MPX~?aHgU_dQSkf?v_i z&V!N4ESB5rW%B{GWn0MBDGgo8vxwTdG4C{&9eZ29T+ey#5&Yf@sB!HD@mELi2IO?vP=A(3+(e4P$TSZ#bj+vR#Wc(`O&WGt#t35=A5j#d6o&=oGIex?lOB>W zF{Z1ZctZJ)<~894)9SA1UwnRkI*1M86&n?O3-OWy)(Juoubcr!4` zwU7L^yh(M$EYfnGT%z=v!-pU;zH%MEB|KD2ge(M1t6ydkrtPuy1Z$^^N+zUF_7AeJ-_rWQvSlU=RRw=?XM}%~>_*ieH8(a!K&Uzw@yxFb z8~#z1dGy^4kO=l`lfz9+6xo`?fva{MkS4wI#nk)?PK|9)z!VJGKn;zpCR$G@nVAy^VhmBBOkjvehcCP8&ER4USJs((I^njDE8SGWB*s#aXL8kVQZNDN%q@)+o4 z4`dXBeNPWHHmA=nA~OQe4LFp~8otVYVo`W>nBk8T?DLI*lD|719Sdn#FlU)eKBa36 zb+XBCEVdhD+ik7!_zC!cc7VxQIkA=RGl>N|Nl)Xr*1T)lv)G7Q$^;*>^aS`7xm}>i ziUY?@>IGZw1kczs;#wa=25R&NRh1dJ(zKT(!zI_{&x0H)YR`{8HXm`FB&YOj^0jO$ zJaAU|oTuv31`5f~(R1mI&~J{(9+LVl6ZA&Hx?B=_-PP|V7qW!XBlt4|p zPg#(~M#e8cc0xRxflgLs6@W8?ci&u^Uc&lFf@3V!j!he%pvq%w$Cc7{z$0aB?0S3G 
zi@&8la*C!#!ng7Xb)E+WU}FPh)b@IKjlShj|6x$Tyvj*;z@yO*M%X02x07GxCX^FN`)_|8tmIIx9+_fVD0ysb4{!BR$ttg zoX@@v38@2nabjw|X z+FryiKPC2i)?tWnUdf26%}hr_4~&71o|6CbBa(?CCopT)iYn|h%aom`66`2c$ggiF z>cdgyLs>s^Ea=UH^{%m6w!}kxRVR7qXbPG664IsnTs_KWSY|Sj8FP>@Fc7l1d!djU zZ(3PSzsZ!d+gx~VzO*T^C^#k6khN+!dvIX@-?&Lti-SaN8pIx z(B==-!%Av1M^rK976NUbO|M|-`&5V8xyur3#n?gB)r~>)^^kK#!rHll}Gz16?zxy(2lMW5yQWhQ~s0*;zU4u zOx=XeJ-V17C|gi;Y2?$= zJ*Oz`lho_G*-ReGg$6_YMhQKN%l$h20Oh@PPtw?K>p#|Hyd4o8i9q2>G-G<*e=JBd zNSrsU_S{e3-j$g%t!98}*W+UfLm8)@c-n*y^5phS6omDY|5e#h&F7EW6rs=WxZAvZ zOcQIG3*IH>5y?0~RH<|83XGIGg67&AuaZ#d0m7u!9T?nkveu(zx3r1m2~SdN=4LQd zvES6zcRi1?81Z~Cx?#= zO0H&Zo~;@t9?Y>-DYC-Sebq<6fggBr;x&5l@g@{=cugs^r)d!6#AmwwwghnrXzwjCMlML=_H%||&^d{evsbWJ;*p35h&50ZE&{xh38y4@Wb+x($&dwR2J zZUSr7kpAsB&~Yz93`cd8F~v&bndg$BigGl7$JQbLNydl8rrJrh<5yd&s+ac|h%M{* zs833WXVI76M-nuzUzOR{Jb0bt6t#j%;{;xn>QAdP-}s2^n683dZg=TwgO87%ZhP1A zloGQ6_OJj;m4V`>a|FIV)2C* z`PO;}yS&e%F`DS{ z&EE6dj?e6c2D_qNof@if?XThkNvgax%|-GTA~sdZP7TNrL11_u`E5`E|5^jV5v!8t z<>sHV-HD}JwSQ8wIPmL7>;dz!aK2ip#IDwqm+=0TB!C|w#OH~UK>s=OOd=7cQRyu? 
zLl?CTkQ_8@!viwV-GEDa?M=6IO$Ij8ojF^?*=X%+niDP&Cor{&W&P{NoYTu9+6Ih_N-fH=UL`FD=97rCFAET+*gq%Uq{tY`krYuk&f=Idg1E{a41$cEf)@m zAlx_N)$5v{5F@TU2u|W-t3dFRHesnZX~uA7C12Bhl0|m)`x?UAViEVaV)*tRJ(KiC zPPJXIBmsd(Oh<7V)-(=-1ioX^?LPd(^ar(@k#pgCA>!G)e^!P_d*nrt-w9}M$YahYdd=|gO8|BwJKuu%SpV83VNEDA=oL8ov`TQEh z+&cpuwX^*~#sRGLl`7lyvK1c-b*a2E>t=EhA1{zd#Vz6r{u0%L10fC5p|wzNtx7oC z^}L$2NPa$8Gi;{@bKXU#bxCxDdw-TrhKqWGAcCW?!$LA7v9fCr% zp(^G%FR0;^mqv?i8C2AHObPW8AX|66z7fI9Bmk0xp_CQFV^W0oqx1`7yB;_H2 z$l4ZL`C7jkTUL>(a+FgA!&K2v)ST;2+TR4{!J*0i2e1!%xKEN5`@muJm2CKCk+)p( zGnM*?NL3GUTbJUm>cgs?cpjWM;<6HX@~VN8)z|HCb$)fBn=6ma1M6fS2BT`HTEz@= zf9ClM%Cl;@WH%vj1_+y?;F4P86FX()g}$1x3}wvm#k{!znvNnl>+=e4Dnm`q0*W(} z-$%vN?LlMLH*<=rC||qv9pw{Vg4Czv;{0I;S2bl+rP_vIh!Q74V7+R>sTLK$4x@IJ z$p}tCPgB8=m&F-*iVdte=3O}3- z5z{tPq1@X2#0}wtVU`swO91@^a|kxT8tEAxpD9QsY)0*=kFp5WF9e3KeLsKZ-1PJ5Kv@>4!=DlFnsW1g1+h!`GV z-i?BTY63UO+mmkhX*X5L*^}`X2Iz#^#$c7u&rzlZr<)6^Eeg=ENrHC>8w8=g-F1hY|0{+rNhRshmUiF}=r<*%qJzHfebrbd!3pII>R%LtlFqbRDYYvBRP*qF^ z>c0mwv3*tgxvMQKHEK7)ryRZN*_mtX5ke3Vq0J5CMp5^GRzDSKXiDf!abk31`Wa9AyXi$H$nHS-LY?dgz;G3bStWO8W%(zh z-)OW7TNLj2hh$+FhT89`q*1Nfm4%v94eiA2MqDr`fHm^$h_J|f%iU~ySAd`dtdMVc z5Nq8fnEuwqE1g8kH|l*5oz~$FkMF`(8aHlwMRL%A$r94^{mi`rfbRhZ(8ZqvND*lK zVaP1oq9NHxBOpuzt+(MhNYO3seXQaat7sH0Uuw`rTgc=ttrnleV46TC-`|nvm!6O5 zXReQNGf_810eh}Z+W#oh=15yGHBCXFmFL54J@yk!AbqaA9pN<1+Vo?kwC(z0Xal7= z7#kd1qgKXbo1-wBtF}zE9MtQZXl2|#<8-x+)ri(q`v3{q-QQ1jYYArG3ug|ob=sML zx~VVJnXxG3I@M@4hBe;6A2tH2%j>L+*Xu~Arf-1L_T&o{LQtMrBSv_rK*bxLL;`o7 z1K;9H+ZtF7CBe>&^HAC?OSUQTqv2#lQv#Nmn+tvutc>bc9dMeA9`0;_y4*9RHg$YyYg|;*BPO+&9>rWe;N7rf3*Gs7uPs$FO}gc) zud4`U@GopI6s)i9o3^R7csR?zPs;CPV$!W6f8zKPI1%U)*lXsn4OYeXR6YB$8@Kz+ z^=P&5W+>L%*W`RnC@{<%c2vd)jr0fv#fBqhkfqJgT{Tx;xeh|g7NoVC=kNhwr9=K@e}p?A*1phViFve@p-Mm^9^HAPzYR z149X@HIFR66fS8g=~*n?Khsz*2vhHOrC@F0RLnWiPb?QiK8*660yEJ;jT38sRz{uV z$?R9M%&8YvGs%gW)6PT8cycN_#;$cQwe_N+88#y>aC$Z)m(BZ5+G@3Ss`WOv@0}zQ zth{+!vN!ng|rG=Ze0=3qWBDByhIr+5rJA z5to@ieQtnF=%+e-p4y(g8*ykWcD08Wa{jogjGip`Ay;^)^%sKyx#xIMYJdbVzxXB^s^V};8q 
zs5(t~$TUGnm}v86E|aplzuTHx6fIRU!`}SUyq=|kKXr3?UsHcHoq4K2E!-lOtIoF5 zUtwuDO$bzEzc)0@*mu4ocfr%;YnKtdF8rE=IWN!xB)&U9b~7dQ(%s#!)BmqSUm$VL z#>g!_BYGp>Sr62EZL|g2Rc2E}bl?5ZJ(bI#EseIJ(N7)ZJqd41&JRd!a31g&qA7(@ zillX?0Gp!?LR{>?LWh@bKSpOoag$FRbS3!@VL4WHrg&oKUV|5llY1@9B7HO1Evzf! z49HilOGXI&9V+VZ$&1I&B$!s8Y-=+$7{VdR?S-MFwQMH}fuHBHKX7SU9DAQea~BF4 zsVXmkt73!h8`p(N!yZmr!{@}xLUwMQxB^qP;yK{;9(PU(GfER7h3mSM#Uei_iDGSM1d>QK8h{q*UC8nbak2qC!ZP7$FF><#}>J1zh>(BWsXsQ`>x>Y!U$2Or0 zfn3yXM#Y?bJJh4Ad_o@5lRbH&nad`5c&=w3bddScT$^8fH5R_>DFXL-wqNO?RASD$ z-fe%6!>n`XohhYXL(VMg8`WPnXV=9lZJNhR)mAMyQ$dC4n}cgMQf*IEeKVZ`YlB8N znIpNTEVjviw$qEe#8atQ< zR2^DaOsuo5GR2=p-$y5$vLf#cgt0|*x4-M-rD8dg2>YtD4|>I8Dis1G8hMX{%KFQ# zl<;xbKIon8mx`yEt*K1g#j_cd*1br1;b^3A$}ZZqL$gBC^1^xVty<07nX9Ls`OqMz z?GHZUJKU#RYPeJ|_;`Fob5Sr}mU+pBk#YDqLNuFv;bdUAJncQOOtgIpE&p%)@eNG|k25H4lu z(3GwtC6j?LQ0(u3WgbotR(U6bKjx{?#qS@m08Dq zkPoN;A8n;dr>Hgjaf-syGZ4TAY{UnL6s6TP3Vp;;cl?kR9374fUYf}%ytlS_s2Aiw zwW?Bh;XFmym)N1sA^hRjt6A|?__viLDMqAA$=Pd<4ISYRT+7~>&000BTlO6v?hDJ- z3EE($?JcDTG#eCxCyk%EvkB|dcKyedQgHV546hVv&gktp%%=Vljc#U>@Xg!^SS$B! 
zaj67Vr_zgBKlFWkQE)84?c4MG*5fT4w1pKG&2Zr7Jgm{!o^ym!LE3sg7po97Wm+6b zeR-l`1VrwM*N04O&+k5s$5&)$ACyyJZ{>VYUQSLz`)+Fm!wU#NFnt{=^(8HW3wDIK zZ@dJV%dXaJ@kJZDN+m0E^`b*MKE{IKF zeGrFib0oDWK36F*2%^z zaRv*-VLQhsOU1i!>8Rk|4EL}D#|2ua-29G>45jYDcO^2Pcct1@Qi@DMfm_+$)7d}r_S zfKNvarHv$Ne;M{+br=gbYo7RJi(8U^HoB~yiGWa@!&g?a%}Z=>Suc_pvD4BI+r9_a zzK~bPjaWQsV096cQ}GqPh{_T!BtrGtdLZrHJfB}*Y^&`v91qoao#QirHtqyF`VX2 zIr^KI?EyOw3r~zUQH9rfC+M7Raw0?>(T+0j?#v~NmT8H}oUDu@f{^i~s|NDsGO0*feBU5c6iV@Wl2G1-yjcQQz$aghL)=adClszUG$%A?4KPU)7uo$&6ti8Z~l&auK05hw)4DJ!uo{z?G9%w+`Lwo z`JQzgrBv3+7Yeqz;fn~*jj_CCg|Q{?L#(OYhpc;}N(m?9Hf>5mYo?BS2=S9{ub%!* zz%>kD*?l|kbMuH^0C$Wl2Y&$FQDIg=-h%h3Ghly9K%Aor*+2b zoF5-;61_E1m!J;3GD-Quui@VQ;_)iA5g9D1Qp>nUJ~PA{zInHqj9Y)}C@=BuRq5*O4vl`i^Y-&oe-7lyKL7P~ z8zU<$mB;j5+ZZxo^Szhwo(sh|h0$)yBXeD$57(uf^RC3sTs9Uy6?#vz%hGDH(&>$7 zLoZ>g&A?!uz$`k8g4!{DvoYa-zV_PdL~;P%so2U31AxleJ_N9Y?Y@W;7xCT|(m>d( z*MNHAyse|afetgm?I%Kf85NRjB%h>IK_mzPgD7qLx$Nk;Tlx;*a@5&P~K{d4)&n{^lVpb`cS@$$vjW zbo0^H(R?9mynxb=9qOI|djGo*#@W(`mFwnP|BD4cb6Oak;iw4M?8?7&2&JUX;cUQ`wz^)q2PPRU^p`3I&OF{1natol#fmH{4Vx~AKN!Ey z&z26iBGx&_U5{7)OXT{uagF!osYdWbEljN}q$pRe1*hPin$s0)^s?FSy^rP}IO&W_ z8(PG)QxYzRH;CtPHKq(UiFuPD!fi_qKPM=3RQA72m6)oV}`J@{4*Zqk9`l#l=HQxnE4q0$H&?wP?-^C%z*TjPXRzy-7C6|oSJCD>f zOTIiA@9m8_(^;@1$UIH?t2fBOEmRf_#Fx|&fqyM4<$UjZjruFp-^PiyeZ51J{^yhb z_@h6++T1CB0_udwNifY}-ZYGgP5V1Q`~IH~%ta6bNXk)lMm zTGKIBd$j)ZpGPG9^&mzS&(!08zWmRBmGl>_!Pb9X#D9F>jCl+orO%rW?m9&Mw4$1L z-!4CumO=@_{#;y!Kk9b)GlpdS)9qmMgT%8k$iMz3BaOBw{?22Og-2U~Kc8*N!HgC4 zEk`ZyG&*{{a9sH?*9>!3Rwom1p{J*K9{FGQ^FME_Xdd1cm&dWiNQR3Ci>lA#r?>uZ z9~Nq8-t(_ngC+mrHvQu*`!~NIMZaT_F^L}A_w2tP^56Z9Xw%~iKh>sb?Dc2=$LoFS z$NF@%3~S)apKmIv`$!A0K6WPZ!sn{Y!vVlTOSfO@O~?IEQa;7u1deQ0KS1&q-&<;F z`sq=pp8?fW%O0%)O+f|oS|8*^zC0m|PR?akS{lU(z{g0!r+!8jr0+o`V#5D*UH#J& z2hTmD4iEeUw;vD*d)2pkf+@|rnd6Iv0&Ol)Tr`g@DAD#t!+OHVi~KV7!9Tm)k}9S~ zZ7*2{W;%py&>#4BC2Fi^O7BulEPxB)R^`HfY{7H*~KpFB4EOx8Q)4kblUAqR!lf8Mqmw7>V=01M` z5qN>*>Ib+~=3;RDa{S=F^GZhI`O9i~rBsBUdt9Y;+|zxJv-8=xOK|p!u2#7rD+a 
zf6Pq&|G6F2xBW`_lpBFQWD*2}li%9?r{3MVJ&6#xDfapO{oV;Ea2BPdr6ma@fO}2C zV-?dHSiS;gF9moUzKQZr^#Z+o(*_ycbLp_xC^&c{VC1+!30d`&K~Jva0t9SRX+oYm z9ldLOV7M%`7mRmIf{1Zv9QG7d`|F#_wwvUByw!s#E?sDq*{~|8%W6slVylc!IB$*=fMc`}IH1qJH=hw-Sp-rx zDycaov&qqOdLN&0z9c6`Aau>vA3YhS2j;=A_zOosjO*Aa5LH1d!EB>yn?ep;BQMw8dMPulx2%<0!K!Dd&IaGNXtC)nUx`iWi38|jTBkO; zY)5$jDtGZApp9woQrIO45N4TAV+*nwdHKn2`L^Tj;V{j|9NfXPVF_v_Lki8|^&|l> z-)<|qtua`C8kTynq{t@yvf)aN?@B^TSw6o~ZoU2X5M zMz<9sB<3Y71l2Hh@?^6GMAAYIQbHUFyzi&oD;Hk?e85TYI+iHcp)z1ZIY@iho-pTJ zi(ghTsO4@Q5Fl9GYXg4}*+)Xtg}q-tC&!JfDj&6VTLoGG4Io=qmJX*-Pv*59$ngZw zq2m|+SOpJ&BXi)ir$w_yXdSWF|7ZDQ-gl|-8wU)p$WFHaRO@bCMdhE#k5lW1{}$=%Rb)5$ayeUqaIUwSXG^2Y#^w?Oi%bvus zNTXyRH2RRzSSBID7#k7c5fn;wP)K1fp~Z>w@3xU+4K(kjnr{g|IOw*2)$p{I_6#s< zq2Iy5Yt8K}l+)V7@ckX7N2Z|{wd*A4yb-!EEnR$p78L9BkkG(wRq3!Jeg`$Tfb;yu z=rEewE1XKt>&Va@{s5!v<6n4Yz(Eb%zn$AfR*5~bS_k(8FpXapGvAFjV890N`*-VW z$q)k5yHXZ->^af>64fmrgC*?OCCBDthXvOL$?EiB2DyWw{DMz-6Jg@(AF#FX(Q$h0 zw=~OxAEXp_P?fYEEn%ew^u7XNBm8)#vgXPSl293Wi6}5lC$%iSP>JslM#i6Y^u@$y zsa^T@p`j{e^cJP>RRzxDXaix~nWKlSrIuU^ly}Mp(8*{gP=a3(yH^+BfmX(-*OU)@ zNw#k-c;LH65eDkz(u=|T^%ReBb*nyg21|eSctxi1?$PQa`5XWOzlqZ2k(L()r6k^_ zz$u6F_m9cO<6|X^=2Y@iI2nTPFKnj!3P(ShIlB9Lb{OYC5RmofzM==R!ER#|lLUzn z(TTAczPr`JaXB1e6gb@pv}mO!q{;lw8z$bnm8OH0Q_pMdmzR{OHHEAbWcn546M{SC zlr{?8m^4c#i!R#P|B{mbtkLw&@7cu@n^oh|QZ!@!$$)z0w8i3?4yiD9YVOb8hQH$G z;AHk=Wim)Z>n~t@t(%cGO>=0I92Qn_w^k3^7FYNnKmFue_ZR$mzThfi>`s|3ZkfpP zOlhKu7T{_x(6}`K!teN!9@?29^O@0$!NIM$^iNBhy}CpbM(X|l^>hU;Sc|PN%zYAW zJ=X}0EYM?2D?LezpXY&2t=d8r)1v<+)cxmAxoJmhnCKIGq&G}bzrAqXdA)e5>cqBI z@4TsK%B3i-1DrcF;!KJ(H4Bm;8o!`x?b9SHyf$B-E>ZU?3*q@QVB8Yo`AqtXohb1N zc%H|pd1HTi3;JJ-^3?6^Upu%JJ9vn+`v;7^Ru=Yw*A=76byuS$0PAX_OJqbsFF%ES z}1C%?Of1|SNbG&4?D(4&q7Eh4dwSv2L-!bzbsKk zNBct=Z1E~lex<4tY~6=%F3Df?8I#a`Pa?YaCEWlWbLQ|T5b=~)w`s9R?>vt&=i7Bn zqZ)aW@?mfESI}N*(s$L+IxXT|@B$BoMweS@L^`SlbAG73H{(93TBw~tBPU~u1mO=0S;7hNOdqVzG-pf=;WgZ zlMX-otJ6ur!JQVFo=SK1@H$DSLxF#{Uq*;>_?7C5-8TprjNeMAUaBw-f}Up+NhZDg 
zJx=QT_3+O8X~O)^r}v7Z#^h!gS=Hl-qj@$4Br9vQs@A_Sif~5W>wG~}%-wmuLtNYQ zP_s5G^g|~H^%9uDsiH3LxJ5eyC+~Yy^y!oT7)5O&7kRB)6~BCnI4#HT@&Lyc0>(Vc z*QUDK516n&`GpoS?>)we_H>x9wPnJ;f0rYn`p0rx8?@&87r)zZN1I#XyJF)4Ae(}= z-w#x^@T~kyr4jzcO7gQFzK2;YRdNOMTBn9R5oyjoeH1a%N7UDD)wWU$R3)D@QkBy# z$#X0j&9jYR72Bv)ujmCEfqS0rK8aXKikiZF^E7y~w>esU+^+T9EKgW~a@mb_zWzIf zhI!}LD-f=kh`rf*??{Ri)U!aE-Y21mzaIJ#ZZAh(rfUuEebD7!_U`?eGw6*jz9XVn!}C#I2I8of39{s%}rW(iE3M7lX%Bkn#R z$*JO_d#=BB|C6~QeNwcf&Vp8n>m=cu-X_+^1{iJ!~ZXF#x^Dl|dzkCV~5}n&LPcDJo zjT)0iaXyE|7tL=87#+2&TC%woy0txDclL#Wl6AI_RqMejm_x#h_DERplkjmU;Z=T* zt?ne|v$$hOe;fH8H7Qf$M{xaUT8gYg%gq;y-!glGIA zrSWD)_a8Fm5hJ3CdEZy-@5flPM!!>y1M$7}Z5n4LriM)qZdl+B)?rku#^JC%?D@>F zailQ5KHqBNh3WdE4{TBW+!z)yc4KlpW+}>PW`bNo2c7kG$z<5E@F8}3+c*fvlDIID zm7!}g%348z)FAk{b*qt`snNcPb?DJdUQ6-;wZYNX?F)WTW6&pyxji4({)MEdbDjJG zGjvc3#vb6OD(syySZi17X(sf0GM$Ab{u*01jMwSh$DkuOMs3_?W))19RN|Muk}7^0 zVm_J0A>GkHp?2s}@HSbG6%~}(IvtYy`%T6eP2V=-Lsl6GXz;!;Qhp7?kd{AeSGE8W z2U+YGFh~nM$krr`YscwC%g9NvWD{an=HvUNE+p*Bv4yd7y8F6#xzA=N&c`kjhHId zqs6g3aAc{b7&;#1HGwEKT!T?D1tf4qiBQ|E$S*E9k>?D0)B#dphP+voL#8B~_;;VN z?@`OsBkX3Zzdx6@X`3x(c|n8A-xFHLjwUA3=(H{uFtssyHk2;X#=vwO@Mk|!8VjDw zuJ-0I;;YLqd2oK2_@QpmdZT9aqN;ywJlz}2{>~nKVELR?@dBPw>ovbV2PQnSd}lri zZuOc@)u3Y$Oxn0IzY!1nj}1gB-aSH14}Y~}t^4`Ey}|EZ`nCQPd-;-@_8AzK%+M%< zrrPfMf6A~fYpJa^WVf(dlb66xb{q#;?F_xgVoT-KLowPs6Qg-~6F|V(u$1t{jt{(F z*RD{^OYIA6=;c@=8!qCCxn0)!9-V;3E)4?fe$Q5-{9OH39D|pPpmRtS&=4f{^Iw|2 zXQ31e_0^*-92g*B+-z{(0UEhQkE)x+wlmAc_C>tyRq_$E~J*#S@wA~1@#`N z+!X;v(TOf8nuVyc1-R<4mtPm1LSh(r?;-vD5mh zi@JDtRPlJxv^%ZR?Fl1YEe{?a5TG8I^#Xb>$I;s)Y1)?VZ+Ved?21trA7`!Tt0`X9 z0%n17vlXbn?0^_Vom2My^Ds~)O%I_R%_g+<802*mJBoSrAqUox+^~SDoCp z=l$oDoFg?teT&twcJY z?VNab85jrWL(EpXKDo{~WLoi;h;yH>6si|hl8vT+RQ#eWt5?XfBoJUewOZn^5&3UV z!7r(zGbHpZ2CaJbpQ6&6-(?0j6n4!R`0C3*G#E%g$Jwe{avrD>sqkpX^Pi+jRcu~CdjzyzOY~$ALyM30ut9PvP*i>VDGQJP|OM!6C zeB~Wi?e60`w>_PMyOa_zw<_qp&d|^VkLG~Lch!8MNN{j{j42oPu<&&FER8HSXVrB3 z@inuBNCKD`ST6DzdbK;#jF`e}fJvBg)@vW^k~U?rik+|LN1LuLL}|u)9giq)qkNfX 
zgH_Lui{iM_eX9_lp_**F9vFLd4XRr&2Ok$w4h{{i1u>)g}y9EIy{2GUSmd^ zgLz1C8hWD>8!)i!;e^}xF&2L45iEZ7do|qE^!_K5L)M?h>@F2{7ByQjnrZ1BbbS`5 zQT0#ewdwn9H&7m{CSmXpN6wag4f>}UUl>)6GHa?N&LIV8&)8*!t*4o7x9Y*>UIP*a zSFp0>U5Kvw3SY5%Kq{=XWQ4vDsnt&SFIZ6K{y)OLIxebh{aXn zm5@@p8>E$H0F{yuk#Okllx~o2>6Y$6q+%wpdaFUZa9_d<>)7X5kQtrBHVsHYa_Ex(FttP`(2lP2Q zxQfd&1?~w!d3J(*yM>q^YWYge;tq4fTdWN?7Q_+ehFt-08jtU_FSXB zVjX|(JeXnTd=y19O)C^AYgn_H+;({{ahX98oHqqwjPaBLBxS|M^@q|6_8Ir3t z&lCIeax}kfq9Cwp6~J>k0P1-L)^IzK_q11*%P6!A=zf8TvX*GRFkQQB=2uR4hK=NB zhJNXA-MBT59d}UD6&Pa=UEDw19G46Cd!k8|&+*2g1;LmK*5DlrmVN)w%FLh9UNaw} z3GW?CB4J6HR^I+m$btS?Z6qCg;tfy{_dU>@e8pY66UXQmZ|1(W_zQ%>l`|Rafe!Hn zgW6T7qlQIVL1RxhO*e)A8?^rg)q?(4Nj|yXPP_Na_Th8N%%yGlXby=Y+lxz7vL*Xs;{d$ZUz&cT&O?Ft)zZ8L6=gTqRz{AE-u@NIhb1sd!Co&u}* z*!{00|MlYnz~yZ>-a(%56SJp(eSdmvpf!300Uf*myucucyf%jD4}nH(1-2jmU(Zzt z>LSDg045=;AKfQ$JNh?)?V_G&Cd!Sc$(q>N=6!TMjzXWr0jcA|cwh+A>rGrZXv~nH zp?nEux&QpdUog%AT?p6IJ?t9#|G{qm-=70K z5&&}AsyKfCkG5Gb@YQ_Ig+-M!Qw|L{3veVD7?I2Uj&(il`~ zQ9VrrGRd-7OYI+?Lv>fr9*}P+)w*mK*3SOG2ZQCoeq0L=3!lDM*4u_ALiI^Rth~mD zkFfyPl)3svg1+oz%-A-z`)O74Zc`C`^l-B-is0Lk{nT!Ux%MfuSyvZ@Cp25pgA@VXEJvm%A#aHZ*nn&1?S!A8LP-WDU(XRZA!88GTBKiPFM-)l`jwC0 zf`5NN9t(n7hs2^$P$Ulh71<%`9rkW%*C5h!2wWC7v|2u}OnY33O#Dn7E#y)N3Hr5IE^%+mW2uUb zt0w51kIA0O^f_c;km(8ZdxMDC12}_IyO+|_oxg2%(9SW?1+8KK@=Dv88otONI;Ver z#F~NzEmi9LRN9~5_*t~+b^!e=@5U^U2J@~#8#=ja3iAXTXSGW$<-R%4v-R65en1aZ`A-t@t+v$u$6od4ZjvC8%t{#fg?<3oPB{{G z9!Ur^k-1&Mxt5<3^9R(dqX3+D73wqa8&rfG*TaCxk#7xZHEtBRtd4?OeiU4*QNSa< zUnOefLFd~P%)9Kd(0uuDt8y;4sPSBDYqB(pA>Zv4NR@s$*=x1q4N4Fs03o{c$JW5| z9&j^=qW6shdCfHdCL4v&aL`w2aldM;P1!EiUf6R(U&%TZXJvI({4Ja z(3Kx4<`67>^2!mQuSj4Otfq0z8wmcj*2aqT^%3w<_gmrmdKM!oWv14cQHw+BIrpc7 zitpIeMxyQGr_|1urs(@kgflLPxcyw znA<&jf3m;UKaRnldeDB5x^{7VkN+x}iuZ&pWU75t`<0=#^m-S7%I}zxI4VZdy?W|F z>1CXq>%*rFV9`JGzx0ttyo^GUb$z-E>?}3m)QLh{mR7VywtmI z0+zN7|MIcyC@Ll1qG;WPb8K<)lIJzFu6?_6=?AYkJjNvB&MM}O+9ML?1Nl8i5!w2C z@Xo45ZqaBizcL)|hG0Ovcc!5+a;-r!Mt2~CUSd;qP2j-N;(#lhh`w6~)MVo}274>$7@`y{{AgcgE2_4=wRZh!AJ8lVQs` 
z0w~sHP5yG#1|}wP1|VteSE2r+CBHs^-KcXj!!{Pt;L7DI0^vtWEZG<0$Dz?mUJHI? z%Vn!o!1Bsz=>z|mP77X0Ea)MXU8u;Q9h!UX#}d_J`R&zj6%%hUuVz)qVUurP8DA25iO6@b!Gvr^d| z2yza6KWSElL64Yx9N)9P_=m$cGupku*dvy$C;eKUbtP!zo==2#uLhy%KMmjNsJn8h zdtA?>uWot88#o<1{<{2-!7U(Q8)N{0*yU#UlTj}0SDPg!om7H?GA%{37720%Gk`-Y2GO`A^Ww2*9S`q-E@2EMJpWa zIGjNP2X14I4hG`5_lkv|@4&%DDZ9qUQrUSSt?L&m>6hir7k*=Ndn-NV5{9A<@_T6s zUxAb16C|*^?8n&^MeIdNKX|#_&@b27QK`=oKA17v(`{Ny!!7zgpB4kpZkXs+a(p&* zF=T6S8(2T7;q)CBs9gKrJI5+Jz2eS)T+6<~-_p7slfE>4E{WFd%F9l&8{3Y`-DgF* z?CQ{M-B)eI&y-#vnaxSWS6PqgTz2O<{rK3ydMRl8#2|Bfn!2~D@SHrcIYYN2K-lLx zP^LxOcdaFfRc-`RJRY)K-KW^wr|2*`xa>xHNFt3XJgaCXZ*)I=d0%-A%u#GH3s-Fo ztZdHoNWoGf=3mx$Au7Ww^-*)a5#hD3ha$PO^@9p&^2ibOY%a?&p4(228jKavnOU^qbwd5$lx6w6>6Aw?eF&(FM=###mZC?)SOCWv|K-FZEK zJH%tJ@TEv_*LF8+(MbEH0jqp$vx;D_<*o&1s!KOXCmoSZE6(Gqg5K5X9UsvpEcw`$ zNdg>)`dsH=vh_>{;)YCVG1|QpeaMrLR^@}#!}O1l=hrR{Jn3?xyuluI3e6H!G$in@ z<@q-8TG3a9p#jTSqrRU<(ofWw^vZ}|$c0>x4|pJM5x>Ys2B`;B7HFy!(IT%(tXeHb^Y}!iXp}H5$^HD zfKb-rC7;fzcY>S$*;BR(wzxbx)$sAF;?v)+?wBLm?c$93;>pH4w z;o-6W9ovEPGqL;{!hnWvJPJVz;r%Tu!s#?23l!Q7xn5#bUyR9rQ`E0JVqpa9FYuv? zqY%{z2>9UvK=8A5Yn>-kZ|b!K!|UoM%PgPw;Zw>}FJ=S@$fw@J0QPKG#*g=SX7B<( zGUDIhZG@=t*4i2=hS-t)~=U6Gw;QJ0Z>5%KG ztgA2eUa=KfscvdodqszNV-Cl;zq16tsA%N0ifeUBpLcJ)q6-4rgg&hm$oCZdzSk#1@%7 z73(Rbh&vDdJ2s1-jlaA+&)39$X+o5O?w>#iLO;I&_NGY2R{SlT42$R;&06Pzc$R&u zSN4jNhS!+SrX>M z-YBW{Dq?Vwl~h;YaEImrrbZIql@9ygR9T?RLpzu3sCJzL>emHAp zTqy)KOyH77zE;2d(*4dLXWUD-1jx;!3z6;`RrZFTMW#Gw1@QQHu?|5Jgv%}VZA(5y za@4`*aV;SWK?L+q=pNhdEM0aM637?5%m^_HJ+ zgNIGNWGIPyt=$-GBZY=Wa%%eB$NRJ>+CC++SbZkNPQd7nZI^j!w>efYHF4eRbu;!F zEuhHkm^M@ac3_Y?4d*9pT}yNIqq>{Zu(mOe(H}aeC?!uBW$M>)&A{RDu5|bqPfh(? 
z`K_A8VACz4#nyw`;x-2xqGHrj`1o0`7yqvtcnoQ~h$+92xPl*yDu}tydOuFW6o)fJ zmX}_*{mf0;QrR{Yo|gVrA{g?C+mVtrg89%hKbOFqTbS>|g`fnal39Jj`;6Us zi`UJ(;c2DKVHDBahqdTd9(4~(|4%+$1>Noe3PBO1=D-u0tU8@>GQ|aRdfZ>w=AJvo zZJlNDI=cg;39hR;TOa4;;#fP~wFUj~qLZjv({SXV+09Fah8KKAnaerjxZmrDlbZla zCF%R;(tJMpYwblw z^Q7TP=_Id0`uY)!CLtR*Hd{oN%>yJgwEmt|w~&&X{XBf71t+-mO+ngQ>pF>u-U_lI zSlfulXs@-q+5!AlE-a_p*PLO55Iy(!8U``B9c`GQ&sAh7B+ALhx7l##RJZhd!A@|Lukn!W&0|ZV2FvN@(`ECcT8{Z{p@UlTfaf(~4yB`*N8NkVLtLlt!*-kGhbu** z9rAWg*Ug1SYVrG>!-g|{8ZGlU@t9D){sfaIFG~V(tLt-w6Gmvz>u8pM25i{t>`LU8 zj{~hgu;N=PJgp;Y9{Zkfy+A>wQzu`d*4y9|@dur%>m#+Cfc8!M%|M6b#DxSOpb zcTAQrjL0>1T`M1S#NbA`<`vfEN2I+lda>Ef@~hM+x(d8_i&OM0V52Rbtu zO@cOXf~RO0sC6`IIq(X^2K!QlIbwzj3*YB4+gUf>e7>DYse01m8g1;D-H^_LgH8sm zxCSF)PI9!jWjuN`-)_)m&Vjou33mgbrAx$MrSBcnAmg4QOz*W`EH7Fr;=5|tzTagB zW?W_?f?@<)Dz&qjN$h96=X6Ka<@x8j%a&%*T&Uiofom9*m^y>eYp-fXG(+N3mZF1o zm|F}??~^k%8Aj=FDm8Zyq-sxSO*?Yx^g6=NXfer(XtRl%E37YZn7zfKxrP;g(}&N< zzqukyU#@+S@j`S8&`%@lcRs;}0D&eQSjERVMvh#O4foQcPE?mlRs{p#1nbSujfu#* zmm;+QT2Xb}uc%-JK!`3&y zF*bsjj(la6NxN$Qw^+95(71x1j95YIr1UE%^HbDy*z)8PPsFBEW_4XFeB#+vZ{9Os z7g(iJM{<;K(S@kA0cT{@I(Mt3AItoT1Z3P)`i*vC>v0XfipmYCb9mC@^mf17+TLIz zp_yo+OdlyCun2f0o09NF>r@t?X|Krg2JWY);eDBnQ}>kCrj3mtD@jmgMP7W2brLiN zpPUrFZOH5ma&t6W!Bi#>1T1k0k$tp3iedUx)h7t173DuO+>mu(*zcDJ5je>%Yi~j> z>h3~H)(GD~444OxkTa#67uM~`Qc=rG#w&W!ZmcbxEF<2Ti+Yd0E~VjIdg{SZbL%rL+*`^aBxISaf z$#5`)gMABVmTkGa<5`~Nf1AvfBJV<9y{GV$9CPdrw|b@gwktsC69DkltAODE6(wp`0SV%|0+{E{lesk5w9#dxE*H zB??JH|EgDct8j0iau_@G6B$WUpS?zxbbOz{26(ZEX<@ae7xV*r=2E4D!`FJ<6+?OC z?7zOb7&CVO4Qhl(&qogDhZN`O8RO+NsJ>hVt_rMBd_$DDUYXHomTL7<-FDA&;_IKr+XMwNM`e#o@B(7#Nm~G z1^UYhiI%rU+v9lDXsym2Ft2eH+Ov4I(dw0>>DkalUr`^xa;0J-(&h!^Z8l(Xb?LCe ztMa;U%Lg+cCqLBZ@OS9%Nx_W$2M2=yCKTa*~q`2MhAMPW;i~ zD|UX{MsqPNo)>p>CWyXE+Pcy(?mqD)dQPOcS$(g#2q)%SuQ)m|?2)l6&BhD(w zf!0ga=*6eYsadSJU~AC+07dS+ONpsc%~K0h_XKtaF=DDBCkG_pekdEYa)4`GucFQ{ zU`L$~-vj+5E1YwZ#RYv}Nv|K=9A*Th_AyKvga*K(ViGk~ZpH0wx8pmaACd?WAPRJ< zRaO`u8VXo?5&i=3xA{r2U&?Y|;N>{l@4!EK6p%4eKt5CP18QMf)ad%HS@>e=_%4&3OW%r3B 
zOu!lCGaE)(Eyuk z%vs<^1!^z|vA(k%=@t+YmxB&8+tg%Y3?*`Ym4lo~bg#+Vx@*pz)Gy9rLfJ!7Sb~wz zbJ)9~JNSgTAiFWlOR4+b6ZOQLZnFJT36tMbL&S5u>Mvq3oS9s{A>SQGvJVKspSc?2 z2^We1MVQyL`=+0&KF6mJ`uN?FM-8I<7}Pt9Dl_zF&5^Y(w1(EXF;C(kBF9YKt0>fPQoh?Q_yV-njN`5PrRd1M)5`UeMB|m>v z1Tf1~0Z#Gd+l|LJ`6xI!$L}QAVE_fKp^6IFtv9PhoIuAhHBA2JKx3T;=rl@xIGsEyGe}&Bd&^qu+7W!}a`TfU0I;)#BBM&t4Hw z@PBd$@=Bd_zjf*Tl23wOkb7lT{L#z273-sY3JX-Iv0Jm~7OaA%HAoRPM>&Bm=v3A9 z2F9n%jYR!7SAtO(gYIWyzagX@IAhftUC#I?OWjnUuN2n5qN0lSmxK4h4`rNEJo9y{mrn~w8qQGlajym(< z1i0(NN;ZD?2{SBCqO0!C<%uYuG`b4q3I>GVhh zTu)6x1lFFufv&qb42#)|IfrxJndu2|dU#Al6J{1eP;p9&UZiVOBqzTRhlx`BUh-w; z`}XIknH8`1i83`{=2JGudn(&i3Y@~MhF!S5&KuYWx$8D6!4)<44-oUF=}%Z6?@tgy zT)zA~gwzg@o1NK~D@|@q?)ZkrJ*BHidOygWhWl++c=PcM<8~Zw-SH!XcD%!Yod?E7 zwKUq>DJxRKG>OHprn-OL^8jo4?vqXdv+nIQSVcphe9r-yU!V8;5;>Z^z<9(j#cpid!FhRnLHo zf{};QgJFgaEB|=)Z3oi)1icS2z+=ivsJ{depAQ@v>RdJsaDF{uWId29fnKR0e^VY8 zQV`yeb#8QE4Jm$S^<7x{`4{>Q2=<91`Lc>eeCxNrPXZM`j| z|KhlzV#5@KWuz)A_>Z5>O1){MaaYtWP5=9pWfXvZ+I6Y{5fA^)Zgjl*7L+A<;1tUN zl+3aB!eD~iE09B_-kyKsGvnW1>;HZ-a1k5r^d{uy@9u6mNfodJ8U^=l6vPA_=!^P* zC7%PP`fT1YYybBWeT%sa{j#%m&MZoaY`>its&SCL_irsbukowK2sO)s)#kJUu7Vt zD~q63Via5l-84?h|NH%)a1#QOCNx3-K>+zP~GnPC8U;`CPA z2NwMxudXK-5N9B^GW!Yvc`_vAI@+3Ei-14zwZXyE0YMH#^@OCr@$nR1F5f}^iQo)C zC7)z$r^1veI9{1zPoYrseu?F;QL&m`TLVeekwwp!9hvc4!rok8XWK@h4>z-}5K-sv zz)wDUpGgJv|D8}R4xt3TRYkj%4q zL$gvZPf{$dO~J8w#VtpZ>t;rrWdBq(TzW;pp*{)|qv&Cg5zj1en!eD|#;EIWBZif5 zk_dYw=CU=FVPNup4Gj{#asg{E8^hv5{P!Jz=L6VpV*M+s@r*Olq2RBcM-4B(u@6XL z*+?1yl7m(_L?&I3z6tc(;!fBT=efpx@n?9BL0-;5@#nk)LPiJv=s0fk(86A@$agjK zT1IO4*UbY|@!D)?FIM0P!Cw`>-e}MJ$A==K6i8m}i_0>o@&e&5pas7Z6=V8W4}rH= zAn^Z!g3z3xC%SB^!PA3b5hBVr%J)N`3t9vNj#waDDC65bM~})({7wSE!g8m2n|qp( zyT9o~s-8N3%WFhFfk(a;T>08HbU zD5K%bXEBYsA|nXrf|#G-!R}-KWFa}yT!h;*von8EK`=t!4 z!s^Gs_dtq}T3CaSoI@M~=zvHt+EN18c{Lyu6~Ci+_$vJmo&gAm%2?6?tQk*;umz~) zYM}SX3HCYo#MhwpSQy?F0!spdx#YqI^h;0(Ki~y1N(t>?Ay_Rg%>r$2VJMYQ=2Cx> zm_A6vm8I>EU{uV7hz$(zZW4J5gO8`QQvTUML8<(wdL39bJ_A#MpWS=F({q~?G(}rR 
z$bR*H`AKD$iA5rPoFzH#k$aAIfCGkHYzx+Msj!|jdfas;ddOb$VboCc^anRowmX4B zdGpBe=l6%@pmu=jss}c%5Ua?E+^Y=V0ivHA`8DdurObh#k})}-w0e;Fk`?YabHfJz zT2EjaTO50O630?4TFP~G*f~dR5Ci7Qd7!#sNP0Bmrk8}o!qyW%759EGdc`w#y%!vB zn<@mBM8(3pl8%5X@Dki?B*5U^buN@X{s|~_ssk%^Qm0JDRN+Z5#EY$d>Ha+e$$sPM zq>qUbk3)*VWl`%S)Bhm}8|;95C=74xZKZ2sY^7)^ZPA}$6;u!3M_!i6NP0S8DjXi7 zJsNudS4rB4MCa@KL)SJC_Z5?Nb-QnbC$&c1YiC|(`3~6*j1_3FsLd2d9j;9`zTXi= zGdoCroWFEDtSI)7@y7J_DtGgV|J*Zs)LQ;^&p@|^Z#}E8L)}|mUrf~AnI^I!?!UbL z3px;!{PvAK^A1Y89R!zA(p4Ru8^h;7&uWri1J9XGij)e?@y78zFfTlu&0WJXTw%%b z?YQ9}=^YTa!Lye&zj!J<@st|j(V$}x_Ri(9>y@vkA$2WwXAPdGNG}bwM^e$zYX~^C z>00qN1-v7G%T4EJ#{sNxi`LIeR&z&)z?gkDZs_mZ2{>jV#N zs}`L_hL`@cN6-Z!r(~kX<$KAroD^dxrqe zwq}E&9E`+7lwrq$ySUC~o?iu0DXz#4;0Vm2@dvp}sl#9lvK$n?fsvqoi>WMcSd@l* z8RB(`a$E*&>U3K01t%r310~G|LIG7>le_q%Kn0n6)#t|0z(F4zA~P<3S0(4V?~k;Z zxs+;7f7Tv)rYG@dU)^AoTz(XPJuw`ihzb08ygjF1ou7H3dfp)4&_m(@R~7Ycvm0>Y za%vj<7zzwds6^o!P{&tGGevFW$_`rpJK68A515x3+uJS=JX!4^#Y;*mi`*3OUnXm& zufU+zV>BC-yyZ{#N4Yx%>}G}i*p#0IAQr)iJF364+Tp^jqlzhv$tn^{hjVUqsyAq0 z9(lPsYXUOru&eJ*fyI-~@eUcPr*j{w)+2Nh@7>@_*AEx=YfkFp6am6SvAeow#UL#KM|b(<&2~f&@GLou4s^LyF_**< z;?r`9Gc3-$ipuzOR^h?9O(EHi6GtFAE_q4@;Ox94vzS(Kyzw2|u6!$QO3dGDrWxsZ zBK)YIEC@aZF2+y0qz}}pzRF7PVbNTnb_*B{ng4Q?P)(v4QF;;~z~TF~RrXU03nhpD z+ZKe=zl-1HdnYIvp6(J~&l57-qu%%`(GIGhiBxRexSi?QpFl;`&ET)}xbAv*pwluG zaB(GLChi_;Og}#n&x-u)J)j}Pi-ER8j}ZU(878#AW!VSeGoV_YCQwF0PBlfzfS4aB9BgQ9TIWbl3Oh-8LX+6pQh9DddO3eb zWpzhVCBEx%A&T0t)9f~53QUUTsxeQ#0gn0Yng!tsk(0egNkK(&3j_L6S;bgn@%ig1 ziVuDz*RI2M1034g{8qukmWe?jW8^09_yt;e)4KcPDMDn5&>FJ(){|>KVO;8}joJ)P z&b_Qr@M&dHZ^S-1Ci1{@CBKF$Fg~RtY^^iQOJ;e0CD4@P2{QJBdK`Bb7 z=4>sE@jHOg+2bk(-g(v94O?8U^ogGi<-`=SiJX~*ttp+Ul~A~~uLp1Mmp^-quv!)P zU@o11fLp!LD$w;=VHB--C}8ty*G(_z6L%Q~Sagl9$jzKv3kF){c;TEJnbRK52Ccoj z{?I1B=?S{=abt^(Lr)=?UVBnMh$#5s#a$ikc8biR7$5!U{s!Nz|2mgwRktZV*;J!% zSpYevx5u`DNRr||Eu#LY)ZK9)t#ntsV+W!R3dVi*!KnPMeE2dIgWPpzKz>DOPkq7J zSiF3h0UNT0jOBW$PtF43G9y|(sevqyw=7s2YTRD6B_suVg z2$%GcrW%-L0y{CY;$q@{z%MH5horK3-hd%6DdOIZcLEIvW5Va#$QV?<$c9?#U@hVA 
zUseoq59pn!Q2Gu8QRSRCQbRxAb(b`O6qKJmcw`#f$Fh6b zkBz4E-zOS2Kc-YuNccs&__Tsn(hlm_ER)P7WEieY~ympU1glbVs*_##<+!>q( zQ{*lznmc1NzU@brK{C9(I^62YymUuR+!gRp_aqM7@*7Ts^|deCS23-0ijf(Z66L>B zUd|os&+XWtW3QTBcVmA1ZQuwv=74?4^ptf-Mn@4P^fgNiKj&CYn9R4+a%l@KY!R&O z%;ql_wARFhx;Z#EoK^Xj?zehF@+cGQJnIRDE4{FH9EL;)eUV4R zydNU$Z2+afo%P~LAK5hlB8TX+Cj5H>@(qA&Q00$cDp9#vaV>5C-u!cKPWR6UNdH&B zjnx#q0Y-r0IL}nK0bPOkF-2>+mi`x63d)MyE}?yTHoeRk_Oc|7Vcf&oJ%SuD&emoY zt&^G;-$7JfHnrRGje6Zc9TKyN&-7!4abynbPxir^1qhyiU>~-o15RlV`;|c0ejA{y ztJk4%ZCB-nV&mR2a-x7iDU0*_#Oh4lo9xi~q)=0iun9AWHo46xA;3!+o?_po}0v*wBsM13hM_f8-@;}J}Mee`Zdz2 zXLzZJD8%W&6A?zs@`L9=NEYvU(iOSE^SzZX30Y@U^HWJ}Beewt4dYt$=9Bx$b`OnO zKpIv^R2mC=;UV?&#DQZ1)lecPa0m_|o$;$z?(v@#$Flw1(Mo*#F63&l!0I?TYcZOu z`VxR0NWk6wRvv`-dD1e5b&_Iji|~77B7KWnVzQlZoMZ4^lqs0ZBOc%qM20E+I>$gn z?;~}-0K_>5M_8b$Hu<=f(fc<$Cv@Zy_$urVCCwZ-nwu-+6V^GpppYQb)E0v*a2rA8 z*ZTm+(H6YIYKEXU80kj!$A59q%*j&t;!9@WD`4DCP|!b7vITJokbiIqtd}B_>Vmfz zOXP(`M1`f0c0gqC5|G!mSn9GLdx#5yt!QjK6A+#%bz1^Qbc6VeZc72ex9-}?J`8AY zcX&Q*ShKnt#(%x)k)soICRu$FN&BeZs3YKyg=9t z^tPDBhs#>$JwpD8=&Ky-yT&y*JJHQq+(Wj422XAg}#oR2YLPa<9>^R`<>OZpRD*$#g}(l>Bqip zwy>(hZqD&Q$z8p5k1Jh36@1YR999L^mCYT(w?r<*8vX`00`|DNR>{7kf{+Uo;Vq8I z_b{&AfWRC1@ntsCd{%@;LfL;MHCFu^@tVYVawrdGa$CusUOVpb`bn z_9_*%O6h(%&R4RDqDvlzE0N#B>Vn?!roow~^gq?1HkrL}ZkCI6^}yNR=eWqg4klP0 z_UsX|DazN~?DyeB-LqBI>E7c!4adf?Vm_ZRqmS%=Qt&;V$ebyi3eg2K5BOnYq##X!iDs0NWwmC_b z<$6_S7g~=$kL=H)nl#Bqg3v(KiZnA0dvL<`O(V=LM2U+m;2cG9tgdRh(0Lk3*pchj zJ;vQR4L@&k`-H#YvTsSNPk!{Z69w}|1)~9LF@`97`TGbycT?)78M%66_0=dgg*3qe z6|p?(tiCgAoghG?a&71*Dc~HC7P1;JEX&zFqscwWW_x)ye zw}}fZ--y3J2F}B6%fd$*9XDXZh6X%foPGOJ21M`xmR;Yx0s7*^d+&;@6F(g+8Wz!; zZmr!FbL@FG;V*HN3F8{uk@;x&>n=D&0U6SCv7~au<4-C?h5=~?FftV2{>tnzxiYMH zro1_NBDuMJvb*G7qT)!RAG9R>6Cg~PUkGQRSrsFh@>TVgkP>2JE*w-eOsHou-I~bM zZ9&vAW$-25(36_<6=DXS#@uROWDLGf2}i>pZIP&hhUyQ{h(-?oU|O=7^nD_9RLk6h zxVV9(IJ%XMAI4jcJM?XjqD#YXv&7Sc6$wlC?m}dAzb=JwtAAsPXsVp!9roG1==CHY zV918$sv$XGO0Eu?>3tB*UD)zB~?wV>B@8*PlW2gd%}pePZLv+yG@bf 
zS>7^})bbSYv(@gOduMATNrwL#-~US_wDAiJ`!}RIlzx58z6z^Jqj&PH^3>@ zN8Ca-y=dJo1ExO--xa)gD&;sx5M-;SF!yDzv=wAu2KXDRK8~w=GhVf6?~7ZiMCDiSr9iq`2s-oco8Lnc%!AbJWk5G z)1Z!xS%I=ZbZ4vXk}PP5Z~}h{RnFO1l&W%rhA8U96dgmf?XKq}^eQTkKj03B!JG0E z;0&2lI~#57wuqGI7XP$p!((`>a7~BGMW2t6oJuPh0#sX`LL+l!1>rKX>D_$ zuGeAeNSi3)j8DDXD;5JMT?*=ITZ8ITmT7%(j*X{<9U&Cc*6WaS7da1YS*1#WGN8vY zqCRH4yDECc>Ud28b@!{8{ma6=Y5BxL%hLw)rw>It$>-KZk_spRz?B1IZja)^lyzB( z{;IAkbpt`FBaojvM&xkhd4@D6TYqA#UZNDwqlH1m2r%{PsoFj;DDZus2e zOM3m5bO^jWxJdZU!x*Kgor*63*QmPL`AZ*YGO!(d-N9-%-*W-gJHP7fBy^ru|5V1T zX!m_B63~b@_EXq zVAk_G>ION-Dikcv?eo|=n0PaHRnrM&gj!xgY?zDIZMuzab*dK5QeK<3oD2~+d4~iv z$=;o^R3OJd$j{$$OYdjlT6G}JQ*P9b^hZf_fW}P!=7W=({(UD4hD_QH2onhh_OC`W zygJ!hU;|W_Owu=6&M(1~r^3~w(fw^oSHbQym{mg1D?Hib$k(FcL*ZvBzQbXy`*v}J^P;;-&m7g?PMw!)QuddiP~A7*Sl zOSU46{0yZdP_?<5>8X@&Zh7#mB-}v69l+BT(~GYLP0UC!b3)Gk z53%LasUsu6Gqp9ZwNr`VXUQFZMlg;|RbTv&BZeZ@#RfJPMMtrM@9|AB!L2Z%ca3#A zl)z(Guc4f4g9Y5MJbK7+(ou#O1B?L*H%%{v)BH2^>^vx(x;#)$a?@F~O^FEwKJ> zpb!Lo{c#8*sVt#A9X5*C=kT zOK9Cj!>M8^Ldw8oyeMtyQUG9Oi*GU}-lKu&E~ndYwCW}pXUVSb%Y}f0v{5Y*46=GQ zP0s7O|5%XSGe6eey?7PNrM}>@0^Q*~ozUVIovqtT(OWI`ECT%9EtFP0v)t*oY3_dP z@euU|yMszOjylFSKC>l2g5#0@YfX`^5V^k-EX2)8PB7yJ=7=U15wG=Yz>KT$$5;H1 zKvK*EFSik_i{^o%7)B$RpF0YW?-*o@$i`{hqQ2*3Sg@^`I&}&A|t@PF9vrr@$x(Xpu%eDyc^!dakLcx*Ggtn zYDseq6b`ucl9aui~%Beaj0}DHX>y+AfDyb+@q5`N>b0OAmFy1}AINMCEX9O`X`bv>| zmSLZM!4J6a=c$#YU4Ll2=zAoW;+EM-)W5`G(j}&D96wcQr_Z33_s8k!Yx@>w3)h&U z3eu1wLFda9NtZdtCVNxcLJYMWadrCO>sOq->wyceuX;LoHn4~HuLveS0o)r2jL=n* zrOXskwW2RJ5+hL8p@ZkSF{1_Kb%Mf8VhUr$Bc zV?NL#SBShD#L}Bx2F5>|FB$qxRj6QD>W~Fg%zbUfnB6|B%sGNfja$^8m)d>#;+OHn zKAU@QWFY3*oWh9oQz2%G7;v4K9RK$8CJI<3M&AWE6LVit!X;d}Pn`YYg;|2=F=+C{ zbxMwC9r2)Oe0*GKGo2}lUcX1*6A`z^-;%`75vJ;6?FQya6=m>3-bFva?2|%n1D$7o zE5y54<^NCto=t?V;zE-NE}E{CFxK-n+O*DuU)S2>%P!Y&|J;*b{x+7&H3!}T{2iC8 zVU{r~VOG*l{ux&gX9u?OQo1Xq!VaAgfxGUndKIT!Z>8s`K%2cZA$eCO1kuwKtEwRVF4Gc8w-nRHwnNdU^_s=}F zZ&$chnc)XVT`)1dxG&lsLZ%NdX3tR@0UbZxyEV5ir-z?s5cUmEDAZ4nx9w&m6^HP* 
zT%J`wYfKMzCw2i)=KJy7OOt)^9V0l^E2nU_UHHcVIpNyq{ztCu{(2w~B2M!a%6t+` z!lR(A1OJU*H^tDobxnk-);DASf*N?r7EB#`Z5YhLD=KWwY-?$>w!>{n`!9F0swt!| z>N@=8JMJ`pw z1F6akzfsa+WJ=GzE$b$zhb^UbyRPP(Xe(dDLV3GC_59?XcT_I_OB(4}kp#?W3 z6P`+C?w*0JbF~m6PK=!D*2TPAiz4?4LO_&jwL+m$_utJ~u8Tut(#!Ta4}!KX!%btj z(=4#^8!de0|9u#EH~d0{OUvgBdkwb22wfo&eHf6p-QjY%G_Ej)_sLm{8tW3D&7xZB)stPThdq1sEh(vkHg&R)z4C|S9`lcB5oI4 z*hso({3p|6n|@%!f26*0RpZrP$s|vufL_5;fQ$66zqtGM+C6(PzRG)h2`3k{SsRUY zCll{hK~!Fk{wj#?_;>%bzx??P8r6WbCZ}Gl=3p z`8zWb1^|FYwg!^SD3gDF_upP=>b>|JU$|fDZ@!aw59CZ@x*o&%|Nq9~2qq0SNf zKiy^i^CN&;Gsq`yW3USOF|8Hxvu&?4SIdJ!o3=AN(D0%D2FIFdMpq zP;UlwYVGu6?{`4vL`kGDUOy+Kpexs;9q23xZnyiR`BKr4t#GMm7JM_6{e^PZa zoB^~{!K*~<21x6wjQp@L@hswGJ(rh#81_m<%(Hj_We{|r{ufIUb!Da}z#wQ~mx)JD zS7PcwvU^YBc>Y_G7>#u@4~U{jxY9lklsKYWPw_vUB2wYi_dQlDtwnf9w6czpXyxI^ z6n+Oh!x+I(MX~5q{k^XK%6w7x)i~$8<}6BnZB>7@HT?-y0X=upYR6*hu2EY{|xZFRiMy-wVaR_k{xLSu9}8Z--owWxOb2JolQPh>SFQ( zj<*|fvaM$Th=sx< zG+$PJe-*4Zd9PNS=#Ot9Uc_iW<21fuQ-5I*5E)bfclz14`;Nj_KOpKZJd@=xMKzr5 z$+4NPAK5-*F)A^st%rAsbJ9YdN4Q_~a|xV2D|b6^%SBQXTS#I3(4YFE!t{?^m%r>9 z;^R>A$haCC{y+Rp25hklLDYkMI&kJN@p2ao{tBufK{*3OOYjbf0_1>^mrMxW;iZpk zGVc9>4D5+UN6WU7-?RB_WFX5A;8s%r@x`FdQ$3rBPR#op?uj4K3Kht=tfB2saA*sNJT)v4pSET1ibcmyvt&juFFhRAO zFeTD4v|_OKxsZTmyiRaXbfbe@1b-AU;F|J1Ic@@dT|a|b(DCUe^VHZs9Z7i8AcNKW z+dcGAdbJg4w+$~TCf|eTE!ji|s$l-{TqDpFQ1YCZ+mFn{4W6PTk&1h-MKW{6mihmr z?Redk2Y(|E5cGzWBCoR(l9YXaG{^1~RUM9Lj{Z&~T4$G*n7lw<`&O+@S~VH|rQmvmW% zC0e_V*TO?7rI1Zg_{yP25X-0>$@vlOo%BFI0i2yPIMR3`J5JGR;c3U(T#tsT_7rLlsF|OIFxhM5pE&mG zbq-*HTUU2wCNmSh{%+{bq-pozh69QcObXBO|`{B+0}EXErR1;-@vt;1b~ z(}wg~y;cbsm&Tm9glRoF8ZU7N23q27f1*41T|M)-@RwPytEz8pXArN?$*Eq`=GxFa zS@rrm&9dWUs(CQ%&WDHF993_Yzm%tSE^n;a3YU~J=p@zax68hfaHvpkYsr85I-r(% z*%1;E+3-OUa~{yiOn@#!+QpRmhUMUuBAWmM;~i6LbY|Am_mwwATfd)THU+q{Dd!%H z3C)ZPtf?E^3eIhdPgfBy5LtCnNS zBV4D1mYKU`3~!hO?qjcwm)9EJcrY0~P7B z(t2HJpELrU^8{pJiQ|EsR=EOSHEF(BLH-lg=Mu8VPMY6YRZ8cfKh@bKbq{f+j_M91| 
z^l?873%b`Q@6|3kgkuhlZ3;lPM3a@4MNk-OoA~{HeE+CIYbnT;!5>!e7Ds4f3r06dCz{Fe4SfLVYBhVQSvaYhlWk~O^fYZ$krJ{)-DeR7(upu$7g-I zry}voR9Sbi3Zacx8SjP1-@!zaddP~2p1f3+&;51VNk^zZp02G_)eWv#Q+NDpEc^8; z66ahD=MU>`+$gTsbZcC#dc9m)Vpb7dDP6+6d+#B&U4Q%%r=2W-(1!z<`4j&xFG8Mb zWu>Mhv)n>N80=WJCRzjnzy}+ENG*P|GTVIt-khe!&f^32l~Z|h^+!MZbj!855od!? zE^xtma?^sch3O*MR@y{h9W;QK0;N&@0x>Ac#MNux};8$-6I%QfZlD_uQ zEG25@5GlG{#Ls3)?|6~s3snT$sFfkVV&qX{R?}tEN7_-+_itfClHeFX_qT2)i{_zJ zeBGtayvcAfi1kFzZ0sb6(H6|q!0!tt*K|0n_m}OR0u1Qm6|n=B z`Q%{NGwPT6ZyUSkvaaV6ZDa9=T|%9xbU$t@6z$m4a9H$Ppi!#4&_;7tM^z)I8Ly={LRa zi)w3L8e+7XaxZbscU!K(@6*1)9L|fR z*SC8BPT>Ti#yG2W&YCuVPH%*334S%UocD#hCIYGuN{YD_k4ZHzj=0{yIh4NR;uc&- zaHoPMCajQqw!xTFi<0&h%Oech(uwYZn2i^%nz6b)Is^fy*b%~VL_rfgYn3*AzQeJ2 zBm7+HKwg1%((<}WX4*_oi9SY^yEhF74-X1YrEIp@ZtVD@zSj`MHq*}2YYZhte0yy% z>Ba7Y^f84Dhamhj4*%=M2JnU=y;(7XfbruXKVb8CgqiQ%$zB-fo>lk+v;aG*V2!O9@o#_IpqFz&x*w zQByZai35_V6^=t;aB{9hHtT}pO+3EIFYpBC%dFhFq;!O~=hdSqis{>_ReapJuHwieC~ty$?>Fc~l4NTsH#_IRrIe(V`IANF_a|u+I3#UbOt2oG5liv&i7f~aMF%PaV5JOUzr`8-M!hp0rrC9q zuTpuk%v%sVk;>&3CZTJ0FzrHg`}rU4`r{Yb0=3qC!gD4KjQR!!E@$M`H0zb*Z+@<= zbj#7Zbj`x;3&#Xs<%ri}Da&5>=nWlwbUUB}9i)579+7N$89xT-glww`r9fva|I};8 zYN|Hbu4ID31mJ+4MY;&1)k;n`%`3jqLcpRx)#_{CTNmXOd&#r$M1Nn4g>6l!H=4TY zRYbAf$R&C80t-_q#O5Ava~e{7C^$mv4V-F$%F>+fJA^SFcW2xbn9h4pF=043K&L2^yHMa(w!bp+Wzzx@ph3v1DL_Dx#mFlM!{t=OFI8I|lvF;@=3 z)*fv|7GTZF^p4;2yVh7`KWjz5HoHMC3iWH-0)*O7tQfgCo#Nc>j$-o$rcMjaQ}d-+ z_$5l!=hIN>y`Esup3s zW>7JdV^ZL3=pnC`-Toqh37CH#J#j%YT2LqE5HmViHDBb=d=Qg_@Bw{;iYXAyFqeli z6}QB4c#{vgTJ~4hG_ug&ysX^X4I9~7U!37>u-ybjYYrdLPw&2ICa~Q1M4(|R8_J?qc}`!Wl5YzQ|pCZc-k+e%B!yxm*dUGgph|xJZU2ZV+0u$_?C@` zC5~cRI#L#cS>AGpqI?Do)Q;>G9wPG3%U#5Tf^Ku%EG{bkTkC~9QA`hY@nd2V_N|7! 
z-Xz!ShaSyi{S?i!Rmy7X)^QA?k*Dp^#EtueZZU#Db5*zd*fNLdZxc|FXzcC~L8_Hd zpHyx2P=Z708)u29Kw>I24>237&(K3T^8)L-*^H_gf{X&3pR(iDH3g*!`){n1lPam< zwReyD@_b2DsLo9y2*hzt|9av0?p2c4yHPS@1YsCb)qk&$3WRzLby+=c7v8umUYFa7 z*LADc1SnXwg~gAaBn1b~@8EQiZ;h5z0NKd7a*8Xx2n(02uu7)#nNNkc8$M0dvU$(4 zj#0V2RH^XJZP16!#<*n8Ta=j8RQsr$Z~_0q(bj%AB%V*1P=gw+<3g!d3kO6Fdvy(5 z7hoeJ0qFE{bY#2!7P>;+u%8l(s6DmF*0p7Xs5PK&B-O>5J0_crFtJvcL}(pd+YTrB zPS#ky+p5Hul;ckk+4{uBKnK!KcdF6nhaUr1=)!OTlzNmNL;e>p+5sry`O3CiW#>h$ zo>2}j`?GuDPZ(}=FO@TH8Fu@9Iw%t_TWVK$yJd%+5VlboiY?qO3OJxyQAYkyUZaF4Cds0H0VFs2?^Kv3M#&v`l8qW4#)v z2A%6|y2YgIlVq<##MoquXwZd*UQ<~v=o6;Ma?9q~R&RuHBR3P^DH@fhE?RWFSO zinXs|un*_Ae1N4;BgZT2Uw6($+RjOb)5@;;7MlWz{?~1K!u6o`LJfL8gXNW+&cIum zr6N!E_8#Ui|Ir)v0ePxGAG% zkSi=$w&>y!%zca6%B^YB8`;~)&`}+Usif13uJ0nnT6i)?YHKNx$~Gg_Vub1@%4NCM z%p@mACE)l$xVN5-bNXiRcB=4Iah4FhnjleuqFgEg znNT#g*TDh=-(;nFgmTsHCc_LJdykmwX;;cjGw9VVHNcA=CVXl4C}l2seDCBR!-1dC zfh=0Bl|eWU0PwkIxIH6Qmn|hd9jn*Bg%Iv7DcW|N=mE!DnDvy++Ttj8K4^KBqRIHC z3i$5?=QpJV-0NO*E)ZJ$0RxKP@u~1@$scnnIoaU8G=&ydqO!;kPO-PdsO(9;wvvx1 zaI;@-)qWum5D&C)wn7cnJwdsIwoLS_N-f2H%4LLda)n{i9Uk|`+&T6RMXFtOc{i6DHcXs-G+J)Yaq%z=9u%5&r2>(wPP?pZ*;DbI%>_T zvh)yG$lMC@;~1RWcbdkU8MK7rL$yIRdJqQ1(|gJmC}nIP;?kvgJay zvSbrGz=sc6wY=T#JK&b&q#s-C+; z3h-vB*LLumF$zs_8xvV@?Q)lx-wu1%fm{q$swMh0&mh&S?%CV6cf!8SD-t|DyGTqOIHtcdS7dgp8`3o8ffj8#c#hmxgrsk)K5pw$p0y(F#r~frs z;D4z0{Xi*_RqIq}?NUXZL}<=yAbP}92DDWK3T~e61P$UorN}P4;c{y%Y%N8@KR$ED zcjUxBP2u6t=9(5&D?b-gD1I?!iMS6T#HgO2BN7zjC`Xp|)6 zJord-HEd8fzpCa#mOYElU5q{U6uvD`$dAB{lpkMjWl3 z27C)nlSd*miMDcZ!%yoDrD=4w!gm$E&k!(NBU2jVkjr#y&r z-gW7EKDQtNa&6D@_$bK!4;1qXJmFl3Fl`yMm$YB%&a83z8E18Q+%V%Ch1x?gE0RRo zk~lL}*S!o#CFe%&f%{;hl>21>&>_3}dZvbXjBKh%xHWpL7}=w1oB&P-&x#1GB>9?m zM|~N+R&QVZbcl#Dn}MZm?l7V6c^l#=+U8ZG-!%p(IEGR8%Er)m?+Oyc62>_t>Y(iK zWbMdI;&fFu71?5P88SO!3PR&aaBYaV_!{!(y+27iYmD-x!$iRClK zh+_NY$!2h*LXS)6#JCbAy~A^N#$3p)o7udrp6Du!YnjvNib=ltabAhdiiA`7X}JV3 zJ;d{V3s0z+Fx#(pzBC;Vj@xYZUTZ|g#~|ioQvB<3#oOa-)bIX*nIek4phIY|Wj%hX zgznae^QyM|%h)c-J#sn`+t1d3xi>diz_@K%?Ce#wx&)(czOw|r3vv;mFh*Lfg98(m 
znYdBTIYvEuG~*-7a^+<6mArz}q-Q;7gwJ>HP?XS4-`GBAx@%;u6D1! znG_&<@z4oF7)!I;uze=|Ks6nrJKg3is^u=$E^pD+o;eLW`4gErWrj0r`+lUroRyQ5 znjWwjz{#?bwiA6-i&R;+KEe>bjIyc1Y|Kxj7R$mBn%^y)<8!ARGT=-0wo|!WZWjHU zt%grjFmoz1x+N~S9ipDc>r(7XUdS*AK28sst2#g5x9OlRW1(T++V1`9xN*_#6)w91 zF?gYL;4?L)_?qcN=`X*eU#NDP8nBa~=n?eMhvPbWCG6iISMXz?&5_v=omhc_{rviV zyQ#y=aSr)El)6CG(D6p@&51He<&*!m$2)QKPgwb!L6pwBI!!pZfvl(Jd{nPF1#fX6 z`bY5ng4uDIJlyr}YaX}z6E%yyN}jX&(JSr-{|^2R*L)PoOt=GTvNk{*!@d;)Pf+pg z9swlOt}kuBoHD!USR~aR8Z?+3KJIvLH}$suZpfx%B=G5iu8aq>!Z~Q(aWF~|+rLLH zuY~pSJH!AGX|#d6P$!fZqIEtZ6bQACBp9WBV*dAE-!B`k_7ubw71!>m%QT0Z{QHOg zIhrl|nNOVWoPB!v%&-5QJVdV~kJ#O$`pYZR;;1}fHCF@C{{{;sJaG4+Dq2yjD zKGEP$p8S_t{y9DRY=IqeicCTm{`peLJdCUO0ZbbRhlDo(=sI+eUf4Znzu3*fkDd#I zU9|RYHsFlOGZ1SD`~_wB?OpUw2S%~z$DOR?+528y-u6%~sC`I2ipFxMaY0QyGf>8E?& z+P8=zZE+9mL1< zH9U&$k9G<0ecU&k*-yrBJ6E3E|J4`t=4b;;bwKS%S$hlM2(<(IJ#0cL@Hyew6H&Q6 zTv%qR?^!dA(Bu9l&$J9R89pKPg|1JVh(ia9b5Bt0%%kuVsYq9p)K@Ykmgw(BLRk-i zti?YIfBnvGhEAM2xwiu^XZ!;cQaORzNCtBZ?aE)4 z{}8UF1StCrbV*+QrW2lN*E*rxWj1EYmY}9D52lWlu?ke;`XwA6cP9d%a^fiQ!^&%4 zkfjCt!gn*9#@I;*!c|NpD#wG=<^a;bLb1r>^oMzpP^Oo=x!Nq(>b?OBiR z{W0&W`OpA%eNE$EKi=<%@No35-CBg4S`jB@j`I(el6@D78b5V|m=ZdBTg<#Oh)uro z(Zr}(p?l8FGo<|~>9{ZKABJE(fQ8J<{5EUZWwg2vw9OG@qaUyol#qA6zMG`e3kbf+ zo{YjLBN=}m!gq4x3S{7k!cAYD6KzI1RDQSdVBMY~&OhCVw8c@w#;=w=YOF|qCE0I( z=}b!1um%gKWlPcB#(k3JUXy?k!5$DKsB{PDw;M*@H@?h$j+e|ihOJ2b_*8XkqVOh& zcGb?ee|Dj-)RVQ<*8f;Tc<#}iuDMrO&Q>nW;n$<#=kO#;&=CaH=2Al>q%#w}%^rRL z#D~u~{?(^zK#k4i#6}Jo2kPK`5X@}!(pvPGbNd|Y8mKKQ&?4w34Zgt8&JaDD1T+PO zmIW|$8)*4?+Y^cAA85^^?M#5y#M3%tK#b4lIRU>Ut5PB$0yrH!ao(yw--0{8f-UXJ zD@GQhEAhRCPpFUUEY?8J6>M_`)hH3W+JDQ%7_AUdN=gRi4~d{flj>~j=jn5sihi4` zR&Rs${LJlm&$}icS8z_azAJG42fz(4#COT#2MSm#MDLu!klx6le6cgh7lL07yURw_ zcrTP!D*zS5SL7@(8knC3zQ0!>k`T&5iFbGyM#L~<@7nN{#sQKtuz2f9(_l^W13hZ3 zXto-~p~`71v&V*EUhAMV7B{vvt#E}L(jKo+6F(i-VHlv~mW0ee^N!GM;#5cPPR2sP z0uCCLWA8}|g~rY3Ta)`b?dMI^$&MCvygFM5}x#*194=O3?6GQ7n31%0)% zc-EaBcG*%5`(3snvn4GWDKD}=*{ng%I*$Q8Yoh;RIQhx77uq0rbt{D7N7SN0AEKze 
z2M7H)4rPuf&~AdQ)lAj05RZUO7%Oz64GZm?#;B~c4N7Yi@c5*Gtm!z5?~EYYv!^Wc zVeEOmsWQMaydrs;!*zP!`W%;0as$XUIk?Bu9pvNm2nCd|HRbOwWpu8iwbuJlV#t+P z61;HHja)BF)~gA?=~<}CyBY@G>AT99TXaEx@rM`nAh&$m2Y@Ho0B6s|vxco}_E;~? zeYp7R@t)yH=elNsi6l-FVq{W)?{Ha53MHK`fMYWI;T?r_Exa+`=0R7$-nNs!+TNSL zfm^93#gnV<;K#^@^-WrgVG_{`Wi$RRT_uwl@y^n~2#ImqK0v$%5Np~nASVfHuy#VJ zSFW3D)RGTy^v`UY21U3D;x~}B|1gKGCx~Y*kq4SATUQKYYkB zYbRtz0-i@n7x)9=Lf4a@!i9?MfgaminmiL}0z1A>NaUCeSQ{5u6_pAJRU(UL7X3j% zKx+O%!&=ncV9?A92-MPJ!IgY0A>DFVG`VHpo!Kco@X@(ZoCB-I4{l}>oWfpNIIGf_ zY&CHhX}Vfc@};vM1vbBpC26652xsdG+tp_nUQ-l^>2>c*f#2#>r7RonKFZjX zhGw8CROBLR7^AR}fSEyrx@<2m%utY_Nj=n^y*%NM{&K?9xB{by7c}=XDpdlt(Zx`3 zS9aPpaS70iWDC9f(7carj=mdsax^s9SHV6BzDuX}B7*&2B}ObPS(@#DrXRx#Kz=o1 z1i9Fj1W}HuTdc;nN&5c?u`+|~uR`w^2axK`ZE(}Y{|}4dE$m|^FEC|zDxsf}zBwZMf5z zH+0c(H@|%Lz^Mv`hk)|C7tNI|!Tvcyb~Uc${p|;iaWb~o4n<_a_5Fl*H4C<5({Hnn zZr!hfXIyu^fDQ+6aXv$&+#K}g+_q^%IQ)}AfvJ{eL+%tLn&790o|udMzK?f)5Ck6w zSFA9`__euxHumd+q~cUhU@(Xk6Y$?O56~pR<~E{Zp1%}j*H8YK1+-Pk=}=ErZ_7%B zyR<-xT1T!VyDJEPFqbjzoCl;w4#zvF+lNt#?Q7~u z$E$sZn+#d!SuSV>9`2glCs4WDdiYwylP~l__M?E%wt6_-9qm2j^nPQAdBaH3*n6Fa zUa+cOv$#m|xg25bQHrku_{l847|V{{9^vfo8b3owL#WGIwe#jgp`ZXY=Ff|91?*`7SsiZ?uO zp(wVi$4saFFaykEh{^F4Wax@VZ~z_?8iK31GcAU3>&%!**V41z25IJ9geQ|Mo_X-MpHHGAIqyh{dS#rtePkc7;Q}| zKR&Zv8SqULKq0}kwdg0+N-eo6LyAc=dPRBk8ed)p3iUc6X9aywW@VTcv1eWPRNzWf zb{N|(O5N)Rh+qqE@o`!r5BXL7*$tlkrR9bg=?q5aO zhwR4a2=Z#pYf?s+NgBb2QX6|)A66RKUws6u3S^uWzSj-okwXcATX(y97*) zLqEdGW2#`AN?ulU6u|bf0K_0h|{Yj7Bz#J6&pYN9OZm|0pW&z8=*fiOK z4Cd0oil;AJClGA#bGf?&uybgO}0|u#xOg2TZeD?u^Y?zKzZHq|TeS z59Wu{MXcit@8PqNVhRt91G53aJf||8`{g|aZz8w6T{H9+5}OI}K8&)@Pg9xsQ5Det z9ofEw){uZ=$mjLAbqc&R$qALzn@|4guGb}@SLGQm;M1(FplL?1JRyY<21KT1Pn>;z9-SycRI$?Q*QM3=OuQ zGuH6^8CFxXpFT^!z16}~G3#ZzGKYXvQoclLa-t0{w4 zbmXe<`JwUoCX|=0+Hz4F(47};A>3+Vp5_&PHomxO$`_uRID%4kVpZz~j?F~$i$uw# zs`;!%4=CPrq0rflQbiEcro2>+A{NUIyd@Tu0pei^?~KC}BBWmczcRWw<3Eh!FUuWv z`UsKi`Ewis9G&groy)tmG%s0HNd~MO+>@%>7L8{~1>s6j-evUhGkHA+H5*lc?|r4E 
z`62Cv=9@Pw)o}2^556T;v#B9L$ZXDcuecSsxWx{Wi)}#$H4&o3!c-1oZh7nC-C9mJ zdR;G+Od67Gdw^X|JJ1&|IIi0h`JjPUCobF(q(Sgvefm-~vey%Q&mj>;v6}H9CazOz zn#bG1^v3Y@ao4qs1^jKB~%u zwedLePTIn`jm!TC*MC3!e#-;x`~qL+!9If|@9WVbAi*qZb>hD`!UW}3IWqbKgya*c zm6pvlbwDVKkz`6g0*)ExxCD-aH{FVjl~aQPUi9UUKoX{qh{#rb@Dk zTrWn(R3dFGyuTi|-CJY#EETXUtw$lGHTgkd?OWaud>_=1v-9qy3f%Wa3r&n_;`=P` zL_w@GL{^)2c~8Z)yNlm|nr$8B>TZWRw9=OWDneAB5T-jE? z4wFeTFV^O)pK3_d7ave{CeK#M61M}4Mtu5bp#$a~uS`EYa9bw=rNwLKPbzMUul zr*W2ZT^<6I!QRqao==@Zm~J+8A8jdHS$ag&jlfFG!G)z!E!t(nnb?}gbe~gcCFk3& z5l?pBd(+^kiOi+!I<45NEty2pIASL^0;7e`1o(>c5g+tpQKpy&KuXt;))ucInINIi zWk)D6hxzq`4eyzMzZ9P?EqklxhT=Ld+$f$;tf1dxDp+RAy9KHYAf=-g#!Vk+1MQm! z2AZz|4D@lIgwj?dZJ?9f$rbL_OQRTwP4~*^=+|r5ozNHfA3e_B{j0B-@h19ISabaX z@BX&7lf7ssr>3zFxGuPSgo8l0!^d))0E2qe6IWPwWuTyG6z0SJAfh+88bJ~Xc*E=LEkn{vMX-wVoDN%>$Y8?)LmWa>z5&ba|ad6V^J!Wlx^T3>J2Il zEPUa#G%tpCBq$2h*+`lmmnrcf;#{+buNv%%#*p-lpgE&ZR$i53Fe2mC1R#;JOU2!y zA>18fG(xav(sPFbbUzI`y&TLsMO^Fn)~}beveAp`mpzm)?d)y5Le<8!1~~_1hM4sU-|&nz#4oJ00J^-4)PM&7qo$ zK;q9W%0ML^LaP1xeaF_0L-g`2QfDL;h#os)J|DF3V3vBxyi@Id+tC#Nqv9F>J4hmS zdCWcKOjO?Sj#!dK3#pTBdH-^bsEk`{1+C&Rf0CdSAcR@nJm#~`h?c?uOjmObST6jq zx+QLOD3JRqjSIKLx-6iKR!HML?-+XGXF1c5s`LODCH$-j!tY<-6|rm|?$5%b(3{Xd z2Jp!%4X0=617Qyw2KS<^x+TL#Ed@j|nwpYRPMC9yUtRY|M@UvY4N1RSN}lq?U#RO< z_1}8X`TcQOZ>NjfTy|!XG@H;h=5k}JxI7PRhVhQKl5>F1&>mJ5D`!wfwn9)DaV#CC zMRy(-LH4hkv#L+YXj6Ja#?qtffX2JLOEsQ-y~uH>*vh*-PHy@%-^Nv`JpPod>>I*w zQnS+Wz(W{!VJVG{Kh3LPh5fkLcf){|V34X5FpN<$m*>NKZ!DB8k9c4dBice%l-64r9x5+BM#-jbvVv@4<6om`K*OWmNaZ zYN9^r91ujWfD@a-Cw3cj&79-SKMHu$2c~*vw&$WG@t*xF93(&k)uB+u%rZ(T8zA*x zsT!+zKe{Ic0sECY`;RLm&hls6Ect>U8BbucH;xZtcdnzZ$;)L;!MhIW&RgFF9;y@H zKvwmnCysWTnxf%l;9hz2W||Z>Bv^y%5DlPFn}Fv^aY&Hb>8ZN_Et-&4_PkU%jy^DD z3#2p?ry?;)CQKX;-ysFw0q03BbV8`IcQ>@X*xcXwMRg5+`Pxk0Q@dSlfV6w0#ByFBLsS;wjRSvY=- zqx+oXw`MxlaBo--k~kaZLDY4=WmNNQUuIvmUp4)Ua$pASIV`T>Y2k?*1@-kAY<`+= z5&G-p=*D)1OEu*i;MTe8M=Tof0@|26PfwJSGW5vR9+{&~saSiEELJzREr0ML_jnHD zo6u`%jn6tXDo9lUS3+a0aiG4ABm%nU3boA{@1HRwiCM9jF$gW7G4rnBw^g6ThgQzn 
zAzpVh5*noc0_fcp*v~e*gwwzB?yY>D$=AZm^Ws+}(|ZnTB;&%XfxETzWsL`g(rjGs z@ufZ2WZ?BFoNcxC0F<=(`to@FB4gOIeZ`fHeN1~;wBJQ5fv(`yLuC%pHF0xX2YTQ0 z0%-Tp>ZjC?XDG`h!|JP5yz_4{DhJD7R(m^H-A(!)iS+@eYJDM+(H{v=K&>ZUA{8j2 z(k*$PrWD!6mT)~3J|&A{YcM3cHDqy<# zO4N#651AJ=Qv>(w+i?;8I1A$3)`phazl40YM531rqi{=q4TRBQ@my(i=@fHm=_Oq4(wHUd3J*N#t=w(&1!{C%&*ZBW{ zq89J>>?l5mJt`>$!ciFjY$tP`@Ug36lBncozNXbva^;8O_h+=?0RDiTa2oKXhzH<% z1#BWN_p-`PT<#~v7H32*SvOw(IQ8_&&IP)?gK`L#uz0cc zv_+TRA(5-h;dHSz`=sOOy+%3C!U3ky`1XCWTc9#M z^612$y#K$w_U}K2?cJ}h6?R+kuL8|qM`7|P_*e=j8Ge6>zkY(>ekJhCZn}uDdpBSI z`eHxncYpGwihjR-i*D2I|Mj(h4-NP=!Ly$^9`zsBz`u-`nlt!V?riS6{{AffcS`;{ zC4X-b|96-Cf8yW@?9ddw2}1;e?iFzbegub!=O(!~Z`b~Kq@&Un!lA zbaDsu{*%*y8%n1ePag2x2Xb((Bf>#vY;{=R{Z!eGT=^c)^!_@|I^4v`U-)d z2hXD!T9$NvG)V|b8N$ScNY)Mlc8#RfdH8A;jTf^~Q%6x=0hTF8uXq4+ZZ7&$R}4X{ z{YjnHj<5;{dm4OwJ}IEeaF}+PzsAn~Fdm+dz(Wj&el)4uG+rUN1SoGQfW9qjhU<^Q z{DITE0&q!V3JxzFqS2<<&DV)L+e91C+w=g!#!>|!2tA-cD8B<=oxQzKw|mdN&%A@Q zHYq21-2ezxrgFD|AJi8b2Wz4Z1U?A9AMc`J(x3zB)pZw8Ld7-S(b#(^ zOKfRQFJL$Um>WIk2F3cJtrgWBk8hi1c}wlKdE}f(hM7NblKyI5>?8u?n1A$}pBeI? 
zw7dArjL`ZeT_6fi1TnKQdT(||*GAlCl~>;ts6y*s>eajm{7njnfOwLI{&a?f-SD34 z{sNdzt-#N;_#XCyOpP}|nt_$q(SsHZm6}e+W6X#Hg5Smd`HX-4-8dw$nqC-4+UvzY z0btexfZP`fE~sw*2)!$64Zt6p#gQr-5E^BGl=2EX?gL=9*Za?-XPbpkFGuy53ean2 z9z)jA$64Y5GesdSdWVs*6I!>~SI`D}rraSXN?X6E`u0K;AU=0>fWp2lr=a#eK$P$M zZ?B-@L00j3@q3AWAghvztk-O3y~^6)T^rJTUzc(HmaQsf$aRCp45ATkfI*dA2k_Ph z94carP||}>Ta5u*8kM@9sYj=j!HRANoqsrz%)I&liG?3UBl|EMJh8v!0oJMFF}9M8 zl;s0>)6>d?RT?~T<>j%%j{e_1A6UtA@oQ~TzaRhtRK_UcU{S-?(o>TzA%Hp% zz|+RcCEL{WH3Rh3`u>l% zDanUe@T)-QCAQ8v`Xk4Z`aTxw2XDTUsOM{5Xm#ODW_og5SfWAO>;oldb1o%MJI7rO zgGG1SL8r}67VK@QP8^liJ)rGh+hb0c9dx5vZ&-c8Hm@afTvP$YC}HR~lnz(L=a#%I z51B6EB`soMj=shI+a1KM*K=SPE92pvFUkY@@Jka7m-zUiFAtZtf_bctaZ>>ZfEu9wo+sk`aAoWE0Fb++5MO)yQNS_ zS7xBB)Z7M)wa6#(L6IjU{IHJpeeu@i5=+fY0c_Q5-bvxS9zsjuw4cHo5 zbP+T~@?wDpb(}#}3Uplc?Ck0Eok`?m9y{v1wEgHwhe2U@JZSW}!H;5Gy&kG6Pu(W& z_@CUZC2Eq-{N!Gbo9b!`>v4MfjdXBiGC(x#h4X)jbk9UemW_kN_4Y8J+ zSv3=e7?~&w3+lgro!AlwNYYVxS-vdFS-6N^-d==(dCRxrJ6~Q{J-fuCap^Vcs1rNs#H;)HzHL`*NX3v^%P%^fSY)-EI} zfEXYwC9iZ@2DPE{fLV8Iy;1cb!fT_$rq!M%&)j(#B0Ff%1#7e z6xwXBx$MZ}PX-Zyb^0+R3_ka6x?p?0;M3j83nxgzmN9S1Lb4H9PzMUC0;8(DkbQ-e zf4dB%)U{R?;`q@D^yStlA6TGgz-DH^{r&<=K3=U0Zk*<>7tRyEFOv!aAS0?u=JD_a zQ1kGu#3kB20{bp=b`DfUBTg)XQp{H~p7X9BKY&)j0)Y*;+g|n`bGl1(U zR3-K2l8m`6_;*sTXZw2v)HKnhm6Dr%4H&><{FIyMzd1Ki8WhfsOiNN<{bYd_AL0!P zgUh*v=Iybs+o6u_$YG!q$3wgB?n&wm4vz<5&v*wHX|2SrE7dBtBOCT&L>BbA@+o)j zsoQ9G`Q!nNc7i8L1;NdKxC40c0)raH;P8lS(77`fh&iR(P%PWsL1z6wH!7_2Gr5+G z4jVti;|*@PY1&qPu#Yd=b><}-SY$v~CuIpPU+F1*v<^os6x@LE;0oLK)Cg9){zWqC z1hjn#E}(p-P56*5`6FPGavOASydFZ-KsY+S6$=&T6`z7UDCY#bM+FC52}>xTzAayD~XSu6rWp zjfpLxdWfPa;rW$72Hva(!ejW9Z-IB^v%C;Auj@PR*U{?prtP}G+s;@jhxc7C?==X? 
z$fxW&(ZRRJa_fs_3}D=pDZYev7+~DwvlZmxkt?Qns-yp3*Zux*Jn^5^kKXmpPVX|8 zRc;n!j9UTG;d`Z{*KsjBq*fcK{GKI=;)RpY!s%i?^wi=o3|zb^Zb?T>V=1AWju% z?KAA0fsz&PT7KA}1sDxaSVV`uCerS4>#whQxS4z8?Yqyg@f2(A-jCX7vu&%}#@Fqu z5m{vrO3WbwUJ=_e z3|sIo+iXKX?zP*BV_SiwgckLqL_(^|m-h;Pp?^eeP~Ex|!=P~9VE5a;YmBdpFTkA( z-~HU!Wq#+O|NEn=tuTU(ZQl|hB5;Sse{9|YTkS}j#?zadAS7#)I|LD%ko&yWe~;89{Z13HJN zTEoe9XiUtxv`JUedmjZh;Ji{}DV`Bh$qSlgTo3C5yE3Y+Muy}p=?r^+E-yuZMgiyD z1U+16p=13AI<8SGb#C_K&I>xT+ucCw?ApxgzqDs%GF-l@z;|%Cc(`O+9Z_A3orf)ftfZu!Iv%Diac&1(Nj3^h(7sy01iaRLL>czB6TL? zD+-6S90cN#e@Kk(;!jB=TO|@xK%Q<2Y%9D1Yn}S3W6Z?plX* z^hM5#sC-1QDLtjz15%?c{{3GdnL)3k#7vJk=bud423t=#p@NN3$AG9B85CYh1<_s` zXdz6<7eFyyaScsT6BRGn(USKM_?q8i3)C^+URiwgaSRHk`3n>?LnMc`ucctU>P|slBs-+A(!5 zBUe}A)fj3(FwqW}Dq64cw9LmO%--hj1P>1^lLdEc&C50E&KC(N^+xnN#N=6SFGo*g zFlOj~>;_5RCu{GEXfe_?(5F~w|5&wtOi%{322VW&TmM;{oMp-O6a$UPJz;V5HY5cel z@DLf!z(uN(j)Iae`H@DB2AWbHE!D%z)>WPD`AkJ(v8WFb3-9*@#_SDA;R~B~N%KQ6 z$E5ev4O+%Et9Ta9JLims1 zIGpjmNB_p&Ds>1v2udB>gu!cGR>y$fW-LevY5IuZhtJ(j^$RqCiK7C*b7X+SgJgw$ zb2V^t8hyzn*TM{PQd;tLfj{U;6^I;6n!Pn;XsNPoqSp;6+4v`Q1~8w+k=Rn-9`OR3 z4fa`5SE#?N?DfJ}I+$$kAOJ+}v$GpXbZRU2SL0U7qYDd-la{=iY!^v|Hf4s?6Q>RlHRq5HBwiyZF~K*7Qo42c#2s~Q25e6O3roU|vFEe8^(R~D+12mdx6nE{GK+1WXve1Q4B^H}#G3 z-y1RTym4lrvPeY}D$+T8v5Tz_8t5a7*%g2biPqTXsG$Yo_{ zo;u&xyR{`_Y4o)A}NlF zul1=VN&fvdgcVH~LAMpCUS^@~eQ#R+#|>spG~zg^Y*HjGKo^Ig-A(PhxE!mP=Y153 zW18s}rtW(=gbj`KG=`&pT?MrvH-&A#uEhNM+~C<(j%v-L$2op{brte|nu$3A5i{{^ z20)>uj*+V`xOo~kUAyyUzZ=Z^46LT)tr*wo+O2uhd7>AtzhA0GjzsX^9aUt#_kpg! 
zXkV=Fg$AXtG;|C>Pid*Z5{ntHtLtLhiTd3GitsdG0;OF+1UjI96Dk-2+wiGapypob z)VM^Kje88!0MLkl<85p39Df*s-)ha2V4$1k0g|(*sA1;)DIqO(OG7(WvX{v^hrmv7 z`n8%fEx*&-+ty!y9jx=LLqvb0{NHI{K6}^gBx9&}kr5jKtif?nPu!G0LKPefEn75# zmI1d%=ivIIxDL7`|G~XKQV%#c6m*hl04RMCumVM(gMF-A!d8jE{h=BEQ5_&kJD&x{ zn>HuJ--O}gc>p(h=;^pEi6O6rWA^|iXR3R}3}~$eXFkVynpYWywXpTM*Sn4=^h`jUHu6nmy~UbMPAKnJOD+{Iz=<)v*E#I~=B4t+uri zVb@6dz4EBsg@V(5Sg1=ee0BKeEu+^00;*{DBVgsLg{4Rt;%wV*H?$lh`96?BSp~yU z!!SS=FUqx)%soACaRX?$jyY6Ax=xCgO%om6-5w)NlI6e)e0@K{dX6hsg|(;GryHTc zF);k%Q>-ym1AxyFLIZifi-%SGG^%3%ws;pnS6O@B;`OKgvn8poY?_4uD#k+d@Yj9 z>q+fa)!K6mo&|{$v8{$0w!l*$^--^0OlL>hACWdv~ zh7CDGjAz_y>joAajs+{o`=39cgQrOo=qq%lMv7vm|9x_ma{BdPl%#Z-#2Uw z+TLV*uWMLnJO0`Qr46(YA?F1*rZN8f>#J9GmbDs`u+ev$+?S%(%FUj^h1(-3TyuDE zD{H}BZC!H|XO|B;`vo=>Sm zYktKAJiX?H4$X||hgy1ElZHy;J2nk*T7*Qebfaa+H^Otc>@u&^j_cY5=in8P3T&jp{sO-_DwWnJ5+)$z6?Vj1)eWvSF58UxCuVGdw(U%48u(wd(J4X_lCp)+gP z#B`?*kGAD|?wA&VZ_7$Y;5^uF{%Azqj5dtF3C2-@d6?s6i3GJ#rf>EH(AI>zO?V&n z(FvTBrB1rTra6)_iA#`Kt9B5?bngyrLzINb0vVR2Ne24ulTY1jqi!OuXU`L` zK%?(+vD2;5XAgbKSielDO!7)h319QTuV{7Mw)cqT$f3b(A7^ODI6ZOMf z3~XGlr~`r9X&Of4PVpfeWpvv$Cq<6XoRZ@81+KrRvBniXQ5uYli^!~kqXBkbo>0bs zw&T#sF-46H9XUzv%y#8|U`z@F!0#rXt)1asIj&voZRxsXp71D*|C(D}-GGvTaX*0} z0ia|-cdS6iPB&OhZztrx2?9z?rAOrq#b9I>mK(7dwm#YW|JA4yK zZ7I$|@~hcRjCf+vBQs2+`78WtBNNF+R-wyIP!)uJ5jU?L?I@l)fAjILn30XLn=Qwr zZG6M*_|MN}8O+^+E(t^gpuPyc%)V6&N$~~P}als zdMI79*yyG6*XQ)D5brRDVe&vRCTyAc9e;{$;_QiakuMafq2Q255%YX!%1alCP!bR= zxr4UKwU+Syl%>1w`lKq(B@|r;moya_4psMTz+9QqV%eC@c_`g=UPBfF;o2V5=0pk- z;t4d?!-nPw)}Jz_3tcLdi==7J|K)b@Mo}7J4$$%Z+@z|k)ftNCdguNKTbom|B`yRr zu9Td-&=A#eK|tos%G#4Eoc!$3TR%ft)OnRCV!mu`@P+bg$t}}5{tgO)J`XT`mu`rO zh+q%O?U64o1HT;u;{9s|EDv-TJuNm|*5@x=Y{47f(E2|1YkcN0{{jKBN?f7ru%QCl z;~1+^L^!-VuKIzIN_Q)Dz~MmroN)S&cw-q>DrMd#iW*m`h*~xM>h62y3YAm(yxjC%jZiiZ!UOW*_dmqTWK*`$z>51YS252VaPM|3JsfnUBH$Cxi9f6 z_va9$D`%P4g~u&rcx%2OsZBm@q=96Nl~9`vhLV}W}njRk-&Z6r-aQ;pRt%P$O@n(vZlr8`7ntrh&^E#>j%T*v(q&jfR&j~j8N zpnpAj*5^?jC-W;JWh7VU!+}(0!W*|CP8bFRx~+?R=RJllHx?f9`<3tBHF7IRLkane 
zK83;;efOnn<1-jdqOPtwchXpr`BGXT`4&ANSstfziAST5 z?ald{mO&^n4}jk`4pHOBRKM9GbAV7A%TuVPP`ZyNZBs9$!VJEtJO-WwB=J@to-9Kl zUvMgvY~(sm;4qS5d)Mf5h7t9Rhr)IW-R(fja&a$!h|i=*OMr2Zobt6D9CyRL6hhs= zUXHDJEJ!FVP;h83l8Qy;&=MMv zc#BspoZHbVu(zA2Kgg&4IUNFGpN0ga-PHT|Yr`QViNemlshaHJ_5QnPgz}rWg(~N1 zx5HPED;9^z>WSHI>y_#7=J}B7Tee9gR^Q>;bA?xzwU~TqyT!@^alMC6Jn5yglP3o zA4nEw4L%3)j2VGQY<5na*Vq#;;P!pd|87C}y!uWBgm#@L%T4a7^Yr8Zcsn#Ki#`D4 z3}afhZCrFM(Fm$ct)8f`P~{xY1Q9~HaI1P#ahWI|-SInpC2)AS4eB4MzO7aoey7fX zF%HrwGSIPkL4ImUWhETzkP^S z>&sj6F2d9_KEJueJlS4?jbU;vz#{u12@rXY$;j{N66ziI zcGl_R^7Jd>Z(=vDx}0>>>F9!i^UsC{W^zicTc+lfY(lR^D|Fl`ouTSi>X?9AxkiQ& zVW|grJLT1EN-nN-U%Vvex&&i$s^<-3TpYi2Pu}Of&RIzF$B+rn7G3 zcq`W=LP{XQZeeV+{Jp*kP79GrBH?3JtR#{pCW;IG*rPbmHD)~c0L9a<9n(1?@12JN zkD*y_OKYkzKo4}$_Onz4Q-#7Wc2=kv- z>c_(UZ@K>a?poX%tUCKWBdcg>@)Q`kB#s=F8gLvuEvofXM5VHVXm-i`{alTdg=|$V z-CSkdZfWh;UDaHb9#759$4m^@{TY0I$?z*=*s5K4G`_p?+R0)*F8V5m;kA}u$2Z*G zjke!jA6ck5+648*AE@6;w_zL8eR`?$|FDhy;gwX7A9(U_Z^S;oZ8{B3w%ft0KKuNe zf7>$(htSD$*=MQ{ff3!uhD(3?`Mj;XLCnwPMmhX%|Lc#xmG==2^6~L`Pe+s8GYSUr zaB41-4_r^*KMP=e;vmAU<#W;DI3?3*t*>WR-)fdWcs_9^D)HIqyGF@FQRRmO=FYLI z{!;#dZuU``Rl%7C1A}rT+@V3_?S3Dd?wIo62}vkeSgkv0{xi$*=X2*bqR_kNr;I|X za7Et`g}C47QK`#TKmDi0`OklL2H8=k1&00~)+-ZsdPMVkk-xuS z_eJ4W5M4D2_{VDe@dEhIf0hb9u z@bI5+=>PnHe(cJK%T^x_F=*R$iLsjvygf4c`t@r?;&5sn3uYv~7Y|}@T?3+UDCLKXgN%q_|@&bCpkIHn9-^-U@lZyVO~K- zx@3fd!r=~C>l)8IySy2x2!}k@-U{uaR%XnrFv@PaKB*2rjRox5D6#wM5t2*pRrYs# zDh1ASMy?H#C0$%uXqC;M8V^q{Tp^k{LZkoJ=+MDK*Tp9Mj>u#C^AnmbzIg|6O)p@^ zF+a+zEr!!|%}l8-OL`&AR~>*m2ngjEvj>^heROw--!Mj-g9#qK@bx30-jb|zz-v6Z zD*!Q|)f0%-y%~bL3_$Q|v%4`PRfKAjw!%R;^c0dN6UeACL_*s3lzzfxulnRyEo%YY ztm1&MNxvMyN4>wOYnA7IFiog)x9!mrxFd3JHz_(;@|e8KqXr&yp-ymSyjpg!R$u9? 
zAtp(_G{sSkZK+b7icTra@dd>`2T{fuqI;sEQa}I+%t0?$uNh*GC^ZXIQ!oDZhz}l& zd)yWY1dk%{VXBWwuV}KpoB~Z5mdg_q==zYfkJ99zuK|Ns=*yRvNk~Z0)y}qTv5Bhp zBOIn5f2Y4wPuLZ+BcD8pY{jadse)%s*w{8z)65x~?^820&@VCXPC+{~+`_F_2GzSKo_&}}2e&F>aP<{jySk;P-s%A0L=V%+R_D?GvE z#tt&p4D%{`+%@R8^NLM+n**I8O{vgr!=|es2n4StvCTtBsJ6fCSpXK8SJE7Tdukg9 z5!^JoZ8hQL22I=YY;uVBg|U1O0uuzE1T;+)013JAOvnuVuIH8v?08OkInY@w;pd>p``$(wFQKHU*Obj@@zJWFlf%C2<#f^5s zEPE=*IEG|-9XO%%3WH|_kU2GG5%ugZxAHDDy1MU%RXJ@Af2zE=uyPx+&-Kzfd{E2* zs!*j|1%@^CcFMTtX3kcN7^U`A`qx%IM-w|2o

|XJ{l@Hn$-e>o z1krh4!C2VG-0O-(-E-r0=T=rMAlD-; zIbm;m=DGa?aTw&L8A}cG)CoRecAvZs5=&c!nhQeSzVl+kVSHMOB%ft{&Y?qE3%ZG6 zx45-BqORk=CMKHD5Y2E|TAJjaG~nMfWvItAjHOFv%Rjvu$7?~De|O6z>H0p$QA&(( z37I@vZuVa6@*IfCPmUxf2lHNsxHaD=cE>`EQd17lxeAgEbX|(x-ge8cAk{-oU6}kl zY@z4_%L&zCqPU5S*>ZFaLV1%e2d|U6+$X2DZAvP_gl2U&T4=sF~K%+|LU_ zuCkHOkDs6wXadL2LKq)ti#5AJQ$K-rH6_c$sMRZW!6 zX>1%LtyQGFDn%S#{QCScEtf}Mcu;Yk|EFgBSGAJpa;zsOK8o(z4dJ4~*(g;0TQTCm zIS*}klhfB;lx@R3CzEzHH(TB5D`SlFJzpbE+b6P9MR3sOZ-Ae4Dt9=dwN<((OI;1H zm|p-u<{K_Vs-1JThO zctUyn!kcFom({p%)J9)DC+&8(H>P(nCB|K5T0gWVA!~Bg>!XBCErH~q+mY_99qD7b zELcxqPelcS+T_zZYDkv`lGZLSks6Mmyf^WPg@#$wz**gbj z3v4ZkUbvm(YiG71aT|)M+H+^_IkbtF;3Ox{)Fw4;r${hKu{D^$YBc|B zzLS{)nWgL~oj)9jIFGAn9THHr%XMAG>swKfNBA<bJCU(9-c?tz{7kbD)~^# zMn^7`9VT6w-?znzP8!4fvXJfNsJd{l$HbV&IV?dKfM=@<`<`{pDPB@OnJ7}%*$#-j zU8W=xM@viYh6c_^Z!s+_2<^R6mt#5Db_*3iHr>Ee^55o9{XHS_$l}CY=JJTo!*gXKEdIVPuH|c54As`T zJlE>2t)dB2lYgwcU$f|qy)0kkq9xv2ajZ@D%PCp(OH7YNX|cuVP+kleZ7bng}FvMtt~LELST2|YHnh5kXc#=L^;3WUZdTM zeYN}Wvg`IUT8|fSvkbIlNRf%JP?C`sMc?H-`;vv%{s9D%x2BEs~j8jOqL zohd?#zuM5Q_JEfx;;3`Kpu)064&SMEl63->TuNLy3N&a*5_*;FOAY9UxsH|QlnNEJ3jautbfY7CP^W6sEs~0W$G~1&OOvkzp ztyvrdhz|B%FX-WcK}fW^*aww%n6DT5Pu!xuq@Vu2P|A}OhQYY_$>yhvrMDE~xw^X` z;emnHZPB}5SDlHsQK#6o0AeV*P6#?++41RMT+(rbl-trW3pG29$~Znp|1sh?EQ`i- z2axtvwGh-#33yh=cYUPksC7!{fTKbVAdtsImpDy(p%cQz&BK&29-K**5@zu0sj-#A zgu%%eXX$*t=*B6**$sPtE=?rwtV#pRPSp`DAmZ$~H@E^*gS|CMOq_lDyrkPiQJxmY z=lEE-<-4vj#0qo zyo7A-5Ng~t`<&}1!Ofdtq4PhFBqeW~HWDrTj1<>@)pE(#Aeg}NPEY+SwEte)?c9El z6>`YL2xSWwufeFbz|t!8T7-QZ?~S-~ZRb3#o{m!1_H4q$VB%1_O_sLBmk9}Ma6LcU z7%I?v<_3Cyc4!C}r@iqoT;otFUc7i`*P2zgc0`qo`S|hUGG3uVZ<}=AFo!H68+F(E zBMw7r%8$$In1Pxe{x)7JJq!|IseeTkskp=K<^5*X&Oeoc?3$#5b;T)OWVCe5n*nLg zg1{EfJ*@-|*Z9JCp4=HBT`>VR2RCLWx%xd0+4OhCR4uEn^1du30||OD8*P!w#)h9- z*co6YeQG|`PPx=78&g(&c&TmM5X^GsqMvf8pQlUNsXo@@%3CHY8XS$o#w3k(Jfzp( z{^bxISkT@(NZQi)M`%Gm^d_SY7alxMeJEKe3RBEqngA8dr&YCHqVR1?_DH*i-5fb? 
z;l{oas_?l|)*Y&6oL9QI`n9eg*G-eE9?sCvumxsnAbs{S9Ox}|y972*%CSvg^$dBFv0F$DTiVpAm_@Y#&=Y!cFydI(MdY-q92Cy^IeiQdoN3ju`^RoP z$Y-7wybX#*PIcVs-nq7ih5_y#J3?h}57v-IiX80UqMV~!WBJjX4A5mzW!F2i@H(j^ z+*-4<)K=r2`WB5}jWI^&E7(StK2n*ub$-9iCjgr->%$h3eCpaDLF@7T4gd1zE2onS z4oaRreBJq8F{97BXT=^nE{qHeH>!3lgP-*?8M{I=2;jgC+wVw87VTsg9UpJO&V(rd z?~IL&O_D=wM8p)nY)}*iqTuk|jB9oyUm8nS>fX@m(2wDr^4SzRoIXX(uH3d5xy2^@ z_1$r{aPV0*-~2?Q_wPXkrr-!?OKYeq!QYdOk2hc9c^m07qUa;?*Izy({_B-WDHiUH;bBf?oTPOFcD_cu z%O?z`kjz|QGo1oIdzycoOIFS%g|*vY;a5ifV7~eCUuD76FTDYFenWQp!MU5fJF(-P zo<5;ztwjfwb`PA|v(to%j^H?%JM2Z@CaQRmE)pk|X2{$>QD*Nf$R*swn*ixAHt&wo zJE?~vvmkH&3a5e7R+7LLaQn-_QT>w4>mnWIzCoM^Qou8PlPo%QYp z2p>q=jmO$?-w2Ji-(Fe@1$-Z>&1YDdy4@B~#Of0C;1Brtzsld=p<0v60Ep6!iN}uK zdDaQ^wBO6^&HlRJ0W~yCesFkoH;9_+dZy%)g#>=*LdSNP^3tr ztm#DhJ$R*SZX#hSV=mt%Jv?g5vVU8v z>f;7<7$WY(&ki)T=Ic1cgkiXYoS}>DuXDjw+CXvUI&4|J^t3GgI==c~(V3rMGwsU0 zOUPa{_tgG@RPB`%#+TPmL|&+oo&WAq$iD;Q({wHKHT1FHdfUTg2gQ_Xo&VOVeFRq7 z)v%7xr%4?Bz)q4gj`VsTyW_l8J{BYpzknj0lI&Ple)DxuvYb1gh)xna?hMECm8Uik z4UC|HqMzp2Ak!cRnZ;rQ`qb}^vAE^K7a?cJ(R*??2pP~N4Tl#Dh*hb&&f6Q?W|1r+ z4N(XSHG064NB$@+J(ip3YY3vMT6t`z<)R9%O>XqKngXk=-HL^A3Uhh&%b+m`M|95a zh{+fU>F2eZy%f`nW%JyS((2A6;bk>=KP717=()Ew-iTVp4iqEWf>5f) z`0X~%fPhfqEg(>mPNO9=Lr>6X+POow>QHni3La2`5$UNmBhW(AbBJ|;Sg)1*F0WWD z%OZ9;yrVo7JEr5hzOLh9UI)V(IB_pauOU%k;oQCpc@qOrompLwQ=U`J)M_ zsO)5Au<{7_Iw!G^t3ks*I~-GNQBlTKZg6;`yPBquYy0VWj6c_LVy}Xi=)hvt($At#?}AX zpMt5HdyS^_R8?G{0rOx%lE0T>~AkH);+7LlU?oCLWzLRqkjo_p(_4jhj&sR}q-T0olC7-QTSGA2d#uE+pzJO{2fsW|t} zFf0c*a5P2+qOTlLr%>Vm+$40|WdVS1^3Al>_U zKj_E--3Gb&3QQY)XN9dHV*ddZMvMNPi%Wt6OL+3un*ttB1pJs^dcw{`~pgx4AebMKYg&x z1M!{Tw!aC!1S*(ErKCx%k|%t9Z2he4EfGi~dsOJRGsWh)bilIw3n|+zi6SFoiALNC ztKWV=^*`C#;7dS~27AS77olWkvU@OShD^p6{_6rvOj0(0R28tPhP^u~K*|pZ?s^64u#4-z9u?Kf>r=?OaM8@Uxk&3qSw;@LCm}L{tRrW#YfH zBEMHw;ot&eG_brt{FkF?{`ri4|HO&d<0)!8mn(z*Ne6Pq^V(-s13-Uz3xBfTq14@8 z8w?w9p|P<{;Gv2_(lQiTSD=E91epCZh=adDO^_LW(+U|Ydr5@^0p9_X^mF~5hyN;g zn(V(Ut#IPdG3DU{sSxli1-_aq79()6Ns`Iim{VFOnUqe#HjP+awQFi^%?XD|wn67x 
z09+5mLQ#EGJ`2@B8Rc95@{-j*X~Pv}78aRf$B!E{_>kB@2triL)XdCG%Yw%`pYdpA ziN1k>^UMGwMpi>dha}EBZ*4Q>;kEKRO5!|Qa}JXLQb7m1gU7qkHQ)R!_-{ZDhfV;+0{rhJ~XIs16~gl&H8 zJ{nnBHI?ir2+U}KvFCU)tj}@Q8#t(M!(jAe*D!7E*CE#G1Z=4QWZ*-bCe(x#n6%jI z+smtD?y(ytU=3TI5!$PG_o@CULanQLBcmK2@6jGx06Fdj8@Pg>F31{H!8x`J{hEZh zxOk5T^suay+I9*bZuUC>J~+hscw+7b5mcE`Fn70kxxh zpHWHZnvYMA6qk)tWHaqYx22L^D5+c`Z!&v{BNW*6d@6M8<+dEI3++nGE3Po`>dDdG zG`wD0x5MJeVQo%1`zzUgmq3;Nk27{KU@LXbY+hLS&3y$BT=7so;^ zY}2cvMrY2CK&8=Vl=!qj1Bw8zNqcoAW`l~dyqK$*nq{|+Ki^AX{e8<+=TIL12NznM z{JI}#f~;aD!`kqL-OgI#Ld<)TXEpsHKX?L*O}89{n765dOWN)CAaI30NWM*V&zy^@{`?<{rTN>?j?XEYE${c&xf2 zN28=U8`>BEm4OZb>+DO!BiX4UK{cy2T%nU-T2e-KOK=Q4`!*RqB$KfqUUaV*q6f)GhSU(a|n>3^>%@Et=r%QngUbgG9(tYz&ulI+%mK_ zQDBkz49e(go~9b#HxasfoBh|id!PsT1}g3~+T}ii%qqeQqO;Z5kW8T2VrO1_Of8j$ z;&aE>Lm{$UJ^8wIeysE|F;^jyDR6+c_U`y1VkAW-L+djlR?;SoPpq;53pRur4%(EU%#e7MiUtAfFxNIj!H^}JtPB) zN(uDXj7+LIH!L-99EC8b+jehvt*kDZndvl{=g2>J)>(2tdu-3m;FvBfXxIIQz5(OR z-4Wf_wPg=4L9A79_=T)xNSl)eO7E>%t9Y5Hl^*C<2;`BkuL4nv9y{)qO(A@6G`(X)$a1Cd| z)f$frx8It}d8Ns&Y>ut{-dXJ?*2dTgf-yJ@c+poo z)IFg9Z4o^-J9o;Hc$j+Ti&~5@F{u={dsLKamCap+D)vK^|*j3K|}*yakv?5jJd5#qw}4L*dH56RtEj zlK?-u@{UeOXk2-X&|q1()mXz$;iX3(}HV+l&q`7Ni9)YUh=L7Vcu#fwV;% z;Lw}l1fU!kEjCsHgJDQ?DHbND`X8}C8)|jDIcQ5wlR8->map(>19tm8GAo-3LHuxU zcW?k5(bVOMy|FG~MbD#qg1y4^q5-nwsJc3O$WgO({OumqZl$z4q)H>`=JZ6n1h)e+ z779{zGPz<$yu(5kJ9XbmA6~L1*`M8=;%Ol*MZ2~sQTXaBl)@0ZK|wAm(wN}y;n^3& zD}!s zVX{3==2GpX76<$g9B?sQcZ;Zuxk*^KOs_3)n?Cff-N=m=hGajEE?jR?0~_^X@LgH_mjNB@`dQy= zncG9l8rr{(#g(EU;X)G52fgQZsPtRF|6#i8*|{(YT85ZWBqs4$Tom;UlNLBK7^sDT z66}a+H9?SwWNX>6cpJ>;EH;UYi zOX)pHUxBTrNm~2WKk3yA7-b7;GlKxq`$lO$leMF92qTOgF)9nudSGNS%{1xM*hm+i40C z1=T!YzTsty$c`N{?AVgHw?MR($Jm~h!p9{Q%2HLUYpHsCCg&w7eQ89i>dS-kPm0u9 z-qajULE-H(wO?F`yLg`Y$s30lXXAb!vO-V2)j-KS{M^8UeZ--{8%M6PmB?-cvZcE8 zGT+kJR)5SH_L7BLN?cw2so9QGR7{gN2gybYQlx>3hOi*%w9{~y_QWa@Q5ZYHFzO5} z)Zt*UVTKNCq{GmePAMC6VS0BL=x^B&fwc$4Z|Dqk)%(|heU&y#BPYWy9j3<|;4JDw za^oHVET$KU5U^GMM!>cp)yrF5;-_;(nbCe`UBbL%Mx9B~y6-yD8>~vTK(Xk+^|oj3 
z25`9o8A?@yNlIro16l}lUok>FxFg}uyo5A|;qq)_oLf)yNWgU%#y*nch^EunKIqSV z%NWeS%L|@+n$zuZIQ#tQ2u`owbHCNb2(3DW@Yl-5Y8924)@#MQRH1`!-fU#`mpuO# z*QdZc!Xu9^@=r4uuJU*`r>+{_)UBhSx}JuzhZ5N#YVL##0VU;-|9N65f=!M`M6y&? zmEpKjLx+oG)v#Ptsbf5A1$}A3Ga{*e2XEPdfA8e}0JD3i9D5_E7ZSD^V~^wFsA-6(OWZ;xESdopYVu-Db=4cm2w7?x~Wy{ zo=yuE`$E%h+N%|pnlfxx}7O6@MMY9MMQfs>GYJ;XLt0dFhc@RRSrPNpM-tvPLe zdg+@>^6^9^CcVp*{lfJ~y587&07|a=G4Y`q|D!px#YwwKBC)(RNi0b6q53yF&7vd# znOkXWet!$>-{xom?*%o)bu}KY_V~!?%ei?Al=g392u8h3jnS7&6!Fspv0L}|jqj)U z8!HdPc0^@kN$ytlwBFR%-ddG0BI7SmHfSn|Ir)q!+uUL(Yk7l><8)x8h%P&A4&2rR){sSRpUZb0b4Xv-t16vuYX+ z=9$B9UVi(vkj22BF-tdJ?@AMXT(;DsazJ%GG7n#Ua$njg_}#X1@6EO4H{}g=+_-YbI|u*t=aW8_wmmFtiPAjzC2u@m4Kot6!RtDZbZp1 zFRU2SpaVsb0hS{$VK8Yh$!uqC&Ws9zTTam2l81nC=Z*fl-Km9lnjurW#xFuhxs4J^ z!^vNvwu&a-67Y%~&l{Bx@}tA)qqz1VtwVEmK|^Z@6+2!;J&>-%H1ychqH$~I@$gpE z?!zu1_`aWV#%nebA1F#_3DDyIGVYpB#lAp$v^j=%AzElHU z4L(b~&q8r^OC0H?1_gB;*oMe&&5z$I*UO*#?vlEJW}G%VmrlAfBuXYGX zX8w{j;#}a#IV13aSL_AZt1K{0cb69x7ABRh$Y~lD?TBXojJuUw5@e7-iZ(h&7yI&R zBzIHJ*%m^sZk^7tW5-&JcSMdA1nU8-J60?Wq)yIjNMcWtic^&5V8VpU(2+1~EQIa3 z?QWSARU%%(Y;=w~N0;?8%3)S2Z;H|dgwg%&!h7bmw5hZLFoGS5hj zRr1_;?LBrI?0s?H%H6xEc~zxMB}ymvyS}1{?n!&0K>>f_X&h0GbTlnfd1*k&o`0TU zdi086T!G%L=~sGlCeK?)$&%_ak^b(?u=2NqWLaIVIo=1L;faJm1eI?XJ>IC`Oqj5n zHV?!hJrojR()c75$ZAiUjf#YE3$#h=_k6=)&>ZV-t5m7Ej!wKb8MAqPxKMqwP_8Dl z8U_GHdvjMlB950l_N$-KPcy*}^4Ox|txtwvkAyQWBCISH(3>lZHPAQ;v7L<}oe^V_ z6Ci}!!Nd=~6r~68Q1FbEdK!#XJCC8ja0yZb|2CVMJ&)g;t?m8>7aNkV&h09?@vXQ*BCaQYnnZ`w~z>}8qJZz`+#_f z(g({pc?}#i3D?1;`~E~0q-9%^wU=mX(P!2-bHBJ6^PVnBOL6+XNpBV%eVm=5XbtqQ zvF-)Z*5|rzJ?9hL+qTUwgHGnX<|*B#j*e5~DvyRClw_^l9-@t%aL@CYwaVYNvT`Z~ zpfr~$-Ot%o(zEb3#<1-2gxYVGFWMm4KEgLf>ujc)7vD>avtl-zTR1S3!q4udcnOw) z{(B{F2Le>G%kr2Y9=f1x2VPVs+-}4oBl-KX9eb`D)`*W@4TPaP&stsJRIaYSv$Yp{ z(dxHbw;+o88Xj=px!+#Gt=7v0&jj-q(u(Y?0FHd>az??X%G3SKA| zRrAc=6b4O07hmbj;eyriVO*)H5ox()x2z>B=6^ZkY);>LOcajP76)sWo0-sH5%1zL zA!HYS9m*>fWLg;jW~mj!!Os2-3FNMRqh;T#az=`j&O8x6RA#wQvIBD-=5y>YD4 z$eDd$_Cgu3Jp&l1=0+) 
zW!S5W2%wxn?J*U|(Z0CA-BBd8$e^)yb@(Bup08DfkM;?gI}>`<@L99UmfQiMq&GL( zAnSq!L5J1WVHV43-&u9*ids?rvs6sxr~OnW06|&L0=4yf#Gg`x*kohO3@hKLzl1@r z7HjmIA!I|{UT7f*M%HnRVQdr1{RYr-wO-}dp>~j!4ug9@Tv(X+9#>VQ$Dp*khDg!G z;0_WKn;3+9`lIm8V=WFK|7@lb?ryuj%<}TE=Sb0H71vn7T)fypnUdo+DO2|JlCh_S zUe3k0U1doxRZDkR*+gSMDoOXzjjG?ucXENcN@phLUzq*>`IT_R5t^Gy)uS+WfgM2B zuq6aKOcJ3ko!x}fVGi{WGsM~{K>_-oY!GGK=159_!3;^drL*V1#XL&SNwRc-ROA&K zNHbvb4KD4PszpF~wMaNU+xjqX?p#$;ZO{)k~ne&!CE}W#qH| zD$1c6x7mvNb)eEdTdj^=6+n2wF6ryoQDB5E19=56q_eolZPRgZC~;;;6y3%2!-Xii z@T?92yLa3C>;+~(SNF5^9=F-RCAj)65^G60_?JGM{V2m>s4f+Er{i4QQyJgrPzAul zdpW4|(oP*}KGT^{MNOWq^axw*hXh}XhQZ0t{5Sy@44Q8(S;&BIYHN#ta2N5N+TH~3 zT`y6AC{wFdM~Uup{&c7g1`5yhM4$rZW%bhz9Rb$}LQv zx^rydQU5k!jMcoz`0M(Z1OD0|dagNXF67y%i=ouU4IOH8a659>YHy1i8tyEKGw(RF zFBbZf+Y_M*ceRSKyozb+t`m2FKrnqxdwy?Lr9qG)lv3$574pyp%0T@ipIL zyi&SaSN*F+#Hz3)$AkJrt78chi`kLM7{jzFNg{U&q)$`gZtaNhe$hKJ-e)Gin?z+# zdXk1AfRb&^0r$Y%Dpc0M^U+whIt zZjjjAhA}!iyD*4@bm%)z&qva;8w=xz)d>)nb&y%dh|E>u@{UU@Oj)L@#{>PF=jX7t z!PxS?ujoR#PC|CN0GrgIW=&yVR>F+F69O3`divie*)fOj>k{8f)w;@(L|(7CYCCS# zS=PZ2Ea4aF`O$+5=pQfC#-~;nn?s011#e^smLsU>fU#ZLO4`=ZjFC_+OB?(0uEMm2 zV8m1vc}|8yv&y>kcD3P@>L9y)0S&_lPUX}o*`aOBWA)|5!%!ywci6sCIH)t6P3$G7 z#6rx;>`E_`TysZ96oORt<+JU6bIF;!2=!OfyR-53^nGg-D)~s)L7fUsLri8b=l$#~ ziZ~b9^vSLRsSqaB49V=07m`qqyIQ4U$fSw$IO{WtjYl~zNf8%=c!>^4u)(;(n5b(5 zjPVay;xuFvsK~6|dYh!4>{)tP-B0Ne<0SBY7;qEkV*_TA)ZxVr<`~NXdOaH6ok@jz z7Yv9}wnbhV5b(d9J`wg8X!3vly|oa3SrmTtMWhl`8ys!@&_nBwPsbH{zHblBa^2D% zcE0d-KaZ}&S)k~Q!|i?*?xpZMm#A!iujTVTx9h*Uxcv$n_(3ZdWt+4Zv9_A2POp%>;4AIdchvvyF8SZ}{`YR*|6T9@tGND7l>+tu|DPf_<&$!NBqLoX z?rT`8?2ZenwTc`6V$f1nCUc2n0LxOpqF%AS{_lFikp`N}hC=h@llrrrZ%(W6*<+QR zVKUp5HMNKf5XDdkRDKpa%}0gCLyCloyvKnj4kKD8^+bmv8sHRIlXlxTa4A-6F@8I9 zKiZc2EZ%@RO6BX+{6bm-Q3o4fwT3VD>;ip)TcNZ2SKthzfGSscCiRv2-`1k}Cm0WN zB2k%tvSx_nU9{K(A8?LjI%?{$Gy;D%v)~pt<)TJ6q&)FZ_aSG zt>n3G#k7sNh~LKa`h>;oW&7>tdL$DGy|#@P_X>%EK4qTCiEv-lAEDNyRXfSFg#@17i= z0)&AOBe45@5)Gl}mcw~Q2jW=tJ_cqgiaTgAH}W%OE^684H{V0XsJ#TRKy7u8V#itG 
z!2^cl=a^qWB!WODz4Ypgs8|q3CglY~!%gvNXx~6X)nDueksb!<6bb9KGn)2la{Uek zLDnUrDeXJEn}K9oN8?HigBMbQ>%T;R|gWNC$6N$YyqkfP9Yd%dMY_yml`D<3+2m z2fU>Uw{TN15)W{CGT7kRjw_Ome~95p{p#I2kX4UZ;@|V=eLSIFs4t=1jLh~bvlwmV znB3daPKW|C)MpSr{9K}~&WCsoq+DDo&Bm(L#{k63=tZV`u2dO<5u~a+(%qD%zF?M%!%LWS#0;FHvc+=WlRUST= zneXFqKSR6f^Fw=G3QLu`^i|*Ij8|$?8$iF{Q2_>ke6t1cFs=6T5>l?Dlmx;IyX|N-G_+=tuMiK&ntgd z=yDF*c7!VEPE>~;1*;?pXYuz(+*dZ21KZiQ_&d*F%(@MXg)#srA5nx}!w;R^xUKMV zQhJEwQdadOeEk zl}4O&OC}3`f&cvpbHaJu=3w>gtgknyltM`qztr|yDCkm?&0Qz(p4$mFIf};i7i3Fl zn>yE7d9c026p79*a_;r zC3GLmbvG8&O0iEeQ0hBHzYV$k1Ruw3cT7=s14kDOL!G_8?cBFX{lXqGP>t`d@p!65 z6y`P9#B?!L8@-LJO|u4=Jhor-T?no;TdKHLa#*d-W2e5u4~;cRr2d0_+B*vg0#m8s zP%+cu%3cf@R|)k0a!KBRgwr&%)+*x(X7u_zHzxfE=zXXbe)T0KV^6rh>Y)540yz3u z%1BB!9W$NJ{9k@O+saA2c~rK|Y!y;ieh9f%{J2b?o@;xjjJ9*weEtCH2cJ*}-iBGvD^KsU=ktywoQ>dW z*VnN%b_W~YkBlFUh?I-A1UvJno(8q#_LF9#EW>GRD|s1Rx3%e5im;>%&cp^J-q){o zeoKlEb#;q9c3uHB%+GcU(>X?5xQoACD70ELGVy^M&bF7+b-W^~HLK1Kr2%ui zVZ?prTqiTj?!?J9ZoQ)Nt0T@7_Fu)6iijAZtJ1T zyFAEoip2#%H#9iIwt2pC$@+VMfHPlnw5+~BZ{ARbI{NfhDt^9?PgMgs}Q0DK@r`n8LQUUN*YudlO!E}l66#IqevNeHI7e}A>;qnw90T9Ey1=`CU2Z#&Trcjkxh zMha}?$@^>k$ld%+kt;i+VfGMj3Nu}9;xgS1vhj7?s<9+2a*h34V?&75Q}|j}!}{kB zY|X)}$Tr-m$sZoq0u!}9e0C3ebK$0E(ay^*muKOiz~2tzWZJh&6Fa@nOQI=YwQ>tF zjk1&JHhxz_zW8&_Pc zkmRx)Z)(}$Qs!gVy|7*&X=K+m>G-e`I}goGOV5@*dbTAnI5Zr6V8s_M8WRpejHJ$h zT5nj%y2bS`929?bhanct7zksaobzl0j~%w?KCGwkuyW%)FnWJ zTcM^kkpNuQa-qXj6W3~=V|VBRD6F}88Wpp1wQ1o*rpbjp25?lO((DYmNz(U##eqH< z=^ij?#{jJFt70^$a|z<+9D9HA2jDsA2lhIzFJKV>nqxjy62y^z&IaggCxaDVXWQ|^ z=zD;MB4v#Yt&wwx`_ZI+q zqO1&wN0Z|aw!kA>g+pVZ>9CR6e&mtM-Av5`ShUBLvUl_E;OV<9W27^3jVr#2ulOM6 z#Rf^Fj69P4z~*uoJBW-sCnFk2ECpn4iKGJOdN-JzT1c@|I^mGcquu-rUf73qknjxM z%6XVqnf{?aMZ^6^GC=ojL3JZi@UjCwjEvB}0;FpK$)ITbZuAaa^p9k{># zm6;5LBsU3%7w20woi@jtoTp9J#Z3(rFnZ1${9Fo`yKm#KGaz>yNW3^yu(ftzq#^gAblsV7TMkfIL_6v+7m^6NdN44oua0=X zjTszqhnZ+vc;&{Win8V^TIiI_|FUMkd|xAM7r9=xzHG+c`wBV8xC z0zfA^aCcBhbvo7-%6%*a1F~^?HRJ~>fuN18O9kALiVmv#h;tooqL}+1R$R~LHclc` 
z^E)ah2lt%kCIHSB`@7D=@aNRO#GyKT%`DoLf9}%ATUXjjx2M}yc;vsM#aKLPG6PS1 zFw>hopSx?31ff3d47FoHvwoz#`+uOpsMFY{~11KBbzSA?PlGPX& zJwLhuddYoGh2;*h>TvJ}Y{kk0xW~C2DeKLpeDzGOwSLo7+gXi8d^Vt4?>xMS2h0T= zuswXr)W7Osl|}}9lQ1|aV?F~UnW(6pUye|FKaL|?E0t2D%$2e`<)YKsySryJnlO}` zMyQ-|6fGw$GDDY^@sD{Kqd+VUzz#Z}^iN!u7!OkiB zHrP40?(}X8FP*=*Kb4!Wdzjr*ZjZuM9$>S;=dU$fV?G5nv0ljbtWp4|4<l!W|F3QtkIW z!rO?d3s-J|Zo40l6~PSbG0yy?_8k!JuS7|*AYU6Ek#Qc?kI1;aQ7ap&qX)eCyewhF z4Do%Z_tmI5?pRStG}Fe%Kjv4*YoXkc#ifsvVLh5=Xq={ zKW0I8<18#sPW$37edR;1&JXr|MqEV;qVczn_N{zhoWYD_LI>d}3S31S_8+4^?t{$1 z4=$Cadq&z1M@O`ZQFH$?cwLRDeeL$iklUGAz4?`y`QF?$Dvmvnvy)|IuVzr>cgFaLD!GGS9EoGUX?qvU2-TMI@fe?W*wGm73J{<9$~jojWdJtGsJaGF_Qp zO>qugM1MSfhrW23F~3{6Sw$B;$7VnloQDQa*gsUrwY?3h++&xf618`bg}3(Jre60_ z9o;ng429K&IwBe=!VMA_Qmc5!l&Y9-P-HK#r3z9o-}=@ebcVm0(W zE!+1M*^6J=hSuY4*E|f;$VPHEP1+Azz}@Xs#H_XzPa)N}N}p}OFIITld1>|eC*(z+!ITd2RW=EN zT2XfM5yk|#W+JVACZVnDzzxTRolEwzd_Yz|@Mye%+RrECRJb-KRemkt zo@%wsFW-!PPW_k4TpI?Upj@=o*G0=ztLL(welR8RX0vHedbn+`o!q$aH1F_R|19U( zoRfihe7MgP() z5^wO0-U-q5>3NotfpjquOf`2|LXCyp@JtIc3CAGaTN?4w)|LzbaTdcMke?QmM%3z-dW{}CTZ3>E>tMpB}u{=ROS|}sb zfnuVn0Pwfrshqh~?t%g{GY^Y>+#K*u>Mw(8+L2!`!8*9?=_97Me8nGsD!tG4Pxf}I+kvS8^QUw8uK+yRr~rxLt4!~R^vyL1oSS8QPa6Q zu4*DPWzyt1ijy~4;s$~d&2j=9MbQ53Ya_8>HxXOMmFxnL|Wxo{|6ggMLy4X^~MF*xez$85rmJG!=BDvUDT z{0x99#kpL}s~8sg;AME7_Ae{*5dAXanrOR)y9yw5RYCZV5w9N3a92?TLg9gJ=5wrt zIC=7q58?Os**^JmSyW!*S-0oHyeXk(mg6f?a65&x`8}XER;9pXy348PJT^mDZQ)5C zn=|LI?>rwqK10wMes5#5uygdC>{wM}9FJ(Xr9!$&$MQMV`BdBHv?@p|hw5FJ#{RX_ zcaNj<)`hkQo_c0sbtlDU}R#1b&R-N#13TK}pr&KSYqb)*}NB2mcJ9I-Si-%w{IWSQPsO^ z1^xNW8fFX>`dzn~5nY1DHs+xzr)13j=eQ~pg|~7{H{MlVJ(pR&5NC4YmemsHh4Z6B zR0h^7i+B^>K2Ck^-9=7ir79Pan>@b7MlXL?rkTfi%n`mbo@Jl|NBstfcgIVt#JT~_ zkrC7w@xfqrf^muan=xR=19N$Ed|XkhlU5y3N_;22>~*HlG^o4BZIUb^X)rX-*xM6< zd;e}fs-Am2P88|fqaYDOP0P4 zvYrAUmp z0MZWmNtW@I`#Dh~Z-_i7N=M@p*KKEd>uf&z&0hZJbNBm){WNqhV$+*mD(ZkYZvqr| z02P*aPCAmYRgYQn5uF;Vky8t3d{ON97Wr5sVG5>Pq^CF zrky_LYGxj@G&gKZ2pQdu8CqCzZRJ`pgPaNDhp4GQNu3s*?<}E43ktu(Izk<#Xq>J64`!$eVf) 
zbc*JncGjArbpGnVddpr_dt-MCc)y+Zu;xTvTb^q#3}*v~6=NYSO4~2!0l3=w$#PM} zKe8oblN!@f=2Z}oR)>9`hF)n`}0j61o=uWHw0_VwKQG!Rr(;PyNe|$iR-@W zvK-uh{XSJday;wM(^0v#RH(6F!=-_J|G5Uq51WKxGtz)Iu&u0efcccF8Kv~WDNjTi z@y)fsEFWFi0!5f*8dTZx_kq~Mt^LU1cyQ7VO1V8)SZA_|-oi^tPNP;V2x_hSa2?vr=k1B`HTc|8fMLoem^Q9M=4@~>i z%g(O?xs(1odH-Z+gnE=N2o@l-L|gaFkjw()JlP*g9dnXGdv2_`K-X~G%EH(y(6z1Bd|R%lqi ztQK$1RNdjqdnfAa?v5uR5Fqmf12uT&2U!{Jn&E3Vw;)mnE)-tKWPvy}8ZwAvlG;fa zq9k^g>qlpkF~o|Z!DQ0KgMap_OK=EAyLI%cO^>j}ZncoI^)*FT5d=)1{Id7;Q$p(h zdhE-5`TK7cz*9n?f0taCdLOVbTJuL#tIJghQP-KEqN7Lqn$k-$}$(x2e2BX`LQ6kFv&bv1^XQDoTlHR$zbusLY5 zu*7Dm=NFZ~4IZ|LK)?Xw(RUziESnNyKwU7OVKdOQG7zjOVjQ}(6?v@=MGXL1nX@M_ z+C}~jupG!!blI_dAA2rL4|jTZijV4jT{lzu&T1#z(QIG!q3Vxuf#%ud(b!q!^dZz` zzIWsAtq+9OF-R$%Kyrm3BxnjhxzrgY5{%F5?K}$Oye=K*+^^L%)f`tmFbiGKfn$Kx zl1}=SpaOwsUxrz>aiix96e?lL<>jb2e_hjrK?hy<>PYF_9~}q4tBYlKD7V*uIQpfD9h-0K8tpdT^v1_qG1!9UO6_ zkL)T9^qLMo<0b2Jy;sNIfQ4WsRV=CXWTUX-*9l=~ z^G6Q#0Mm`NgXRK>uW0dTk|B8Y3@8tOnBc@9>#u3x9KpCRuVsRH$Q~gX|8J?JxsUwzkc4YU6${K29%8ul;4L z{OD<4G-EcDf{(_2H2L+@l#^N1Iqwa;bBMlJ;dJ@+mM+`*cq>bPl`9HLJ$bnMoc>pq ziF_1sxswN95jp;c!n!o)E;E5YvOB8HMPgyk(E}AR2sE^Zi0KU#u0d?^BHg*rQ!*43 zxx8zUXAn0da2sNXr#A(R7$FYk$J~h9LT^t0h!-~S^KuZ?+qkC4BVD>FBac33;-Fij z?m6j2ZLJ&P+wI|j_>nOWvf%rOEDHKpxz*m^y0Q%;dt!Z`=HmCV_im`t{=Dum+~qJK zJR+{3)SYz2{&Rfy$UK{=K$}+G&J^j8FdAjeZ}iqav3uCb@^)>|E1b>WcH}9ZN=OZTpYK$4H5) zOuomN46mN zxJ#N{!kADGze`%F>SGsilG2}Dzty7Ua9q9ouG_ev^vMfI?Etd3Llm9_g~)Q_H1>go zKlWeS892kGaeZcBc$ixl0TSy{1|Vvf_?m*p{w@k<*#sZ*EGWDoDt%cJ@j6m?-R;S6 z(+K2rl@!6Tb=@{u-uYB@X1|lrgdH(1CHoz)HS{=+NP{p#M!V7vlv68 zS;7~6hIZ76p&wLXrrxpIErY7>hTq59cQ%hsLhP=;F?MknjA!4{1Ai4X6B=f+imNBm zom|c;?DTEr;zkU;v$;z-{Y#37XYIKTx=^!Hdp38O_Zm+)Ps;YM28}koAr8VQ=s5`~ z=?5=R(ZIGd?F$E-f3Lj2`s+I&OG4jSN;B@uL?rr$3cbF{M&RJ$T#UXT&O&GV`#z(j@T7LjR+d=8zN0X ziNv^?(K!W2(}f$P!kCy{jHEgnNd_-*ZG`t_F}X+B%Z=CHb+yraqZ4d6I)DFKW<4FW zC2ah?I^EV>DJVMAe^w=2`O5=CKkGLDiq$b(g?E0+5#8#}kg+jhWViDUZOp_HCtTdy zSbw(Y-KC0m4^%l&mF`4<5S#SDh~RLK>-KgJ@gbJ3VONZA@b#pIZ>+k_cHg9#_-@~) 
z^u%O+$=eYhT-2>riWS=hOzjl`GJ@#o1%Q>I!1))^T5;aTBqNdCg7YmXjv@RJqM>eR zc^T6)|3t}Vot8R4?wwmsQ5X_k`izH^+gu=?&Y%DO+C4*NOK=Sn2H^6T zUGdJhP^IJ^`L&OvqRop!7F+=AwNPHU@5&7m(_#+6rXK4*nCW@WAYKx){BFlG^YcVy z#V_jE{-#ys{x92J{5?zfcx-PvYrWcJ$TDW<2vtmL@B+BoE*Pw4)2wRc?*HVfyfd_% z<;Kr1n1=1*m=YMZF9j?p7xEP-E*Dm`(=4&*p|lH*#6 zhdDh!*2|ty+4EgZlVr3?=5}2jtncf-cCJL|-+%u{Pd}iAb z9ZzqiO9HhNeMh`|>mVx|N~g?8waW1_{?Z#9nS0^8ttMRZZNuLfgT3&|{|Y%h2LDSd zCmDnfX7?h1gR}JO4aIL-J zvAAKi!`TOICLBWwnlisai&4E`thFge-n}72cwo4IZL{ShvWB)qW042AcQ6X19B(qNk0jBHgm9PirlA zEuYauhqZFT%!MlIBdf9}`mPSlaR}52!I{k^CLhKqnxP#j@s_(mGptq3Src0HqocDs z$$Uoxdzj@COtBrqHUYw2s>yp;@oRZyNlK^-d@d zS<3{NcB7l&6Y{43c1|EKJUD!2*s+|mH{h1D{@t!={pVb?-`|3wCM&gS^Y~a6@A!b# zt#)^Az&a&_n{BEiydSWB+Tprx#%YGV>={l)?xcN@KJ}d#+HOt7h&yWJ@p}qxLR9E& z%ChZF0Z~(NcszO@Q=@g^Mp_Ivzlcmh0_Po3fy<4nyty;*`nN2DMC*ZWM(NS{)kjWA z4yHO2FNl=K<5+RXmIh@EZ)j@hbQ#4?=W$M7Z|>F^n9g5XKvnpyeu%aEzcaToO`gsL zXDIj8v2JvVuvItawdEY0Z6V{yFKm~_Oq6z4)^T6yR?H=d6Zm^Dg_%)o)@a)w*~=5h zO|mL|L4y!=a3;^hG*@}Z#B`wdCtv2FL4Pk(zO~7)KthGJVYrW0lh+eAL4~N29n|UR znp_hu?RG_a?Dng_{gex&Ie8VV<<9h^4N>hQybvwMP-nqH$u|DouC_32JE`IRbfO8l z+^Xg|>G^N4XSY>d*mbKib5-rM$>qqH-F4A~(q(C-ox*q+V^~PF?H!aCccq`&nv?3e zAHJjw0Q5+%t^<%iwbG<}hL~Wl)m+jbbeqqc-c;3O>Z@%<6^aO3Oq{4zs)WlH)rMZb zQ(*m%pHvhYUK2pK@|B_L#iJS6xNadI)XB2MjtUcPelq_3c3w7zBbm?6R(-8b{*BUS zi_ab@ylopv8|$xnkBC(5a}1Zp4uSliy5x<^pVy7!7m$b;0iR||419+O@tr`#*bF`7 zs69L$%Xerc7}D^kNu#RGy#;04R9j!MN9=n4tIg43YE)0_*vBV z)J%X?4!%r&e#t0W6Ze zHTD7a<_~AjzE~OkRFH=b6nt2&8$?_7_T~PPVZP0?cA>ZVoYGS05^4uh6|3d5_x&;r zKPD&L(zEix_BkaxPx8fg$!q8qU*L0JdBt-Z8+TEqn4V!ufLqR$(Tr7&8?Oh1JYNit z0?LFw<4BR`P&HIJLlX6ZCrYdOK!7(i>=wTJ+C0@E=yQ0u`mwOD;2EthQ$wPXLvlz} zm1KFp+aK8%L~Pq(^+MeShf|nCL>taT`I_y`kcOD1W@neZT2W*xG7U5@+@*t)6JqN~!TgR?);pQepGXRs9atF=O= z$vtL^qr`tZ4SR_}x>#OHqVo(X^f~DzL5TU3iw#ad_CE1Yk`Y#E`S%5#IdLWK!RMTJgv#q&KEVoa|T;>r41zpTeFNFW>T;jQL zark`LO=VJ^@M5Y}H%;S*e7U0d0$G86MMT6|_=X2Dc3s}E={IVDosX4Y1TX_)(o$|D 
zI=AaRz+oAPyq*N~%Vox68$=#K5I3g*tXmH6sD>)hG2S#o+7gi{y)&PID-|YZ$jdcIY-YpvkBh7@lb{U`r>uV))}375476`wK@rUG)*G%f=-7|I|315*C0_ zqG^?NL@m?dZA?3BlHIc5J;TvFTnQRX$RN%} zJ)u*x9+PAe{QJLXotM%?Y@zqzvNtraEtqK3Z(JroD~XgFuo=+u>Q?`uPYf&vrkuJE#D%P$CY6Try;uKgZ}@ z-8{m-v7!jI3i316zH(%w^K=y2_rf|;=M+cDo!e}omSklc07b~q^!6M^4QJ~bAaZ1sdVDiSd7&Jc6FW8{vNGUpkVDxCWG50*n5l@z%;g% zkP9dG5-qz4E=BG(1hj75O3^}J9(Wx8^>SAlU3o-n^JD&EKhG1Zc2tvU(u(l^*TLn_ zy~f`}X>8uU^ma)$9$1bnqSeO4)$ljnO-Cv!C>Uh&bDRn8`A0t01e#E1>xSl6&(Pvv z4fb|Lm1wg0vl7xUu{HWZ-Yf+TdSiv&!-$Cy^Z9r`M#_LK1S6s247hpMzDl0xpW2wh6<`X zi@D0&&0U^&x~^YlmJO}a$Y-ZU`-|5S*I zU$wFSeoL~K_(T1pX*^9@pReU@euu)2~+5K-X;JS;-LKI_JA0ZOx_%hT;YG`MHP(xw{i$xmeXdwcyP|Y;~wr| z4+F@BOT=f1v|MNB-)1ySdGvfHRs_h92(8WsyBC#&?JdCMoeo|{Etf-Y2`0(>2yr0* z=r`n`nThEP$SnlNM?Jy(`R|7sI}%S!jZjKLX;r?Tc|yNkMCDl=-P(nkqZ+kI;=FI~ zE0zXUGD{8obVXE_1wJ7kDW0QcjVVW!l-5vEoCc8%oca?SPyH5l!4XAz&hh)cvWd0& zbm)pN@-r74+_gxkp>DamHfYa?geR(2c)ZG-6j?hmcwY>xDmDN_j%^skIaww-NW zvp%yuvOvO9v2_fO7sTDCd)cr|+H_aOMY5gm35tp1vkNH_I*=MIRf;iO1lXH_xIO@q zBREby-%rN&O0^%AQet(fpBvErqsqNVZ)6^Xcaq;oRRS3*^XfBT&(btS^>u&`T9?l& z{X(Z;E&Znt&}FZgy@U#vpTvH3=1QNTjazGuaC&9m`pD%IJpk+&&x8&XssAJ-GxmwO z$<-wpas$l8-LeRQjQvZ@V@^B57GP@@!SfhN;vzaB?c7pPj=SW}8fiwp>rG1l`)+n` zv9LpQ5GS1);M~;Ebhg#{xJSzna8;Kjy(1)^FmoM_x%p6CmC)i`6x?2`f!jpsM_tE5 z_5xwo8?7MD@ySg0p{*tCxg8EGO*Qx2Zw9qxi_c@sk_CoNj3G5s%@^u0!W!w>W_Yvx zQMa=YTASU0UK~|2lW#8#w*@tdB1*a$d3p5;1Z>=`9{+^x_p3{(RC9-e^2iagx8No; zZY+XQr@?1)zd4YCwpn!;-nmhUNX3k1OVUN~chOwoEjTqLV?#r+zA3A;vCOq+><_I>^5DSuCz+7ch^=tvSfG1y1ob>$&d2%Z!M+GbXhOBn(4EpF25uz33!z$eQ-T_0q_-M`gBE zI3;>t1N-vlQgsnE0kSvLlSdQ26WduErEQev6uU1t^0kqhp4mL(Z*0mI)lSM{rI{H` zJWEU&iC!k9Vn>vVYDd(c=5Qu(oPb_;6Z89S^yBF>Z3{n`3v361245=O$tJtlQd~%m zE439KL6|Wz@}N1A^03*o`#*7C=Z!UI&nWCo_=LE#YMt#o4eV2$(0NjrJto?zn z?7jn`g_AD+&~)|${17ak21DruSIpOdEy0SZN+UWgw6fou~w+b#fC z$K@J;qBG7aVJv=(S^!uTCC5)KW&vY@J0LOL7ft2|Fff}Nk)Y?_3CcwP0Lx?Y&Z8o= zls>?@`4R9fi6`>rQGM{dA80J@SHHz?#XuuHe336K$)>-6H z*x5YKnZodFS4>N`jT@{RY^S}+vVIfo!<17Ku}Pw+0{xC0Yq>CE@%wAI;I 
zSN6YOl+)LMTRi#|_?B|CeYNPo@)x%#(A!}?nxoi9>`b-Q%*gx?px-(HVcl>hFDK=} zAsfAw;+(O3%CNBl1Wqym;3@M?5qj}sv?J9kF1zGaSr66+Aj?RzXCSNm6ljgL?PzKU z=-8&cBBt$-|LLJI$18Lv{GmA8y?s7JNkSX;|9@miZ4FAAy>!z{`1X*9?( zNj+h*wBc{s%%EJ#LuC_R0x#byn_-*yl-&s%@hFw9Y`mXC&wdOSVae7tu2(#n#PPH) zQL@?KJL7&i<8a(XM09&#&Mt@?UpO<$1U10wYS3b-{qR}jd!2Om z%(HVNzA%m`5F~UddovgDhLz6^0);KhG6QhV(TC)-zuw-bANt+(lMe-of+U2(POu6a#+ELIQRCLAxECwyb;v+AhnrnsM^nuLG7}i|1tZ)dnKAV-hF9Nt=U4DNp8Hy(4lU(s z1^Q)FyQ_uXWLHB8S~`G;kD{m@xB3BT*m(u1Tg~69VJn@qVH&YYOVRzug!7cpo=C=# zR80*snZx!BB*wWHD7-RNL?IlD-}-qOF`n1V^*-WSe#eh6Lbuo0 z^L=0z!|Jrv);7@}*47DSk0`X0$?|&pV6h7Gux0xQ6P}QJ2@+`bcKpmq`o%b2y8L@B zX^e&;d`F7{<3w=&U74eg5XgZ52LwNfogC&{u|_a@Ej9x@OFJHz)$P1hc|TAQv1zdC z4U7g#g7dK_7f{*A9uJ4ewub0`XG-fXn>BL7nMYqP_BEMQaA|{D7`ih)^6oi*Z|qYk zigB_F?9R6&eD3F-z?{$$RxI^$6gB`9S_GbX_^sXD_JZyW(^l#~mzIk+-^2T%cXvO{ zQQ-dea6nXAyLX~9A$mh5PTm|rZ6+5$ck8_lRQzkD;_4awj-QnS#*JB9VZd}~XbQo; zYgKyVnYMYb0%Q~oS`j)TfaKBjIdntrA!V||QKeDEP2!u!nvB3;q0_VwGNp=K?P2VE z(7U%QubauAw>t01S$e5x7PAI`uKW{UvkEb&eT`Iwj@Xj%D3QH#{zkHe+xb>?v3h@w z3I*mjJ@Ap#&9zhS-B6SUpZ-vV6?stbpGl4E_BQ8-G&b$0pFVkXHAXMw*PPdReh5|n zG4}P#!dB^w$*jxdj{C&^5T(Y8VqQ&wE<$#nwGVT@tF6mp43Xnim75^(PW9$L{TUnE zCC4MWY061YUZFo8keSUBh;%6MNF8bZYu~W7IG(ehN7PtoS8M0=5xpa0zIh?oa^lhj z7E2O6S(R(zYRmMJ_3LJP6S_c;AHV(i!U+1z=SB84c1*b z19+}P^H+8&yu7daT9}VhzrS1FvU`Ui(TqcBblgHPw-HC?UgQ?>u-R|Q~R zvtRHGV;D!S0sHJee%+OAp&voWmyd=&ZZC@>G76j}7PHcSP(SBHmE7q0;pnv;_!-Lw zjHV=*$47w?!_C45p@P}H4+&WO$dMlO`HV~m%IRg-d#M-Fa=a~i@q%6FlTx+gha21o z80wa!rC_;skN|Z~s0qGG(hP$=4*V526;k+=!Jxw{>aotFVt|V8Mv1PMk|^rk=<>Vv zit@Yt*SWblBzAdzi0v(KD=kXPj!HfWR<;7jVa?l%_;eonE5LTeG5Q6TLPzqhHs1t( zA%FSn(!kAsR)yXuR|&qJZ8EnTYO3>^5Po^G+7utswe-^OC(sLzj`H=WQ}8;snyc+_ zk0}BU!98vJ@Hd7%>n`P^C61AH9+%=v=01ozK-}IvZtW^t_}$Ys@iVxl#q-jF({iEm zBAb@<>B;clx(`ClpLe~OA==c%dF;@V8t1d#-xP@1;7UiUsO&T|>V56INtWSxT@d2A zH%K*TQ-r9M_SwGUz4^I{mHEB)4P+_)+WLk37)PQhbpp26%eEm1ZuS!RcILY$8jJD1-k@$MbK7zfw`={voX`U4EbPKc+q-8qG~r3K*`0!+sn<9b!&qHLdh_ERs`z!a`byrtxI 
zpkrTkrw|k?6fFOZ)Ykjd`$VUA7>HPqT%kH9nqCAR-4pq|NEu=d{m5xHD^UH_7#ACAy~0Y`>j-sQ_sH>hP< zwqV}U;*xv0_r;^qrkqD@tJ1Guyc|huBV9kY(tD&aWKjQPLMfs`iAj+O+@u~aBBF6Y zoPc)w{zTPf4PhX^R>G6jNs~5pcTg`vP0iGghiLWvDc3E#Q+F}>veHy<$6K+t-KF2y z*ksZb8-*6@MSyG`f1ySH+x`rEZTln;q_E`LKAE_jxw>A?d(FJQ zze?I{%wQ~c5Ofzo`L9W0^1P?z$GTVND@a2kEd0E_*E2oIk)Ek^yV+CttR}Fr<&(Ys zc}A$pJDXfj26>kHeFKop$$(#lAhB8;Bt5)5An+>9$m6B2m}J|eVZ#-8`{K^%Fi7`_ zu}3%00u3&n-`0^XiT~>5gwRp5@xyY&)*0!+iO}#8bVzc@|ESrv%a!ZwGHH(f6HH3W zBt(yUDAg0!3@ngn8Z}|2D~y7|-VOdZPDNR^h&P_N(RjH~A~~<$W;RULYo~s8$u}>N znukZy&?~30+_F`<;(FaiJAG>s* z5*p$`GjXbBy@Z#(L)MN5{t5zU`m9(BkGJDuZ*Ll2inYO0%D>-|%EP_Y(^HD=ixe>i6M zN#TWY_Rx(pUAdJmqh1C(IcZtMZ`X61``7zuY;lWe6qV6fy|YvC9n zVTY~?_g21vKd(YAB;~EI5Sb_PnGJ5p5>}3d<1gdJvV6Fso15PYY^1Cqw7F=~v#-`D8-gAdyx{B6-56RzXzRECV>qXgOffOWRn)su;z~3`$S24T|&5X zH;MFYSfJwJQ9`30fbB90dT7?xw`~NZHh!S`v|yQMq@E`z_RgH?Gp1C-CTWk|#RiXn zhxQfVxKN7!{RX9Kt|#sNj+}5|Bz@@cSnQ!o=m33qeiuw%#@BjTOpsK+K zJ#}LjJRTwP#;taM3(vuag$=ytLjTo>%=91eIgweo9r z_VOFOet9&Gk^lU?{_zw4oL~Mo&+&PDlRwp&NdDt$|D!Pf-zvoa!z&?CPf+DJkc=Jw z|Nm#-9pKMIEW%v-=Kt{C{~w-BJUlNt^tk?gKmWVP`v1O(|NZ)#kan8jSG13YT7X==LUsa_45EP> zAL_s8hFW54`!^79vLH|6l2VO$G72>9pvibhH3;=glnV+W7n}{Sj*d>ASifCcMQD*_$7=fdUwG?z zuFZy1VVZ_2uPtj-60=KRc1Bt0Pmu`cR!SSL&3y=Nj(O9^{vy>uur0zCLBzjTko`B=sw@gXo(e$cf%?)1Fc*GKyH!NObeX>H zqg(Fv`|lNh6dwt-U$(SefY4b?VcHI0Vto$HdklI5gzDH^_PRij(dHi_B{(sIK2QVx zx|L)u*eJF*1>{!&yGo_Mmu(L%*=0;-)U!Yf<=&JPk>~Yh&avS>TOKs+2#Li2Zc@wSL6iqmx%{OaD;b#8GDY=8H z3AIr}@MN>!aY0(UaM1>0hQKJpr~kKicVWyp6Kx=+p%$}xWA{X;b zc{h-Gq0hOTO$o!-Z9C<6iA)~)eSFCR-4EDwGj z=l#E(O8&PmGO<1l37g7OOk)DT^UO2ZV?a1d$vaym5TY8$m6L1IUEo`GlWe_O$L{az ztB|Om@YwnF7!S=q?`EnfdTjy(J60D3;ZadS|JwQ7rm!lK@!6&dU#;JK>7(f2w#c{<-}YBToM)j)hX^9~;nVJHCn-leAi-xy02o;k&XOXa29ON+*26kn#-paO)r z?G`pqL$0VBfy6Qt*m9F=`FRIs?jc9-E8}i%X`uq^-_Iu3wdvOz)9pJkp@NQ%jzFe@ zvxe6+Kz#S9bg}u%AEx}-NFU%%)r!^=7&`-344VAm2N~T}^r|W3*8N8}=l|V$=36BF zb1(rLMKfyuF*Es<)0lXCaQ<)#pKdSuWuXITiF@(k z{jAW7Ep>msLg0S<0-c8(cc915Fn%zkb_9e-IuexzK8We{$y|9-0btq40pc4}baa>= 
z!!#cg8zO8It9%~rBREa$7fIRlUT7S$wR>MpDuMI`UG^;wUJILb6i+`UXcdf*T)J&; z8mHmiJbCwx_ZNUEdE|$8aHQ~9#d`tyM*k0cZy8l}xAhGxAcE2;NQWXNAt_xVCEbnm zCZ)Rt1SBM-k?!tJX^`&j+Vn>HSzPBn&vVte-t+Yx;~v8g?k%$ad&OLH&H0Np$(T`R z#!qJqyL!O-29R~SIHlcxEj^T=`Zn6};$tb)yflAk+-%AqC_=`w*aS2*><5-mS2JYi z)jpHK<3pII6jaFDvdk-vp`Vh^qw^OiLS0LL@I)rwl9M9TI`T8V)eN;)#e5)Ml8-Hz ztqRVmkv1Q=Gy$HSspF`qH)}r^FX7)RN?FONHqaE(Q^Qfzcdz*7`{8yCiI5bidU4`f zmK*!CYgTD~Y@e`5+WJ_BIQFVt2!oc)FDFht1+Wri&WWwsF*)-qguf`(x3qPTep1(3E&u^x z^x`KY7n{jwmvdYD!_K?X>2>U>=;hu9vA@%QyY9pmffTyLuU0Z9Ipz_y^?AWvI5H%Z zW#TG=v$;a!?TyP5b$igX>AC-qFh$j6 zrqRu%B^iP-v3OHELnn=&Vtd>#||5j=ceK?@e zj>7yh^cJ;Tfy2a|?`Wlp)XwtM4^Hj$F#n&Vi`>4Ai@=fML?3UE^ZLYhpv8?a1-tpR zkKU6vZ>*9x+}EwnIDlEZKDc2-zq=1t35sX4b1w?7JbeVo`d-pcLZ9tG7uX$EG2{9r zQbtn+;=Z*en{zpcJfWh{+i~aRT&SkaA;`i*(A~|HJW*!BVU&} z_RE8hazb7%{pm~Ll+~o!CzP>vGUS5aFPq`fzjb_oRK20j7f88hCAnlk9o`a;h49W% zkEsFWu4@JJPng2p)2lEZM}Er4t3s{{QnY+_H^+D0bpou8K|KR#4g3}L^#Hf97ToLn zU(WD#yr=W&{RgNyN;qHd6kZC!Zt>qMf63B=b_B%iFp<1S8$B`LiS;meHh@%06foB@ z#x;Fx6*djpY%J7w&_xOyZ@ouvpSYiwhRuiD4=PQ8QHyrS!~ySZL`a!- z!rp1~SRBYbu$27x25#wDDtWA*sPNu&MOh$eZL8JiA0HP8Q)yTk1w~tc7pH`bZ^6uQ zMCEDODRZ(P-X~?sd}$6KEZ2<{)FM9y#OO$7guW%)QKX{3b)k`Jb^0O`1eExSRhO^4 ze~=FOO@!neH=AM?3Cl;~?k7lY`>xV9J*a<>bM5p|zIz~>0^@QQE!%^%w!7OlDSnzQ z+FZEbL9|Y%OFZ8oT*CNi@5QqDjIOrIn=j1nP&PcmgIe47RfJnlkhqj(%0+OBW~eQ1 zi^ZbNUj;biW?%Y;d>qxp8o_}6#OQruY46x3YvY17Y+Xe6Nm#~Zx&Kj*a0Nh|`2|EA zq|R1brLBp}Xu_6!od5_!Om zNTQbU4p%^>hiSR&vBrV|@u|RAo^lbRZ(il;jP#{QT|rW9IW!s_dA&kB+|F)2Bn~FItrUFv)Pc6kdYt1_$ZAlYpru97-5!sn*HgF z=va%X&~1hHb%LBLSXI$7Vh_E}$XHQB{V)t^iX8!HC0~>cW4U@C-IR|{{S&-UJISZH zVI+>~kWdHBk>P z%RM)~gj}jrvdgDDc}!nP0q)mc)yR<)GTv{*G*^G{U_ShZt2Owi>?a5`@%-Ii!qZaY zO8MtLwxQayLd~6Hsmo=ohF7~I==Rybb@9pP1f7HY6JQ!Sl}a^2fS#bzY%BSZ{t%!A zy6R;y{c2*xeNk{ZTTOTon0kfLuy%yv%FG(w`3!od>5{QVaGqIMj0jKXo8 z{`6F#PASVt#A)KsAzyd-QoSv?c?it8<#x1`n_j+7k z)OgWjV&N|Hoxej&ROmY=kY=RSdU?d?Nb8Ny!p@%lOo(HibS#V)=oqpPj8)d#0VUJP z*J$ZA9t!nRhtR7MB0^ruPkDu>*-X zSOM>(p)y+Dgp{LkV7KHy+ 
z0N{CQ=pK2;kU#6eHpjO|U1^OK20A-Lfv5)fN@z*(I+@I{G!2^2I|-<&RWyvYAU8jJyf93gsPK+CWT`u9FJXk}nUO32lKLD`LZnG4GW6r{t02 zb%3rftvAZyO-UV>8#m3!uT`0d(4u`3XW>AaFca&xBhl-(6mZ}Ah%^;{xt7atMK3z{WD@_k2mXjWHLVI3? z{#Xfj=6S*prE(z&A&S4#N_YQih=cO9-P0)10Dj2fa!1IumyYPli_})zu?^#D2KZ1> zgV^&-Pj-d7?K(&%<1N926nP?qy>}?c6y>Y(T=z}wg6FJW{Q1#1WWe8jdwP(t?R|V} zf$wA7X_9GUA9VIl_H7Gy46}e@i`uQ1Wpz=o83r>_oc~lR`qTTnX9#|1L?(}4Dix?v zWKw2`egtliO_Pck?7~bV5>M)}W0zl?H~}N$)%xRsGC|8&verKLYFyWr2e>NoU!J`d z9PPzSMQ{Fy{zletK7QP)pJ~>|Z&7p~JO0dLrpi(q%Jh?1iWUoUe*TnTS}f828zEMW z`;Z#R$c;CHzwk|C zfIedDnqAl908YJxmznF-PN$^h-yjEs`fB#}m9eN2c?6Bcfo=~n33#dG=PK=XlteJK zsx3y)QOXDd$5ry{?S3%fOq1VaO#`V6Qx8XzSPQb$?eT*6r)W~WwmVhHu#M>dw(`V( zm#&tlj*+6u%DljY-w!#q z=6q~Dl+rX_rAq7UTSJen#>gRJyZrhB2>%_|F`}18=E88M*U4ks4=2#p{?Hd*n_lBF60>)p(KXFq&8{-XS;4)7t@#ZLUEEOP?QYnQ z$PklxZ%VrvBX^9gZ%~Ynx#1o0skL1`BB?+Ta0ZI-{T-{XG^Y}`di{rQ#IfXV z+1rE`!Uu}VniMkD=J4x&x&;44-b&|urX3pcJ)_3Q4D~-@Cak~`&S|%#vo+fOsGHJ1 zDkE11SU}ppY2$Uf=)yBfAe&s)@Xx&e9St=7%0AJf7mXofzeIXN`gZALYh(?Of221F z2_6f$=j}f=b2~0 zF4Q-AR~Q~}*c$GTibn=C?$1)By*8!1J_Je8B9i2CeBM zb5I~MWc%73LD>sdgh|Zl{=$O&i~A)axhP2xKoR~Rxd1Bku`Cd|K8e?9_i~s+zCFxo zM!Uye?tvRN;37L|JXC*Apq&-XDb zf~~#7zc0H9W*`Ys2I`kR?`-yHGxM*|PBbn4WqUfdrwYdS}WuTB!Rgdz0 z5!{?hJ0!YRe8f8>e(88qiJ_V=rJ{OlBzrXCP@2^_9_n7gKlHiT;JZb>Jb0HsdY51h z*Ev{rPHQ?oe(Z)*iK>HfK9+%$3rq+@OwQsfw1hR5$|!5GY~d!CpzB*5=kMt z>pIv8{*$@~2Ny75NQbIaJ*T{$^8<0s#PPTmP z7j~w1KxKO}B%)5=_mjk%>a$ZHh6N`S)GqSHoPf(ZLQ=S``E-J509pn`?*2Owd^y&wAc?rcXjPfj0M-@;Jx-` zu(P7nEo$ymjcI^Vdj{RNmbQj9Co5}|xWztDoj}k^MdREgg^cA!aRX`p=;KE=_-}?i zJ82k&RCpL@>F6q;!6cyEp@c+F7}KK*{`$Sb{Vmh>WJ%gx&jKVsR~=f!Q0=xQBwM)O zv^JpZOGb?#9luip&8N0(I90*}Y~{Tz}H^O&;(;{%B2_lBGoH^6ar%m$_w8BE?wY=0TM7_xzE{%_5JeMW6Kz47#;?GE}<`ajl6@9S5_BjC*Fo^&p^K53& zh7srgBZB`s9w>0YVeC2LrPeJAI6Y@hTA$rgM7OPGNG0^Q@I}mul!HL*T2c{+3uz(JVD^xyI?_|^4i}i`z!UfcZ=Kvm#@1PkT z4qX}#4^L^8>`i&|&5~BFlVxqbC^G2Wg~?0vjBUfOzjXLq)(Bvy)n_}vy8)9>h`~pM zW`y=>f^l{XUAYG+Wvr3h{_?lGB6A;cYrn}cyw}Mni?ice)Bua4NcBcS6D+yIr!HN! 
z_hO2tSMY)^L-a4SvlTKk?_c7mTR;onbi5?ElB8Ws<$1mA5;~k<{tSy-L!5yk#r5GQ zI%^G3Hwud(le8IG>*pg`RJlmvaan7*Ewqa4h&W_kXfZE?ldbj!8@{H3jE4a{)&+c;?J52nrgHPo>bjSw>IG^~jMbGJYqBCp6Vwx@S@+mXk0n-v zavs<>l4E~YrM1SHnEV*=u!t{H;NBh6J}@Q@+pYv(FnP{)M6%R;hHbFWQP#P^Yq_5w zP_kWW7TBS#^&|BtQhD{;eS;)6=e^${H~(%{4{UIIZCz9}G+sg9XCmAAW`iF4!}q$;ON~&Fv8^Ahe!9NXW?=^R{Lec+ z%H*FLKU!*R1W!&XekklvT2^=vac@|U0JR9<`vftZr7t(XLF=P;*y7xOqJGn51sK7@rmG_fCy>6` zb79!~^(T=HkH1hqi*!WBRz z!n3*p!q$%?TLEaHBJ%5InNACwY6Z<^9IYHUtDk(VVMusA}Cc zUuc|rWL5QXNpL|=c;t2Td9CdhIq0XA32qA$_%X(@zNkgz_OSyV+`}6Db@zh#6;XGT zaN?89nc1gk1(mTu6R(mj=($PgytNKqwHZM0HuZC&js9f*7s zsjF&SI^8;^eFx6~KHt%}8K6VglOo`=3N&}Z^7CK0;9=aW5h|OCdjJ0pw||GH|MZRu z!W~Wz@-xM2P1X?f4$=jGL(&wJV2RG4pViE24h#%bE@wnPWjX~nDgLjU=)3y} zz%UvD`nh4?nz6S}RegI0uPI)@O5yMB{O(WC04@N(HoNKn26Xec@SI`~%x8S=S2q=H zv;SfXcrn6(9_rrw-j@fP1O5dxz7Ot*wF%Ze)OXf4Ihu8q=@}WSg@_K&S!&Ql4Uf@s z&Qbeu{+`TQO7cGfhW`F(WP*`LyLx+P)Pv7L>wb@~`r9Y{>ziSMaIErg*t1^#<7)id z!ejP20Ab~>Km1nrTRQsp&-d3){Oz5bJ1`E-mW{=K*H-!6!~O54{)Zv`)A;}I2>ypn5Hj8@ zrTmtB>R7RzHWf<~g>2XP%rgJo zA?X!)hwP@9iZ_fhbGbMs3e?XRnf$VZ%#OEZ+Do~n^JQnm<#khn6$#}wva+&hh9 zFYlbIH7rXsY$dsWbkbQsbJ~-ytvpRK#JU`AgvcFWz9559!i)Kodk{-t==K?qJ+dz^ohi>cTyf&CV=sFD7Dw zKcs0-Lu5fT7W`{Z@r+hU?lo-3{RgJ?qNmRuH7pVuol55`S8X5kB!Bt^t%s`dUi_Sn zHW=2L4Wp#LsXm7bd%|Vte_iMP{fyoHy_f{E8FhCyI+(~sgX_6woUJi6{644l4<7dS zN`pvq2lgx{v0E-%{2T|HQ2YJ;mbvi|Ct6z+7SNf0tMDhXb5;r2b;UpP=)e2)SHdD;8Xo?-TT)%ohCU)rarT zmqTtHog)}$UeY$W>3O|6-(q*ERUE;c*$NbW;nElgXk&3{h zQZB-g=kN7DC^N|xAdyDSV0z#uoVam9qsua;1k81lCM=u2QRb$g{o37emeYnfb$K0~ zHdreCeD(c91dq0{Vdmf9F`MH7E`44@Mf}e(`Ng8oJqt(no!RLj8#_`K48gY*e5LWv zU1cWZ(IbE=hIP55A|iQkvi9vu$OxrNWBT4?V?gfZ4Mn_>@Zi}@aRBUzBv(Z$e|M&A z1NPpfLsVRa$rP7ry_^4{h6=lbFX^tzNV5w^{cujzi@Fbl$e)gFf+Fmn?nOS8$L$HZ zGWX(8M;;KUy6fTkT&we0W~l0CVJ_2!pYK(%VoVn(sr7q4%ps)h%Lz#)(789LNSoO= zLEgzWbvuPRxJ8jE0c5r$Bat0@XQLqO)TYf+-+^xhDQ_yX z!}LX-K!{EV*kVkVy<+3AMdnM1h~{s15(|BUl%g#?!LVZYb)gL(DJVe{zfNbgdFiHq zEX-Aug|pa+^oyL4fauFl#JRCby+>*fS|-B}%l5(~r>Up7?X=7YDoqXN$}J}wV@Gr1 
z$^OTd%*D@1(1iy_)Ka6ErkQ&ItA%xcNJ>0NLSK^f)8Z`un~Xrl1k(a%BI5$QWuL?c z!sD;C59Ur7GdVq8U1nY4>^QBT*`=F)F$%gi9n14s{ADmc7+qk>-h1bWb{Jarig$hT zFu{mZ>eVrX#3B&^zp~fn$<@(~iYnXdU&N}#CK354pZ(mO_PMrB{q^)W#>qw0n_}m2 zi8nUYo;WVx+t1L3Fx5BKCc9Us^#RR5;G{OuCC%Up!4U>sVapz_w7tQvEkX>@7X@*X z$ES^()t^DM88l)DME;w=ovi~-3Pna8H0eLD@k?wk9G+<|dj}|c)8u)-#JDPZ4fD2v zs=-0e{jV+S89V6)gYfB>knaS;7}@;oo*1l*vy;Lq36=L{N4|CPmi8PJZA^(s?Dg4# z4u=}awCq6^q9sly|5kJJFT;k)Hha}FL|Cu^o&TiBM76^pv5R@|^oCfF&y2)jW#auI zxymu_D&gn%gF*}4mFgt&k0G5&O?=`J1{O)B|qQ1bb#uN*$7*4fvCBBTl4%*;8zT_OX zyO@JDtiI*=*xV8VeKUA);r$b#6<1#4*`Sl3m~P@=38SGI+?L(WB;*B?jnBSzNGreL zwEsd~kn6@0S$DZuQ7B895`JrYiL~1?6)2y+7JbD0+^_NZMw4?{gZGz&SKTHG(t&K| z6}?!oqA=G04X#e|-`zR?l>lWoB6ur1=k4EE4gNKBg^QB%aJ9XjiSxYog9Soj!B{wH4}&>QGpS*=17U7yo>i~6r+g+stpPsw$dQA$U&}) zI4D3^E_~?C=13y&t69|Vxb+Doge$Vn7gcS;$YmH_sI^@TYPFctQ|<_3m_L4nSCn?& zgM;}lmh)*nbff1Lm?j?Ycm+$8g1I*0sy*NoEQd^!Z4lG>YaViS&r+7N>|jN)4Q_=e z*6Hu&v2Dq+g zd2Y-Nsxjjs_CWsc@h2wvUhPJKh;jS#)pI2(MtzLT$HWf0CFct_G<5J<;qSkW<$u7= zSV882C+qRnjjQ*W?g}UIDDykzTCmiXokh@H54cim+<`m;^E z4&|)OTyPuX(^I|qhosNsO(&Z2@{@6i3#E6`<^**jW=c!a&MQ&ROF1#b(Y_yJOiMS& zL(<;vYUgjn4iwB7Yl%#=jhnZ@d(XF(lMGZ<*uX7e!joZPs{KTPJop;!%y{gQbewQB?p+$)b_d=2e1 zAP&z4_6UUb2MthntCC&bJ3eD>cout#t#v-l%hzw64w_7%KE?hA)mhOa`HlIB88tt_ zN#tv282EQ5Q8fQ%pk~KF+tBY-K>xMCVeoJm;n{oCb7^8Ob*3&QiSJS^NZW&B1U3)G zc#3)lyvg$}&IYw;w^Rl8-(w_`y5fH#W9B{YhR&Z2ds#0I&Fx+77QKw3hEu`Bpjtxr zBUtQ*Uir+i$F(y6&7PU@jo$AYB&MwX>|v^HFGb&3MAO}4Y8yOHv|P_xX^S4&KN3Jg zm(QpZ*Wcw!B>S3Ke_60`)21gj;`uC3i}XEye#p`(%@j8}hl8iy!g^?|p)b79QvP6P zFXZ-D+9ro@Jj`HfqpIFak%5R$J)~}punsc!V}9e?9BI$nL7Rl{Fw#n|?3VKSLi6Gqor=cL%30GORVEP$e5qY)MhJA4z%w~b57W%Ns*0|;KdDl(#KHvMkL|D{s5 z$917dz3F)CLaxF$i*Te(*{`=(DR7XpUkPC*qGV_?^OgG<|w zsk%;8#6e~)L~P0|YsRwuG)5t4}a#EqAtgq5}q3M?uz(>>eL6z1F1 z@sqMf3H$Vgt9^)JKeWPV4k)2xeXrOaYoQmaW@E4v<$5e6=x=u)oi}qVX9tDLc)TN= zO{i+=S6zqLv&YK}%=%Hu*E5>X&mBEqn@fx3H0wRhf7g#Mw|lhxAXJVv z^Ba$@1W5WOm6Tc42Z4^0nOglz49??UjOaR(Y59m8UKl|_`t3ifR&b~@#&Y&NS1EMu 
zSSDi*X9i946{5FxR3|XDj-_{2=3ZJiO3>H5m|rXA`}3_Tj2{abxJnBFydM>S{ze1M zMhdI-(yZJka2WvQ)eXe=D1iW@T|Q3XhTiLjU;XYW@E0PfUM^6iFH_+af`S9{R+h1+Cu3aSBKIO(jSP zIN}W0yX{uxqh3Ey?$JQOjNW3Uz%>1|nBA!=!s&OhwV)jkkB*-J8Tm z(@VqkE?k}GXhn7h(`@S{q07?+t^A*vN47^F`?a$4)(^X1)^rMr);!QNstzisSD!Ry zJDM3Jgx|`)m9|r6n$F)YmYUq5T zP?Hw;NylCR1oB8~q`5r3%=Uak-4&n}#Q^v9ovrCuW}}|r42g^s(sSH*^7c>Oy#Dk2 zD`3NA+-DH1U>+`Lc-?Ta2{l0YM4b85{0N6CYIo9rEn@Ee;H%6G$}ZXv4nyB`J=cTG zd{&mNQMr(6>*BQTB9dED+AMn8NHUvMyDaCxh(%dFhi)i;bqvNB~# zKhv<|qH(xF0eZ*&Q@t!%vTPAO+Xp|lT+v=zUrKy7Skh zhX&6J7_^B49#)2Wl|^_=4CRD|O)#PTpq#cmET&ycgCx$Yf!x85?&ispT|Gaz4qJiQo;7WV25&x@B_&EL!lkB9#|j9j*NVL1 z{uIMxp4ST>q^kTCQsw`$N{4!QHfNrXuPBY@-Di$n7WWR zHuX@)DWQ)G)E~s{k4uFx>uz!lvE(V3zvF{_PqH&IRAz!hH}ECB-N$-oJcVitL@5y6i2tqg17~11k%a zP>;Eos2vYfa?2ezDNuX5TB3`ak(w&^;4MtjUSC^~lpvRutC&bBW6lmpAvt@%2~d<0RqnZB4(+WEd>K zN)N7!5+w>Q9qhN&rjm&WC(~DIBn-NfmbK*5PmeuZ3MgerO*>x3vP1KV+on=TT-p+< zvLRaLWitzP3~@yZ_?I}%eR zE&3DxJ^Y=2LT-E+DrA`x56?um_pb zU&)Vxu|bBi8ssN>h?VaR&*y81iCZpfOqk!ly`1$Qs3|xhn@DzZf5@0_QW82;eDcOT(*j_gzitPzdVS(&f0uzkXU=HnkAZr z>Gq<)ly->DXI4GHpkS8J-Yy^qc-bb%_nm$rHcuC}+{c}d!|c+&5|Gdw*u$Mv$~lm* zy=_q3ay?<@b0|gGsy*$!x#hhL)g@^mIgw>=Y_w@Y#EyL7NQ;wn^M$hwPJLu*_^@w& zxLWwbL3rgB?giyzq*Axum1q zgRToX<%er4tdqJmyS3{2ImQNI98ERD992*4@J`B(S9_Xf4dS^whwBGZUnd(K;k$QDk7ZK0aJGmK>f$*dR8QZft;P5mw zdKe`UzgJy+#69MBVo}h4`OYUUu3Rj_A`P)?t>`M$7H_JkxGlirIbijr%aW?gXO&&} zT@5X;FNvpLb0$uSNIf>|&1q`mLfKivo4>6Zc6&Vzs*af<_x>>SGCU6|&8Eoj{0^L| z0llQYmAJH(i7F9vfWsprU&O}7mO36TKZ4iM{r#Nt-vi9kM@AsyE|cIR(jE|29l7^n=?Ye#%drz?yx}wvuD# zIBf)8`_q{Nv@+|7ofKaAhY!cTEcE_IJ4I)=0jc zSAdKar5)o*?u`!DyT|!%IW9I{c+u782QjA<$;F+ivaBCV4np|(nY|NX$Fe_K8-kMv zj3zEp?=@tQ^s_WQ-?Ky2-*y_(aoOguh^yZS(T&A9i_kZLZJ1^;hl;B~Eq29JGt# zWg@-0v>5td#~qL55Eywouy}S}NBm?NQXQ^8oZnkGX&iWd5&hp6l!yNdyHK%JMXw@dNidCdk>46dZ$b>K- zHq?lx{w$?0`9M1~U_R#Oi*5w!+WphnRZi;u%UAfvHJO?Cg8bTg9vpGnq$AqrlaTrN z3UxLHm6ZhlJmcaBzT>a+T&L*R@J)GVemgjoJj3oE9_D*Kp4wqDII>&_nCdG@VlYJi z@fHNF-;XOM@+%dYUrj|70QH!9W3Z?2;l4P~e#jIU?Cz%CVg5<-hmgsif`x4C*+z-p 
zM0aTqMN^Bi7|BpIz`3YAKU~N@`^B8M88f%g2n#S%J@C1;-?%!a{Ly=18Pol};bB6b z7%bJ$qohN#%DjUZUemnT|FTifE1#*7kWcANqtPz5jq|jb|8fXrG``8e$GP1rXH6g8 z^^nV9v!L}RZTQEP^jaGb`!hRjDrsr1!Ay?4)iTQY$9Niy6 zxvxTuKYpuoz1+C%N(~tCaUYy^u_YJ?-S=pl!WpRCk!2osb6p_tI%}sHTBP(CI{W6~ z(*F74>2=SUwCt7Br)v!l-Gsw2rK}g&3b3{%MuLjr@>De~5OT{F2yR zJv4n`2}wy>WT9HL&!0bUy=AU8+i3t0wlzNji?BPaCa`Q%u58&1{{0Lik;*_GFgcfh z{nd;k$fX3%@XJqX#-Txo0Tw=nL5LH^q~5UJ%_*g$O8wqKRAF_LsmnJ>8Yz%ETgYs# zDyfW46S4-UQ)pmAn$y1haEg^sw`x4K3zLK5uC5f5i&~Y^FjX34(XC+uVbeG|BO>a^ zwXx@b!(yje0p^(;A3(&Ufp__P+gOp+3;EAV-xa=>2-_ zZ(ylp(N(}A#N!~SL(rUQKJu2if6W~As?gs2A?~)jhq})X$gI*`&`LZb|2h=Whg!*>pG)qml=)guRD9 z7RO4!g~p<-5ry?1EBV(E{%gOj!u@g5z+*+`@%x$|Diun1EdFWd{liNcY6K*i+$Rk1 zL4QhF{`1Qe=SLVR3Ow4%Nss=pW3%PpIzxz$^Cvu@HC@PXNS~e@cvxI*ka7KG(0&_3 zeiVddUB-v7en`M+S=@^UN9aQRCGIz;6PCGLD&l8~Fd><~7=31g14)gzxcq&C0 zZ)WHVdx@AfRTl|X{gE=Rpi!);Gq+5_sgfcEx0)JPggDL7Hk5j!8$*s#o?oZ=ntW3D z_EJ&R?@;wuNSdgVg&Ar7;d@1jdfv;xdTk}of*>a+*T=}g!7=nK(7sU$yKVwCVOHyKougtYJXt=wWb<%ZF(uU_E8$E5=pPpL=0)D#Cq_3eu9P9$4q)eBoQ z9B3(I@J2UdS3Ml>C`ohzZ^xYsS37`!pxm51U~Z)UkxgbH{g}ACh#mVG?kW*Ee~yCn znuASGY+~Oce!>$Pelcx5R2h?>sUE~#n|%s z{TKejlI4#?FcEEd!?PRDi>lpRm6u9lrJfD>oQEGTu#$P8<@cg(W8N)0 z2ID2g6tY5ycXvXD#N25=dWM!2wC^;P&%bIds+NI>MoMQw*;gPJKRq3W^Pn z<~5wYOztxWL*W#0Bn`LIu>npymqNg*nNBcWPX{chAj4dMGG$b?sk8{zdKm~rTYG>$ z`);3Gsd)O%7rUki9ud1*_sySV0Ou-D!#nJ>k+^}U?K5$Ao4+GSI9YCY0KQQ#5K&J! zE38zEa<5eqKS2nWW|-!+J|QW4CJqCpvX@h?(es0e6L}8yxKqY0upl)K7G3Ub^$(? 
zU_*I*t(5JLHW6<9y@fW=@i zma9m1X{?ui-HPTI{x~biITz@7Nmp8e!Va5R7*Gm}H5%86030N|G(2WQ$9L2P5|^;O z;2N}LGzDXjd$)IXy8NJrp8l-ZzF<)gMdVpsUF`x9iTj#^n#2nztHpT%EtoyzgP)%0 zs8XeCfD9T5KBXt=$JK8^i9w0+;&Qd+AxR{l^osy441OL&(y!p^%Jm$Gtn?efZg2K) zd(5Fgcs_Ok*&gn?;RfhveH$q+XTjrfE^m0DtZ5f@d{aPkozS5?Breya{JXSGM~|pF1}j>u#MErMmVo zS?Ug=j9kkgGn0-6lkOStUj>k=~w=+istTg5Plv(=GbKUzz!YqncXT znpd0JDwyCi;@6HU6LSB`-RQ)K7?%rn{GXf{{PLZ%+Ql1TFHSAUQ0vmWOP)Wl{^%C6 zn^evhNmNt6$PgBXh669Pkn`tuG1-cTF%7q$}2#Hea4;X>-i`W~dk2mCr$;W%#nSan7S?!yw(zL$T> zjeh@wt=vf|-SnJfQKXz$HD$UUOeZv~yL8K~hC3P*iALI8sM#)n)Tiqlr&)ccQ0 z9?8%k%L;BNY*f1SKB}@UL!~R+v&>u1Zoz;0H08-}CoQ~^uas>N1D%NdF6~e*wu`># zw6O0j&lWW7yhU(|*|MyzdvxU8S$K8(MH0-<>r6|oX#J&QwuSZ56>?&n*fm>s=~_kX zK*N@vb36)P?V64&X|Wi5q8VhgDuacqsm{#V&W~`aae~byB-4Ymnd+?W6l;M@co;5O z$YF>ToW3vB7vlC<*olXwnA}&dFB2yV7(GSc^VqH-wKhdc;qYgj58+}T!HA+Oq{02# zP&y7}pL9FLU`1l&#exw&V%2_vEMHykaz3S%5YVV zbhxwi*QVX4kLxf7l$-TlXH8$kjW2{8OvVCiv|kJ{d-_pbIk3AL)A=5v5i*{cyi5UX zNBZn!M1~rl2$q#$5U(ZgMegOOr=4_KR+Aha5xt;Mv@nPWp7T;N0;S$E;c=D=Qs7P zus^Yl;=0rA{XMa4tLK_V`Io+GE(Vq(DBP8sl`a(2aXFNhEr!Qs4v>%UzAyinta7be zeaJ)x$ss}dJaiUo_=K}Gk1J5|;FJx(TS-HXdS#nct!p~}izW^M&s>3Br4 zsm1iRnX|4W3Ux`Pmkl{ed#Sf*lDi|b$Wol{C*QjHqmTR1Q{Bqjt03g&9vrQsdwec% zsy*e#7)n0;o{HJS#{2{$`GbYAsb)j(4a}j>))6T8RO=N>;bSvKm4kS-<8N`RlT-%n zUw0bfo__R>uzjz&0_7kOo|x4)b{TBwNUZi5W|0Z>{$o1u?-c*HjG+XV>SCM?X47f* z4%Pb{kiZho?-U`Js31NT*X#3IW}sUsm8((Y1> zSV-G)6SZlw>`3eCrh?sR$E_|tG6dTj5`8pg0h@@+s;0f!w}v4$#stE#8n58 znX2N&dGhz%1XEbLn%yJ4E2O zVe;~*m7IlIPH;_JMO{F|UQSx{YQs-`*0?Wd)qUu-w;fojmYLLrlHtX>b9-jm&G%K2 z<}zY!c=_VKVNeaC7u_TaSInHa`I!U3i|e;PDCRQzllMe7e)6h;gP%p@Qu@c!;F@ie ze`^6`eajuFI$VQTJF0&mjbvMw(x%*l@{U|VMeE9|^c$5=YK-w6?dk9*E8KULGYfB5 z2dK7JSa9UK9Uf$i1z2O~z8=2`UZVj*>5t(d8L@$Y!MBS- zkQ2nrPfdfbuLG|In%-+5`ToG7(qou0-27Dh9H|Z2`3z)w1}#CXNf{!$WgNvt-cDH zj!{=vtthW+6_>xg)_Wf|e0OcYFIOE4MAa$#O+E6?Ghkg|ff<_e&b)l49;C;xn%rsG zGRmn`FAmAUTtd~zSv%=|0YI5b&FMS=|V zN;|s8$9~4sb!j;~q=d{s`56aF+8(T1NAP^4ge50D1Zq8XAN8+4Cn`V+7NT=Ntn6ZL;<2O7p0(?Zc5|)qR5=uH| 
z4j;oSiO?oDH)Mh=gpDN9yv!3kJ9h``Z$sg0$L5lJgo*)NKwjKofYofaDi%B z>mH{qSH}7AMla|{%~pNN7nx>Q)xl)JdGe$SD8@?6mX*^}5Kr2rag|8|t)T>vJBKv6 z5wQeC(c!;}j*2pn=6EO8d`-;fe3E^&YsAN#j7M|>M8lZMj9plI!Hs^cyVjs{+>HND zkdtF?BrEh}XJQb60V4F<6kNW_;JE?a`ZMm8?hDg<3qZhkt46X)WVB$g+!$7pS6JIK zHC-Z~C3U9=+zSc_YfJtlY-b7`9w)J8&8fWpZlTjgL*~f;hqJd1s50Nahb2`6MN&dQ zX(@#R(kUX{2vXABDUDcwiga`6l8`P518I;Dq>*lv6b|v*#~a^!=iWQc`_4ZzI>tHA z^NGFJT6-<2==?sefe|*{L?Q18ecg5eebXWTj9BTs@!}pl+h|gUr#EG&R0jA7J6{dt zJYE=iS6FA(VZ(!kk|90ndNmVB_sp-blT3?JIAY)b@hH}aI8o0jGyb*%$MEW^=u)ZX zD&_FOs`EDj$!n!Gp{10@xEfx>GkCQ6K_e}rYc)$=G=+A&QQz;0$wuvzvs+x=1! zb+;L zSb~oQ*N(<|v^1C3S7qJ1_(IjnA4Vb6RdQ3$TZkI&($^Kzry3YXZy{xedOcOBnBjs9S?1-yH*Zgz>AG}} zD`Rm(KBJ-e! z?-M?IDgON?eJ@2--*Y0)5AxN!su{NhbfRBU3tX+Oh{ol)ETmN}EdvJSU1j@NQRPlx zvB>SUZKU9CdBrSbH1_M=Y8_Mght`0B?-9n;gaE2azX-Z$%&u?Up3 zY^UC}tIAtfh5<@G${FwA#&dX^Bk0>_4T6VE@mRE8pqVm;x?O2Yau{e=1_1j+%zajbe zQ@}+x7hZLJZ)bYB$gAjFcdQNn+Qpxgmz*3s?1zu^jlO3l*WGt8QfcoMk$%U@QFNJ( zlHbh}MbnmS5ViD1$w$9`4a}fVd@CKSvWZhQ8l_d;<9|VDgH_iqpVPyTs~!Iu7%kN9 zuy1x|M1-cbnk4^gDpNff`I)JbK(}LWd7^T4)JJc96xr;P^P2)*xsLKGM+HASCYn%p zv0k_eRj!qO*0Y~&H=RA!TWaPX(sCDKBDJGkpdCt~9WEKtSouh9&PG69prU@F!b0os00WP-=T2_w&UMz9kzI6K$L-jpbFk4NJSWJm#HmVv{lm zRGcY&q9GPFo@^Yi5VSq!iTvX)6|fS(6Z#UjxUA5vk)E5rdCKw;`%ILaSvqmJ?--cK z>{o8MnBTXdk$Zdw2+-}h!Xv#!t^4c7LE?uxR%3g&UauP{e|RFT+g&{>Uk1B0bl_VX zGxB-)>!qAg2J7rho^F167M?M{Xl4N>bM-rkvP!??F*v2_{{XDYl9)Vhvv{1+2k=k$ z4r$(__7u}tnl5%ZR6|es2f>z1Q4Ym`zVJ?wQVaXVQz9Nu(yBkcUpuaRiClP|M%vY# zDb3Ro<Xz=y>t(!Om0oY+^j4loQe+aB6&pP}-ly;; zc6?&3>B-boFMXh`CP5v^?(S)>;Tuwyx0DNbG+sqb(8iK@t#qrH)EI6_jfPB|OIy$S zv$xeQlL|-eFh}m858tUzms2BCQr5@FuJPx=?0WZ3q~><0|52|qI-MTZ+45Bc?zw6# zdT2XGdc>3vJsC^sD}4cl({cs|DW;~Tak{Q``S3c)JcNeYW$%A`|GwaiyJy#_O3U7> z-q*k_*{$^Z*S%VoAGhYf3Z5^}t5Tn5%rd6rjvP|-u`aijp+`*WuLT}1?4om^=k`U@ z`A0Lk5Ry?y%g?XDa%z&w;q4Xhr>p9Up8pB9R0fp1*G57Xz{NNM=NuV!p~F&NmNS_6 z3#pqt!P7c0|TkoyQ(vE zu?NoqOk0p;aJzRyY>#p+K|YeVb$n7^Ysw|PF_)wV43HBQ$T8E1Y(hV zAw231N}BtwlW+Q3L{gI8m}{xg2iyg(%=zw98du3auk9eTJ7d6^e^jgMM+jSyV>h!x 
z+|7KXD=+VNux3X5_TIFbxF8Bg}8~D|Q3!Hl?kp)>i&fxgs{V)=dyK=$*XAn9G zezR`!s7UW&BxmEHb7Y|`51&;)RGnpNM&&3HdeJ&AA-C}-MPSg?9cCX+k?*rV7dzek zJSrk$+7{y{Kn~uuU-8{Wv^5VN(4!E5E-jrJz_9j`nO~a+AI)X~;moIS1 zaOf7gHVw*d4sPNM;TpfB=gcGDaXqLF;{DYD2!s|JM<2aD$AiUp2D?pTP>qZ5nL1Yn zOWdcTGNXm}C!8YX6^p*M#nHRD!=G$rX-2xWl1+S`k}yv#zFKja86(V{x;QHmP}aY9#%+2kU~n+q#I(+|(!iZjSYyTszkuo6+}|XOyvVXvUJL*2zZx zk4gIjCPXNnnuL?@8xV1j4EYK)x4SG&Lxt#{d#})TouA=E`*~GY;X51l*P36e+3u6i z0zM{1-fpD)Vi%@qsF=4}=FzHW34cs;bA1E!#_|2oG|4*Y&Y78+1t*s|&3sB>XLWyz zhLSrq#14AHY>8W(3DF=V&oa2Rb5QvkYE1a%)&RxxB#-9b8*~%LbS*U2gC-`N`xS2nqaJjacoAH!15UoH z2U;p&b%a&l$C~WDpT*wxoZo^^d_5ovHgTb1gY@$4vf2GfF3l{%vlQ31DwJ%_OnX2>}n0xlce)m1)0xRuddNjv$lN@ zyi-4rzV>ms%9g#*f%e+jH9UXhk`F4z-0OB2n%7#;aU~UK6Zt{&IiM!9f#cd*zV0|e zdpYT=vkLj}eD}BRdu2HV&dcFiuA$ZSljP?63X7i=#w4WNutq1B7Cyx2s}Ft6U$CrF zFVM`XjMTB6-2AMw{}Q`2dM%}_cUyURSdY7Cb%8=&naG>>X3y}Zg|v%Jb_V@0sAsjyS9Y~yqQ!DRKxw3sc$64KU_=rlX z?BwU)&cl=1jupMKS0a#f*&Sms$Ez$IhC*jjh#mj9&=cc$%=_4@M=RRz*dp)w0%Kc> z>5yF+%i4j%V!&x{L#r_FSX*9QvCD^PEb{WXH+DNmsn5t+u?7dyJ;mjqly7<3y*1{3 zxOiij&%P+9kexZ2ks{*T|hG9ph7B@`fmV@7ZbE{ zwWEE>@I!Z7N0!l4St_!_Jh0$4tA$65i45k+8l~Y-jb@eP!2N6$X&yEmd2e5jXKUlD z6$KZ_)rzg8tb|i5_65AIPwKh%)HrAw#jxc}%E)K!^eFS>cnxr@0~=39@cf#U zm0`u9m}jHEz|(*7Amu9!M|vf?*9pZ8`0?!efwvG;4#WA9|^+}9#5h#g+H z)K4(FRrd0$F^XEi`PQc}T8C1nMc%IVZ=4_pOj&z^Xl)~__W)~3Zop|qXVDE3zRIVN zE^9dcuev#f3E2#?>(sjD<(kQcn0F1AnuWT&ZXjEqVi-?x_>y00LFCAkN~9(p^Z~6r zbL9{dGjwI+AmBWA_Yg^ZQd5U)h-t=2vKz1cVrsu>(O6<~3*9J{t#EA2fVW$_cL?_j z!@fxo%2ge`eV`uLM=NnoQ!IKfLC8xiV9HNc4KxoP>-jKR*}5C;so|VmEbX8VjWBl} z{?Wr&>LL3b)uddp)1{OCj^Q{xkc`SOh3RS4>^_&kLW{B~Mma&l3I3a|c~4K20kPh+ zm22{$NX1b z(y|?HNcW(cQ@&=rE;D~HQs(*EDpJ}4se zW%xh^%g;9T6>g82-y4-uW^%h6N`!do6$g7GEr$B+cVeK|S($D>C^3{(q7zqb8A{(8 z7T7s#I-rIQK=nLy@+GcG>qyqY;gnGmy+}{usN98` zNw1d5J;#5S_x<~M@%79=duuDpx4RU*`mb&As|{ugs${lPHgRc1|;$ zMEeV*p2Z3H+I1cqWCU7i9}Sw!)%#Kzj%=f4dV2zoGl7Z)=O;zS$+r)-q*`RUFPIv#;(g*++SWNaXizh~kvExN@)3u7 zY1PocM@3o|$u{6y`$A98`R#y}wz750m+%ilyY1oP-_@-C#%=crt4mLgemkMEsyyto 
z>#lskHa;1m@lgY1g5~1Y?75$O9zJNDMPh-TA0J z*YrTCoxaFSt+jiB_?D;V?#!n|cI#B>NaanwRi%5wrHp9GXJB0GGw7S$KF`!-)#bn( zn|W^Ue`$C?CS-@xE~KgdE=mnNvsrC6D~x@y~tyc1$yOd=axZph#wJ(9!D8Qz_D zgg6(|^++?uX;GIiP}_8_w?{%p?A+fvhF$B=qg>bRMiBn$!QHRr8i_;{@Ez`#C8sXQOvt>QQ1mvCeW^PO^22o*Ec$wYww>LqW{G9#9eW$ zH3HOJmyE>{%(-g6zduKx;0ZF)sv4BE8-0M;6g(GX5Ofj ziS)w<#{b>^8(&iq$}YR16OU~T+S!2EB<@iNjC@#ZBQQ=^hslRZENed!=M^t zU*WU%S~D%T8?yrc*dT#&%#PDycf)93i=K?%TY9cT_$TMAMlM=?05MB?kKe}(&0$Fg*1ECT8S=07_aB~F3DNz~Ds5Sw3xDz#{rgJdRRIMQ6l7bifB2*R z_YXhH!EgEh_?OTX;flJv@SF{&myrXdnkk&~;22m*`J+>q0rSkq+xI~kHNV&KF__qF z;_ZWy_$Zxgg-vL$HPA8O-=%O#ScJc)ENm8XaGAa z8qR2o4YKAWUG*4qddk4eoS0{bPP`Cwvo45Loml`bDe;DOjkDQfZG^*w-of66!#q^4 zZVH%DObT&yXV0G9gwDMcjAE(2EWF%72@Vz;vd{Mm2zsnjZ`YELkN^smDT(!)ht8`G z&n;oUc5C2Fl=l6vZ+=Y^$wLB4j`C}*Lg9s^s77(n5B=&(2PTLfq4HT|74jaE$o7;1IycEjaXkvgIEe(kI|aejikXrj9^0VcZc z0Jtl?px5A2V>?{hJVE2S3bS>KTDoZeUvxYeTGaz(jl|wQ)pm`meX`7u?`-Qu0nhKL zOO4iFuAQtU4XFfzf%KN27dY;|XaeZLV4VnlXH+O>?VW9x&zZ8sTN494T`-_$2B9L% z$%bO1v3kzIXpse)u}FAe0n1?%ApdsAihjQvRz3-GI+xy_v~IL=d}aq7#;ByE^jQxW zrq>J%46Setn!@quJZu$aFBJVn|ik`wUlA3RZjfmjRS8>NS-ub+x2=t>+tr+ZJy@d44FA){{!`b?9e%s zAZ4*3Qq*_$;W^G%@az*NhYtJyDiX2DkU&K7w&9p&%eY=C&h1(iN7U7IiY zj0epPxSOtvpWki0x49o?-tTDfghJT!jVsg+E{5Pee^&gn_@fJ=Kbu?>l@YDtSg%~! 
z2_9{dLtdUaR_}dQUBq{BR91F&2=0wx+bf`Ae~Pd~Zs{j%bO3=&mk*pn(X8s()m-YuX_#<8$S5F%#|Cu|hJgsk)q5}(?F>n&kayA@Z~5u*lINC<{4?j?km=`)Q<1_!$%o7q`9TPk%ujA>9b3 zH&63RmnvO8ff*@YiG+JY={?yigH3VTB);{$;)?RQbbX}-S7J6+N*nwR{AfX+wcxa~ z=W}v=WCrR>XZQr(23w__cL#fr7U4<1+q9=JIRmX}Q%#YYjf<6|2h^fIUro>TbxIs$ zU|>~RGBE&4XRh71Dw*&=mgmOO5qQ>Cv}qWlZp~OWJ{w}coiHwil>>UuuFsT*vpr*( zI^4GK8_0X{6>3HGbIkqm?%u1iUpuP=*?>GchRZ|64iBplb)lp5WBldxDm0jU;>u+- zR-b6PNHV0$Qmg`=<5D(2Tn*EyR-SS3rlKv@4g4y1Tqt371+m~b;FVf#` z4Z0w@h@UasgP@NvSMBSxaa$g^5qKLw9fEia#f|=Sidck6ydNj;8F|#v@m^!>R<}?O zg>I0`d20Uq-8)-WLF^U9J0&+)1jiWr++ZOO0_{8Le}Ix%7(AueHBTJs3<)FxKwEn> zx-4rJs>KW`V=X}E!*;hQ3aticR6W>}II(}b+0Mp99P7r~;bXRcIJ__L&rAD1&&R)C z-D%j?3IS?Rx^~6bC-mOJI&0e-fJ1QpjWkx2Z7;jzx~&Y_I+EKTcORhC~9v9kGtuoj9SD_zBx8WPdW(g z#YY`7M))-z&T%Ch(ReLhf%W~;MbxyF5N#3OSz~Tf3q*F3&3ZdCaQ^kHeY!ALYChx2O7%2GIvUk-Sj*usL$|=G+XrE{G3wxkz7+4j)cmA^A?Acen z><$SO`ENtblgb)r5J7sHoMsP>5SIqYx&|xI@U1oGjw{tIg17y2*=XF}#w@;=9VupE z^!v*$kPx#%s$6rFiQuZ^Y38LMYqE&uQ1V2JK`!sL7^tl7%5>#ML`Ke7xgft5D2&X5lz?yTwT=bL8I4CxARHm( z=9e9sKL7lO#gF|i2frE9;{%{-gC0hfpTK}HbB`#hKr5L@wvqN|QHCk;)%kRZE7027 z^*1#p3wc;;!X<0*GVQoTLSc6VP6IAVy)0=S+UN>sV$nXr@ds$aVEJDBO$UPkEFKZ! 
zB>oy$#mVz0N0TRR%e;BqwqLJDmM9~2s=l}YQ=cgGdH|vqP3*3@1aEKMoe!q?*USIE zmcse<;xF1#Gvl9V6&vWUp0lpf=}(lz*|4m$_sFD(TGiqoA$yUoTesf<8zhNo+@Ouj z<4wegyiL6YEZ_9_SCXXTeTr3q{pq7dT}ZLXG1_H=5yD&+jIJpjNOg#m&Vgm1^gUc3 z7!m@zFVt(57G9f>4>0UfSRHxipeBgeduN%EQromjx?HAf(=^oFY|Ejy+I!EQnA&Qi zq=kTrFOK6<#t#&1QR~!fjx=rr;C^>g?5?lX_X=9au#Gb*haS^(cXmD>{H$qB?NM2$ zEV|ENxt5@+T;<*0^AP25_w$R3h^XBy@^>Vz62EyKfs{-tCV&RMPLGTR-xYx>%Tp4$ z5#XJV88wD+CT*R~ZnSOCf=6hndi{Qu%9bUeT6OxUftAV=Ts2F=N9IePksqF3NbHoY zZN7jL7(%o-R#Vo7pP|b|cGq=O%A-WWgBe9#gR&Hf@LNAc<3SkgZ|hyOhr)_-c5zuV z>$#yY!Xsx%jgM>3pcZm3`1m1(onJ2K zMF<<*iji>RdzacTmU%$%hL41Z$GRR!y3CxU90>xQF9N_$o@7^u_$gVIVrBsbRb%?_ zQ|1Q#0TD;5)~AfD_=0a&Y56_Uye+sKz9&S0 z#Ib@zi^h2w3ZqB1VOl1Qen*`uK+P5QHa?Arh@kFsoZs~&S@vCU<&oH`_p3?fS zEBRmQjoh?fUvw;XZGA`|>#8l<6M1d4YE*q8ceKumRZ%>=UqFr&?(EaFKc6ZpC~V)v zNmfokw3Wk=BMC|n^UA)y)C))Hx>bW$#aW*{f8tD5J)dBc1fbMyTj*3Ict%b!p5PEn zeYU-m)x|(8D>m+>>Mv&5yEcB6b=}r({dm2{UIzqrop*+6pj4ApR`#eYbSYL|fsmZN zo6>8C0Czim@`9C|?4?*xeR90GVYL^79Q@E=XnP827dERM(__f6)eEEN|MIo|IT!PP zKW7~i;DO9c*jbfOUYVfe;N|6AtmXUqoNZl!dBh$o#`aWX!iexJloMqMRYpwOiLJfok$EWYqD)(j`5ROBFu|t(KX@{`89s4#m{NoZvM@*(lokJZk_ zHlpjdNsfz+tLKtD~)slDXgh9(MM`O>$5jcqI))+9(AIW!li)NQ<8@U0H%vlON zP2;$H4Dsh7Ijj+W5KCLZ^XUZTf6@crI(;;k;L`P`e(9jAcK1`wZ{TV2IZUh~s;v4y zTuPa5$x&8#CvNjG=xQBtUebyNVZH!O?_G9*tz)4_Yyr|al9`V1 zEzQ}Fs9Z`@1#^f=f3qVAbKNm3FavM4uubRLSAWCpr9HaZP|Y>dQCqoE+&E!6_c?&d zUqv;yWGD1KO1h@LX!Oh1PT7&{(R5#6A$Lm!_<+5_w(+D?PFf`Y$S3sJzHS>5EBULe zo)BHuEhG1ZxB#WP!rX8&e=!0%O6Za56<^Ydt(($1QDQ|yNOg!1eLK)#AMMX&?k#Dl zK~?PJagiV(LBJ(D=S5@!+xKR2!>Gk?8R0rIVp!{TM7)GO_Ga@%E2slVGRd;grxf0S z%r1T;Z+$LO#MghN^6@>ynVD{(4TKv?nc)aQ+z+M6+F9Iz8dsXV1=+IFr42gWY=!vg zg67H^I0Q!l4MWpAC9y#-K`B}Fs6991rYOW>ns?kD17^41`d{LTl8o zME*nd#p;G&R|>-D{iA0s*Q{E!8;Zwmu63>CNCcphpG7t(PEcrMXmcv)LT%3XfxjIV z=-okSYJRd_Q*3pg#@}%4>cDUI$g>yX1=FtYMHJCRkHl?8R!Ztr4E3kyFFV_V)Dgz; z*je~x9R^nA*Et^8S`g$r`VSWc-b$0JcK0qmImp+1ZTjgs1=DrejS`GYUPx4jp}*KT zjZ))OW%Ff)w7nvO`jP(0iI$7NqK)~V94YZ}C->ko!_}jVo}njJ$yrMi?Mb+CC+OTz 
zp>9r9Tsj8PZdsa6AVR#=@aj?y#^VGL>)qAiZb!jb)K=ymI;y(c%|m)&0AJwVm;|-W zU;!&1zwE})s!GDXtl}Zkp?;k+yhwfM2$a}@6T3ACm8F`66qoEXJA(Y=@~Sn^nsklV zYG$MM%vhh~a#zL7@ANC&sv2_+UEulV*Jv42H2pA=Bnz<>J-4;skAcnv?B$r>7MbBZ zcG=@y{+|uZ-~We-FObN%S)~oGwWRkXB_EoanaR4ly9Ylh^F_q5l1Z0|X6580@>&OI z%E|*@#K6khW=mcL7ow4FRYbd!;e&Io5XkPeyUJ=dybkVD2I*p`y^Wi?rB>!|BBJiV z4s}ilmc@F&XZw{7(ya*uqPtSdp)07_Xh3y}BoyAkx+;aE{Fpiu{{^FdjyIm~;`66}0;vA_ zqGrUoi}-R*X1abm8+K($QS-9&OX>+fj`plo)T}TgZT$%{Q|VMFa1cM!3otI#5+of* zRo?1T4noRNo|-dMu4b1%Mx{H^vqa7&cuNYHsYl{sx_b~2dWJb8Kli>1%; zRb+C?Zy)$&8~`_87GLXm1Rq-CVj%FGVST9*Sdkb-A?)#Hzi&OyM5zG^srJxG2#r>X zZ(~D#Jw^SyOMj-f{(HHRs)jdE6!qFz75y7|N%-AY;~wNg0de<3&P@3Kl_dZBzs_)o z<7R#4EJuGzRq{)`)*1sXrsBUlPtjG+wYpOjQP)sx9}oWbpZLuBNVT2HL`ldpY#K0gnH+Dm0SBU%;SlZ~9|Z z@QKLV#E@fUW$iAzwW(zdxkgKs{wIEzQCBevuJ|em%#qYUavOR^QTT^VaXk|hMg;rY zK-a!OZ@2_r8z%I%u-*ov$8#OA9bZdK7=?sJt@iG2iyVzk9&3$M+OY)(2X~wKhWVjq zw;1Q(8Xii*Nj(!U7nBZ&F7wIZg4i4kdu-{I zF^5-wZJ_?mJ@M;<=gPto`jUB9sIv?r>%UHL;HwbW`~4j_zjzk(*e%$dpgU87_HC}b zUx~PjkA#-ip_0T0bbY4Vi@g#ijlfyW*BZ2Tc1A%YdMot%Ss-w8h7>$`Nl8f+Dwogw}JR|-q8^t={6owq{l(gtD zy;3~)oN1L~{S&;!7>mkIq?jRi{9WS9cvY_)wvggogc=xNR!*^f3A{?muv{TnF)=0f zO*e-od(--Smtf|%0t|TbBe637P&s;b&%3+B%NZILA*sbm7j!)=Fa&+BLDzItc=#2# zIHTY}2=0OA8DIR*rqTYfe*-BQRMxPfYCSeMYe_X>nDG`*tz|$ni0(f$d<2y%89~z5 z8kZc1VY;%|fPsa32jbk#V7k_r=R%ExAViC10YNjLuSR0I?BM#Pd6`+{vtB=y1)9(g z%@Z;XcvIX9DT$cOUP#=F zOP;V)0vDVi=Pnb_xfDK;UOxXiT-+rx-SzZ$7a^A0^`^iM$g6NK(6EC}&u#rnh<#v| zu8|d8oPV;H{!l)KKINmVclzt6uK9`}#7zLPH^NGC#g}V7JLc4KSrsYF$k`dKG-tSwF$*!QMOr#x5(Qn2R%uo1!wW=;>L0#Dznig6+5?d_?<-A)F1 z?KE=Im&@(1?40}rH2>On>QvWehBOb6Rq9J~zP{(T6L50hhT`H4+ohhT6=gqtfd&!( z2$84)CsWdr+c{Q_1bx{uCM&Hg9JSq#9%1({)UFgo0x_KEek4;SKDV%t=(6Sz6ybV) zbaJ8~-sn)8hi6z6&faifk5CS01&2D-01xcKb_43DOv&xbLf={`5H?4t4I}p#`gyH) zDF~Y66ci9pYG_yED_l99QPPfw9z#hNtF-(?r;8|y;V|^wiAJByOxiM79*7UG%tyQ+ zg4wh=gmkx(`|^0>X&AAbe!c_a+;^bQnfv);X5J+adGUz-c7`Qk41rSL54X{3fQ{qC z#k!tn2g)WzubwrmVwq(shK*yTQYpr2j5gFgXJoOww 
zJG5r;ELs0T{}lc>=1aAD?4s&vysZcQr)axk%tCPqj_0`iZ|KsD@S)_PQ2ev9_{R*TrbeneslG=^$6(<{TOFVl3lnBQ z7yc2K-jg%xLtB!7i!gxDkL}YcAxqVQZ+wDX!!cswy6=JOLPH4tz0aL|?kgPQgWZi4 z<1)0qvjyr{dGASqpP9($u@AKmmdW$O1ZLvqD0OX6zG3B&-V~d?*-&l={ z^rYc>Pl>q}UX^s3MN?$bS9}|pW&C*EunNTAeL?Cw$;r;xWfsrxZ_QtVY z3PMO#nzT+Nv3sNY?W{qV{a&`s^RO_Fo1o1d=`wVEEufoT@i^vkn8_UB=N=?3=2DE+ zfoSOl`ts_wJ^gb&(LwOs7w@sTn~nP~;Ch?1s{QeaYHq})n0Eoq!HxN$*19Y1TwCvn zGIu=~?(v(qmHzoz>O%aL3XufqvS-6|4ZujhGktgfXLE)Cj>(mOyrn;>!c~^A^UYug z?x8*yKeQh`eiMIJi7y=20Y-!bl%{T1qmD4-zpb-{xsOk`$3S4+X#PUnFZlSq-D5ZI z3yEoMJS@zb22;@b-VD66D}g>3!$fxD+FISa|^o=B9G8vNz&0 z5s3rfjNIk^>KF3jr}UThG~5)`&=3B-3hP-N`4YH`M) zkWtLO>QR{n&bHW~r=txlxRU*WT_90QBtbYuxgjN5<$aY$60XBH6{MvX2xMugb^@C1o(d{4E| zqAX92zR!>5^mnSj^9!`G==~MBPO9};jShS{O6q|(Mjzf3-!e8f{%X@(J9S?4%uRDp z2PiU8007h76?cPvaS)dO>5ArBi6Fjz0gNrHI<6A7T}dOl-E<^6v=_+%&}OAI=Hn#d zyNn6%98g%NPDDZ8afmW1i!DfXZ+aE22rgvN~^K}WV29cj(k zL9#9VUry3LR?4Yr@X1dxj&tVel$qn+t5WGNU|csYt{Jt}$Fj8^Dzd9&E3@1E3dyCq zFWPU2h>TOTTrM}F0^r>4HKX8iZAV6R)km}fpCZ+?jNOWi1rL*v)baFYp?1mpC9?JU z$!aGPmma`vlfB2IV-5Xu0QR0oEDb&cOxC@7Jo@oD@>*pxaAzomemlnwpkSug5qy>^ zst-uUr$T~s;@$sdJvphUJ$xon- z5OmXBf;inm+{<4{23EdN94vizuQQ0&Cc?-EU}bjJ+nXlOiH9WF_9j@f`~O_X6V}CbXKYU?X_B(KUl#1#|i%H8-7tuy@$)PM1BNwE6h#V)s4Q(4zlCY;)8^uz`s(D|B zPZn1me!JlR3Qg?Nc(2|-izV!0pp@$Xw**$S9%$rnBNlDiqu0-Y1@*eyf~YT21V)p) z_n}rx^3u7co~__iD+a(E+N8n74`wJr3GNKAD?rY&`+8Y)U#(>$I>39gs_{ti_DgE3 zr#D)l!HTz=u%#c4EWnoeeN`2>OsB$8<=Ibe{_Y`;lH(&>S$s-F?pNDuyUhb@F{?2u zlqzXrl~6j|(bL;*bq!@D*vwVWdAT^KiJtiX$}#V%Jh*M9NWDmq!)PXN15p!F`pWf~TLg|@rKA=kUlX~jf!a&JfSfnAW<(1#LZY&1dr zG^yJEQpKwJ0;xlC_J<)#=CzFuCk=*#}dv>l=n$xY@{hI4&g*k#%r@cL1lp zvP8ZVj4NT~_zIWS+C94wh;#`l5!7m#gxv^Xu)L4YTT8lcSQmuv`yoscaUl|Ww0B(T z@}JdBf8Xta`Pa~QTJYS;XP(y1JGe~N zxhyCVlMq=KjyQnkPU0QY`i~IZ#tDzomGzR`yY_+5mGV2fTZHma_R8B zuFDz==kfI3dX`wM@SdDH-U~k&^KVSCd4oRqgE*H2AqCs0m#)#0^k{tg7#}P`UebtM zgT>-M3-}}PkSxVNyZ}N6U7!DNJFWlQUgBAVxujnO^R7Px`^w(bQ%AiDEWPbdh}JIx 
zPQnNQ-=(Fcx3KJ>gb0Gp`Qh0HRx9M44FE*rA#*gIj=t-e#&Bqbs`{RmTeREAyC>i>UiZrsBYmWwCCWeU17|vuSp*UQ^$0O#2AF-mp-%eDN4T=Cr#^Aj{R-+;h z-_l^DwafP@2{?COoaJUey0iVqv-!Ur-&3=gCC~fMRwOL0-6gZ$mg~!wkj*-)P}<>c z;Z$Nb=Qcox%qLkSmQUQ|wiw$aj{|AkqH?A*Fy}~Z(ym&v(1`eW)&r^E5!C2KmCAHR zV-yVd-iNmE`~|@8Z&m64KWq!MlaCFBWByFIM20nhqatK6QwbQp8emLgI>4|kQDim zLdgv(pw@M^{w{1Kt&PbSe4yzw{yN`Hx)61Y+K7c2r*^2!(NWiCP(FxOB`LpjX!jK9vQafIO5iT?qij>(2o`17vs# zV@zPM<`1`?|zL_{H;Tk3`SC zPs8}L6_EQ5pf%F1X+zgu>G|adMtM_c=bOjce(ALXOYzF>Mzsw5_RV-Yp35J*BfNdTi@>0!e!{aHWPp24O|#PD zbqSXi32oAlNJt#MrU^Don6MSU-3-}!OK#{>(xx(pQ2EV%Y+@7-h&UI7wwV0BVt7LN zh?1pg#Q@9Dm4@QazCC(r8Z)8sFA$nu%Smz$L7UxEA}s0tU>QfV2sb)fB5c*~ZHKmC z2L0@)y2~N>Aqp?--a>gS56HSs{2iVGkhu*;s?1vq0gph|79z)V|TT`?yB zwQn>`2AsXA3^NvyNU!C*J{ZR+lkP~9LTjRB;Qp9vV9bSv!2`|3Lfv!+xF{29SPGY2 z4a=#}bn8OJFradsNdi2Uc)lrKJIfSXuqSZzs|W;P2C(8*%Rb0eZFyRZvF#i7&1C_( z(7o(;v@_IKsC%=QgAgq3*l=)g=F7H$VthpX4U|ANi>Pckr%Qnq>fP%A>eyn!;ffG8 zqF)w$pa)daW0gm4aucSw(3F#&PWciheQ7%kU} z@5yLzAmh|qtQmu{lmY~tIf}|4bnxoilPSv;!AUMEUO1KC%p**sK?=(+S_FEn`rMqsZaeUIwu-Bz@EhwR~1>^gtG z5wRD4IWrS^+Pjga!K>!P$w6$5^)Xg|2`O^VU>Is2n@4!z+9-#UeO?&ca{FhC%l zokEbPL-x&%ytl_cRJ!A{1H7jep@4Agwv#3(x1@{2RsB+=KWf0;0?la$F#2W*2 zUm3~|WT6}|3-ue$lR(_t_9e|!T04$K8>jH4dEeL&nb-F)YJa){7&g_Vc_)GyypSCK0^FX(Bs4A zwR8+&hlc&^lNhyRU3q`zBX;ZSW`MXQ%To?`qLK53d#r0K$H1~n3%V{&QP7XImGy|O zyp37+zWU&D`(`c7B>YWYj4fC@WVAwgW^XYgY((7!|0c3twuzN92(~- zs23Y1yBm$0@p@s#8rDbX$E<2M$3L6YCj=Vgz_;Nz$|6b0DFp^!BFsgcPczTW&Bd>B z9L&}F-N=*RN|0{RN3h70-M8%Jn&zbsCF`or#JIIy3PQfUyzTYzQ80iT4nychnzQs3 zh$e)~Hnt9(dZ@w(j4)zFZ=?!Tf=u$^MXEu}?{UVWDh+`xHvReiTQ|{ms7yt#Ee)kl z<`x>ymDpOM1bH!TtmUf^(O6L=fs!-s3*%;7>by`~DR41-5d+F#&%r#y5K#Gk)}y+9 zU%a)G8^%_z5A7fwhx3xCaE8)?zA@3Wq)z%(}-8>zq~Adsf?>heg4mS={2Ic6_RJ}0A_hVBp|IX)Jl%g{kn zgkWl85=s8RI!8m;?kp@b@r=G*mZ-9bZLrIdobU+CSBBk`&VM=}79P=Gi6DRJEG4V92!O9^yB~FzoUhYIlZ(;OWl_80 zDQ>Ou3j5`xh=PfwALq%2`TH)K2_DeA94U~?JJPvatuZ$~O@;X+;L80omwx}RO#16? 
zUIWK3^P;`b;RZbY^tqnGZK4SgghG zeQSS^+gSQTW%(U9$)kuU_+=MtsEpgN+0-RjB+iIzf0mkdy~~~yV0KeBN`~P=l)mWC zDJ+jrwP>c2>W6ORlOe&{cds4-72%3-k)aDnf2E~#^_te5kL^RiE-bQto9Z#HME{}E zb%1(NSlHK`*#llU(RZ2{*|bE~DlHJ~-PhtQC#GKI2xBz*o5qIj!)7&9r{z4Q@xnMT zi0wb#3t#VXMFu}S&k!@OwjvT(QK>5UMQXJ5GR@0NH?CH>KElkIJpb(FLn@hGs|2$c zolV`s!z(F(@@l(xa8$M0aSXk?=U+k6Aa(Wa-3n)-cLF1dXQXSC24uVQ%b(_N5~+)l z;6E^wWU)kT=7oJ6^PG7h^sev6gXrUeYn+$k%TE2(7jOwvU;2LQJ(c&8=Uc)gs9V5T zd(RXIMUA^nQ8->i7g(jf^^_=Rs}SMfA0nI{={)0q_+(oeqx}Lgu`<`X5v%zN-oaM# z(K*j6z6#_rRZdZfT3VarT;ERXmE8G4-~||aP@D51@y_dg#wm~}#|HQH0lk<4lWU@3 zTrx6gW81LEW{>At!cLL-6xHvJR9-A6qG@?&6n|@0z9^kVMpB!0rkbOm^~?(Zjg$6C z*+i(tzJ?7b>Zt^f;OUUOC!&lQUWUVrLBGJf_q3JR$VIY3{&fPmLJ#tq>_+Y#6-Krt zsXH@pw6P_Vv?Cw($TBC-7Tj8D$a9svh~cJB5OWa9Vvv>kR?MdlFZ@7%>FG?v^CVec zV$p0`1|K-ZSdm(O=&1B<+Ah&X&l{Yd9&6Ys!ocua?1d-EWGi!}VB|3uN@0(~xJ?md zBoTb9P*9vX=&j?|{)kQef>|Fc9=%*K<b{=dy_XYkd-wU*h2R!cfE6Z2zOBrXq2-@sc!}p{j7xr!Da=FPK@w=bt`@vx7%b|9)&q!@N z&G}W#F1fsN#A&zMug0MQD{=$8bK0ii(z8yx;1k2?kw||&@aq0x-hC^nRxv*nc9u4m zBIBA{Y@bW$MB{Q3A^7+eOZ?Aj=*X!W?sDv<9ZLE|=vddw9|{(kEvLze_6=n?Qc7u; z4+G^?h932(nP&fC5oh$owti=@V-Uv1Y(Wh&4C9m(k+!36(rWScWv?Eo_`jF0Do&ng zN#5k5x6u4vaE|f^~QAWr@8Gk=LQ&NcwAME*#6^9)y?r@Z`kZ zyLgBAXXoQgUaoc+9y@=EHhHIQscW5=Wv!~x*G{6nwhs-fU~1(@V~m!okMEE*99aaJq8~X!^Y1`&CFs1HN_)H z-p8TeZSMZQ{`mXy3_Ku=t8K_U5Mm^&Km#zVhg&^8vx`(@heDTcO6Cc&iY(H1o5!2Q z&=IDqY{C2`4w9V1v*Ud#%l)6>Cev1;J5Klf5{^Y$2+wv2Z*@+07CW!1uB&E96>$8N zwSIl!m}lLrH`-*G?4D)X+w&`L;z-+j4w>kiE`~^gxp6takb#>pyWxKM)WdTJ2K)6a z_>V}I$iVGJSdZ&+tH`Bx1*p^M^F6kCN=XksH zdO)V`hD7p_&P}J@Iqs^zzT8C4DBQr4qbvevT z^j^Hi?)j8G?b&-At>=;?#11g*`hZA#okbpcD`OfB+GwT}!=LZTKkA^OrBOY;>n>P* zq&SSKd|Oy!ARw+8B9dnhubbm98T&w8L7y7-c@!8!`J9s`tP8V7Fq%0&#_Cv;2>-Rpz84#{=RB?rr|L(6i81~#p-(UvlY?&~!R-Q!2o z)o?(&Xzjw3@YtHrhz@G=LyUlZe^~zu)H`keBX%ygDT@4s+BxY)HUaz3ITeHVI&}8m zp8(FiMO!r%%I%U%8xt_T+_U^i`e()Kzy9ES&*1CeFp`b%OImm$=`t!r&pO%a!F z83HcDHap(DU47WKU(sa`+V{A+ilCYIXp#L|w63Q0kMhKki>XR9fcM98+AUa!p@sv5 zE8!Gfg^k{Yu0#(hWPSvf(yqBaz3kqKyb>urC~}4&1n>WG_SR8Vz1{w(0ZJ($sf2)Z 
z!&e%%fJiq;HwZ{bHybIDklb{)beEJMuwm0JAiV+U?(ja__ndpr8RLw5-rw&Z$52r= zYprL^IX^YE<^=Y?-#>DnwW(H^TxIX*7xHMG~t;AazO6~?%k|y?9r7V8f$66iUw2Hvj5NrW=f& z1+u`0iy@(nz6&mya8Ns?pe_O zii^ozxs@QfcZ7-U5m5y7sICuM+O($17C_5inro_aH!pTZJB8gJR<#<%00p ztd(t{@<2vU0}a>ur7~>obzQXe#uLCoo)Ay+hdwbEDPccY`8pKovu(;|45Uy~jtgNa z?W(>{Q|OVB>m)0RUDZ||0tcN>s!@@g7y8CQM2<<5H|+4j4K5{&U@&M2OWX}{ck1Vo za_ylGMz-7YDEg~jRp&eOCh`3r+(R~S2C-$yA_k3*ob;QXcaJjUA7O%Q`n>yU|N3zK=y5t8i<5B-j$(S^iml zit$;a@-(X|+>}s-xd~`@4>MVHwp^tX!M70fjX>Ei{*Ic>I|Ne1y^?ne={o+{DKAP; zBN$*;$6QXW-tVE;4YwkbNugh78$OO5z^O8_EHN<`er4EqU<&A(U-Zz{I7l0y;r|lf zux=aooU$>B!;q6CpJlk%T;KVXoZz~I*5+!AOfuYK6ht-a)2Mw!VyWS>8Pb0|0D`+d zWN#lH59PC5ePQ1UT2kH3N;=2o0i%*OSax?$SdSWmK(xK&wR0%Pf$FS8Q8UIu+Ig&f z2Sa?ME0FU2B6x8-y0^Th@Adue!^LfBU?FL#wk%kUl)eDyw5QEAs$9)Jf!V4mkLv1@=Su=zyx&K?fS+8^81s{eMrXo(7PXCH$X1&uzq zkFiRoJ>I>HgV4~jVagtNhkBBy@ya|%&0{Em3IlQ+8g=)K{65cCN%gm?ti%aI(8StCel*YaVu}1RiG{bM2H6 zPWmax`cAUwZ8DbU8UH(@IjT+Mz_(OYVo)DEU8fH+!4ZcL(j@0AN@dlo!0Qdh*-N1D z1h{gqO&VniCI-K#s>X)ObYRQA;i~DO2}8Ck2rva_Gdzaz&M?^pCVm4W4G*;P1wy{^ zz!P5-ous7l;291~wY6QDeSjA0jd@&n#Wwsy#V}0SS7B0;HN7P!!F7^ytD7YcvPH!1 zk}x;Q^KXGfcO{HuC5%Fqm6J!AU1T}-faoeID>V?EMLObh%BVx-;lB?~{8#*97#mGq ztdQiZZo0n2tBBTE9DpbUY@>vnx=m02cbwzD{#XAmF!E3MM<&LXI1L$dsF#Yu%L#R| zDN@-lB(ZR#$6+R;V9i02K9Fn%G?I3E-Mv@2llPqLi1**T2>s`CT>Tr${2p?f<*`gSx0UsOx#+v_tyg?|(}8e7}nW%C#abAv9TW=lTD^rTa?Z z2SE4PQYE(Pe(u@=TTF0=zYo7LO3u=6zGa#N)i)k>x26GrRD}wx@g?O^#PO3xdzWHc zX9FO=LDVZ|dxh#xLm0q3+$?}x-^xt(egH?y@#wVctS#SSJJs!e@HPlXAzg&fd65H{ zcN!e!bTA3(25|yuH%$ttywyUx4YXGi}c9( zUDinN1%%K9s-MF~ouLlDG;l@6`!p%C8VJ9I00ZF4W{3D0vELB%w0auM9OI-%dL zN>v5mutqK@TMiSpAM;_aTMw%8nBQHCn_OELdA02-O}$?k>BE`=45B=16uk59E7Tfb za}4O0oJqj_-DGd|*aY=B*r^Se&T3g0eJ;l`2FC_@{t2cYMn@wx+Xg7xebw8xKUZx! 
zgs@hSE@1)Wet$k51AC}%ihqpwDrjoo@dCrKmeX~L@t#Brj3@%D>8%;BxspnlAKn;DU0>R ztSXjOv6?^KQ-P>TEn4|#yZLd9MO=!NHHG=-QHmRVo$`{ucx&Kc&PRC-^4JBw62TBP zXHl&3h0vZJ<@2&lxM!pl$T6z?4zDNg>~O0mf+Y_Q5wRYA;9MfpuIX63VVvTewe{Fo z4H# zkO{m-E*>BU)@-SH^<7MPHnYRnlOF+Hwe8OrP-R#@0wRH*0BttT^O>z=g zF9%`1bQv9Mf!8+Z9m%!=bnP0=PXQRAN;y4&;syNbFcSS$b=@DpB}6}FKBV#<_W9@0 z^-rUll84}O#@tHeqi#Om;=6^KMquG zu^BjaB(e8UR60v1(AZ_9fo?yb=~V#iD$4-(M!EHa@6{}s5P8MZ_N|a7NwA#@{}D*+0JUJ)o&>-zi_Q^ zi3N&dm13(akVH%V%Z!sh#E(ZJq2o-s*I-I>kmUdv>-a|_Fom7*_}&ua0il*IYZJip z9kH7TD&Ip5LCS44xYMEmd>Q$`w;ZlU4gif%HY(xDLRA*U3vWk`ltzgm)$6x<4w40_ z0d%+uU>rb|JQU@|YEKmwxYzdt-#LMSgL$vWj ziNfYjni9;%ZHX>kx1;EvXPNxy_Zc}d_k&)Fz_vECwS?&w2pi)YvaD795Ma&jL0wuS zFI!4(`VArD5@e07fZnVIYVf(VnyQJnx=H@35bU^vcNoWH_wqBxEr?9E!CN`>P7}jV zbTok9@|yC=&mALUmENIPp!99et}}W@!xh;cn7SSu1TO?2yW$lAGxC# z87f2efiB#IfdLgQ%d}X5=>&3;rg>*=a&lI8e_ns?@h;IC6mwWZwb|}_zPnBteQ&jl z(cvG`#U!OB4vq2ufSeml#{)Rzmg{%MvIoFJEkG8nk@Gm{lvyJ0XOQSSN2dhOg95;_ zW~ic`J+JK?40L&r#RhJ|c!2g$pON>;9j3cc#@>p8#S9Mbn7nU6u2A3i%FoJJ1YNm; zTy$+(7s8jnbDXA;Jz_KWt7M+Fhf+F%akx)VKr*S7Id4%WU1L?S(=F+ij&SH2#}E1@ zvqaAj6*53}2I`Ufrx@Cs_~^g$YZ|I4P{f#a0>GC&hUHkZcNE#l13$26Q*GTwWptDkAhD;QtA{auUWHJ^34NZo;1F<#^9X^CtX5O=7h^1DmL*C%f>WVKH?;b#mp+9*jOrHzU;@)%n&lpZYp8q35fOi zvEynw^+|~_^mC(X2X>{}y#c7g&iQK2CQrsixR5G1&|viQij@jIxfzo5{E&XH+J8H zzb%;_qNE($_FJX(vio;6k-_@d+fiARO9_$WTaC?$tK^`}KT4gvt@R4*4GSkQtW@6_ESmil3!?8Rmn?>V?TW4$N981U;L ze2c>MXM56KSLZ%xIW}k5;uS1OZeAaA;oV^SsA?|Zar(+ zYYbtcuOPLrF2>Skc;xzWq<{kCz1{z*1=4G(0i7#!)j`s_cqul|vt6>xYLQVlYbSK3 znAz4ILwfT(zmsf#Y{=;sjcpk3YD$hGlOe?kp4uMN!*tK|DnSrI(r2pTzDc}HaZsIc z)88oJ`^>f`I;GiI{@|C>kh#1lP4&)KD?O7VYq%FaI|_>m>qgoY91z~!sCIuScLftR zoyq82N}ic_c{-!eVv#)#p2p+gI$>W?nvVu;f@vKXTHk3MBAY%2QJ96x*eSS-Hz}5f3+2t{MP`%lbpIgt z{jTWKN(;am%`iQ=B*Y?2drnK!P51-hrI-z9AVA`pB|EHt=u}`AX@dxnU z4%|!Ah$bNj8;IoJhvPG)ea^9?1aMNkQRPmSmGH~z%EQs(+wheSi662=M$RfedjYP) zY~fkKh zwZCa0cNte<|H6FX19atXR?VQbzYmN36_ET#6`{$4ofiXyJX62C+o^-empgbSw_eXI 
zj{>|{==nL=6+EZnDLj0__uME@3M*qL#$wK&i>>oavR~LiRS=D?s5BT#2beqm#3$&hUk+V=M{wVe6))R1+NTOn*Z5=}xO`&5)Y zP^Q#PhN^kw*nF}5PDUC*@>dcw!~&b2386*U)ob?$>5O8(020j>nNk)U-!C^`k@GQx zHRG+0RUR=1tf9tVH9ACt)BMm0VBSF3K9yxd6TS_QZ@YsxyNxtsHMB6t)`Kq}%z(@$ z61x2>m2dEMAUlI>0IM|TU59=yxEadk_poy-=pj{lHjFffU$V^}V4HO?{MGoElUhF% zECXS?stzHg4~cuPT_4od;I{3VQMWx#GS2pQRZPhf+4{yX1Vo2pJMN>gt>0QhhO()B zNT{ZCv?J|D-FW<1JH=OkSDK@K7jfl>>rQwJLT)1CtBl2oT9v&COpN1OKgpC?6{Jmi z{D5F-c%CWAUKCAA*tEzcsg^O_$W0vP?URK{{9)MJP;QFlXUyiB$IDg{H}FwgI!v`5 zYsf_wTukh`Hb_5P^x52dJYn@&cjn#-j&^2 zGp&o6*eQz!ovu5B`(5UjtW0&O3sOzacZ>lAMp2`_`QsVLbwid%OBTs22Y?Ly&TMYrD;b{3i^xI20ig0;0=8SC*mw5bntM*Af zb4tzcl_ZAPgYwSx&|ebX!RJ`AOoz(^DaV@Y=aOesF5$7oJ=&pq*z|hsP21jLM4~R~ z%j~nh-G?&g>H@{15jA?!WI{evGWJHZ!^oT;#61Z>)o_5#Ea5N<?Lm=x^wUjZMu-l`G0-0h3OrYWing&LK6gA@s_Y8>(n0)W{=3h5Zd&A3SC zq~oq|Rw~I<5+$Lv`=2p!+U9%0WJGO!K2_JM(67)Et=^%J;~eNS(X~92Vs0xuaGw3L z^iYx%qp?#QZ->B~i_`cnSrFIdy)1P-)y=zfk687^jvr>1yoOLa(t4{vTw>(kM3KCW zL?hWua~e}wd@r>h+QHbl`x4guTh59>=PoQn_vUVY?ky?~>5^Y`1{1ZVi#RzWzicD& zw;Yl-BdrE0nGxuHq_?*nT#$IEQ5X#K3~p1G-7qYIal{QN3%hb)Q-Hd*Y}05$YygXO z-!tZls%!U`Xp$MocaZ5V&&%A*r(h4a{@vFnkAd*|nA@f2K5@!m6K}gmFyD6jTMB}J z8)}`6uMJ}A)>3rUxE&U&`Z9)dTG70w>iVfZ->7H4{oGh}frY@(aJCnFY13w}@`O?s zcR$6S;XswK7GMFgZGm~OF6mmul($J%bMYV=Tl%fkq-~tUrmV~e5!M(h^gO{W5%qXi zDA&ntvEdCfihR{~9QXN6`j}M%v)67HbUpzL7n(Z|)1z4KE$f0)#h-)%E4n1+vam(> zdXtFsZ}`g?SfBT>RMcZE3U{QJ`MAnro5B0eJw|EbEOD(?zC|-`C=qqWM=TC+aJi(u z^GRvTMkm0o`5#NQp7_j@1|0aHZ3Id0s?A7npkxvXCl>om2$^F@^VRoMH~Xf&JDW6d zs>q<`P4YHp(^NglK~4r2lF{%mo&CVOWwe zTDBOCZc{>AFH}|Xcz#P!Vr_X-vHrcMiAtE8q#Bm2@&+$5M~T4L4h;0WIG&1rK`$xph_XwpYofJEytc;{>kNVs7@|Ctgm=xnq`{F;7Fi z@-zR@eluVWz)xdQi5bIzAxrT`JR7{dpx=^e(<9nkBUd9i6RKyRZg6I+V5hwD7SW{9h>rH zWd-F^WKQObKo>gO)%QSUp=cc5NL=HUZN%F6JRXCs9anF!-pdQRl4`r z&HYYOnvgER|2Vj8u}7Wloe)`Lytk|~+Eq(mU%4Lr4uO(5j8b8V^^lR8cFfklp<6O3 zqp9yS4(6e1N={Utpn862Q{Hk)VBtC}_Rt&vQx;GqijdmJFn4~{;;9#5j{Eq?7)vW!o zC+lohbC>@(kc6ox@;s40zF4jnOBhOq+_w@*J_(yk_CQEvCVk-gQr0J$#rt4jme#W 
zBS>{iADI9R(f{V?d?$9Sm5y^jlj3Ah#f~uyJI#GL#Li}5*J0+7l-(9X`x|hXmPpVIv}uMSYC!z_`eX|kwzYMZ&3D2Wu4jm$ zs>Y~+?*69A#&CgQ?s0ZVlG1aER;H6*+@>UJB?8}0fedkQeG-AxCd`jGr5E>2~R6XopXkLxPS^D#&(br>OYo~0Zdhy5A15c-_(GVryr%C3<9 zYKa#nd>I?@5)aOVrO{58M+p|#>q}xP99k@?xjx{bUIDn;>_gO zUscl`86kf1!cgV)*iGgxPta60#gIlHgLh36ul}CG8OIKS@<^TqgP8629-s$2Vz#-* zJ3Fj|f_~*Y6tc7_HlWb)#}A0jj(=_e=&drvdt%Rcz8o*sNO|IPoj}DmZKeq;S!hG; zbz^|!Zj>W|-oi zEMYll{*A$(o~MkH?sA;LbrFP6%LfIFM}AR)HF4N5$B5~zH(7NMtW9ku8dkmlcLBn_ z6_E#u=9?{#FJ6=#Wh=!hI~I* z7FTk=iRbKR;O2NSx!%W2TGyvk{s{bF3w`~uTPGMW3dsX{_53QRMxRt#^;g`;65rn^ z_^mq|_dR7i4P2Ub>YjZMrA+7(?E3f$2(8~HrTko!QJL*!81iuAO7Dyz=sbJu!iJD3 zNORTc)>GstEYh?7*MXSAP;`}BU=lsX~>G1A|XBBgQS1bJvW)$(3BWO4opeVASIR+iEQ`zJOwh8;ftF+_f`1}G27LZ9hu z)iXXCC}zzj4kG$vFdl%MJ0_BQ&e~5i>ka#yib(~q;ud}ueU-8&<}X?2je;z$&ktfA&ub0-$#Y%LWfY92&vMsl^?7>&Z_&VYm@mZ% zrKi*ux~|nxj0K|hv9ReD1-Ya^YF89^n{Zm_v+f-Zp7He|{Rk+y+$TqTyOHZ8g*IO0 z%og(5CUYNYC&+?(mkGm|wOVYzA(S2+F>Qd7PieGQZ+hW`BPP8l4#{Ti@*bx&zl}~1 zJU5dFWBuxc#^s{-nrSDCvGm)`R#XE0B~NKZ=*#yR%85wE0%#`eAZ*AjcEt>`S9J!! 
zZ&z~d_;Tfjsv8SNJ2mg_x*Yli7HYlyA5Wug_*YWKGR`JU9f?OP|6x87H1Sa+{uy>S ze4BQ-@{?bI5j%8Q=r-I`x$YAwyys28_<>#AVOC(4lCQnI!lf=?$g6=0aA>Z|ipaXx z#8@y+Ne1kFdUvNJRt?_IWi+w_tb7hfcIR8IifbXJN59{>tB9KpYO?6cq`Rz4%ZTS!h8=iG{Y-)DVWwr8HB512E{++WJfX*f|aOs`IqGtVPI*O2nMF6^+Ncs*(9bOjUbE;>DdLXrKrxIhHvGC4v^5 zkBUBmB&qJC=FVRSfZ}268eF`;e6FEFnNTS?ahhPk-K&z{vbEn(J0S$CRmsQe^M?K` z+WGobpX8!oGJ`Q~3);gG=3Bm%g}AB6yAqiGJiR@>kHr|z(?zq3$Tw#98T zA(Xge>+6}~`Btwvlj(LXR(g!4$_?>_DP{l{g~wPqCRbhT-t!GOQk!CmRM_PvFWhjE zj~>$IZ9dzy##8GnMok*T<&p6n8-7i!A}bVA;5LH?~y1AuBR9H1;Bs3`dx~Ck|z~pmmap^i+ue%UQgj>(iA-EKzZz0bRydeuqVm<7LVD2!V#_*bH}qkm%{Yb!PVM1zI{+#+ z&nqydQL8=HgVJM9PVdjt6sisD)fb~efY{~c-RG3by@LHcB|)&*#RH%rwO=tLjC;)Bhs4Jj25dTgH!@^^pS;9Nq%Y89sht~uk%T?_6TIk}& zcjy>POkQ!T=mh)6tUc*tO8Xj)o~3BDpW4;=*H2IxrcdSFtRm`~`P+2&I3g;W#3fu@ znT+y=)4X)7$HY>a{raEPR?U;=7kmy~q0x-;`&_}S)U&D|0QL}Jn!^q>yB>85P_HxU z7g!Ev-=&a%>RczT2JY%KeGoM3aZr0g?;Gs-wPl0i%7f;SP>p!BmMPN@aTTi98MJ}o zH0tOHFwdxKF9)^&>@a=qO5~B&IAp`VCsTPO`0GA?Ow;g-w>?US!;tsb}jj~3~|oOyPBD~jiEvYBSyTt%=JI3 zg(y3tQb|Pz*5EU5(&`}T52DnModjzPeVmhO5?og~QZmw?u+CIqwApY*HKLy@QtVA? zh*Al$Mu+*w)TzP{z#H5za@1}^`Kl5uhcY?2FZbLc)!~g@$Xi{d5Sca*ufNT%ZZ_S( zF5HJ;AZ{`_?Qv zf|NL4+9jZ|jTzg_YI`@_kPMnBRn@h3LC4YQ%NGB#F?a(opjQg%dv#@yCXz_u|tDrZO^GwW;$_$#nFA(K$X@UR9WWjymFU*}RKrmGD8? zw&E~$ohSFZy6Jb=lnQhw!*+-KAtQezL&=PcBxQ&{Adu62o{T>XCpc66*r+BSwWy(UGGIt8_UzlWpy3zIXh(vjK1TZYNBPHIi(9eCDZ36tq}p2R z`wIk;+BmuB%;1DWbr8n#L|+_S`EUFIn)Jp31{(A(*RKG(*R;2~o6?;ZacyG1bgtJe z!JYNJ%yy>S+(~m z9&vqWS7DVL?>klQRO@Mb?17o5+s*N&?8KkRlx={|!zxSyI?wH^G4+5G;`2w;{Cc@y z1j#==)HLUlhB^T@aLpzQE=OY70@e(LKC))s6N8ZkPJoJ$i!-5zy_W3sQj=ag^O2L| zhFk)Xk=$^+wLJ6^c3;G;>1zL2intB$*q~!ePEi$hkyl zSz^);qbsd_TYVku>J$&060{kPNOH+{W%RGTbq__yw(MN?$q>aoN_~d!n-q53%x|aG z6(Z%pL(dJ?h`Cl7tcm2qHFzNqkNbbjYKn)Ntb){{P19N%|8iLS_=z)Tq|9g84K$&w z-}EH0y#?h*I>MSsSAT@_&zR}=u;wK~CO1~z;j1#{rV{3Sd?lDhPkXH5G&5ejKinA? 
z*#|B&*Iz{2G&B)yGVEEhk}F4`^n)s$*kpT+EfJWe5aiS)Vv+74Nc2Obe5HkemLs_^f7VN0?-U(RA1k`G&PGya22$a^&dnNSPmSC`rSEbbeP^UXBMX8kzZ(x740|~Lc!v}zL zi=XN;{zJ&PL(j+8a-N;19MCEnzMai7% zA9uv43d#DE|NaLQfX5yq!te%Dq)t7gafN|Opt-Egnm&0pcw6~0zzYEuIeCYufg?&RICiCB8HLE7d%r7 zl|u{BUj;v}fvsgpI3)Y!{mNFFlQ=19dv!KLm3VB9X8T)dU9SO=tIkRw-a-<`nf;o_ zEwL&tMzQWwTA^`D+QoiZ*<3L-!DU?fG)sa=!PhMNtq z@Blr_u<$BICqNG} z9#9oUqnL!wnwHLsS$fkY;;{VMAzp{P@4j2FiW#`0LOs0yaQzs0n~jI0a#O3FHw`O- z9%0*2iHNrRRRD}?J#vwSUoRW-ElHOSQ zcnyv4DvwAhe0Csw*(F>HWyf;N?(C|zTT)fWO{SVlKb{^IluxsBt-82jt)j<@-QF&g zz5J1|^1x>l?kx(kxNlcv&k9$It0@uVUh>pizYRB84H@RU`h}JdZN`+TuZeJ9^zy}R`zX680K3pb4A0T6Um79%sIsClJ1IxouP6t*ZZ6=1z$zZ&L>!b zS+Yg;W)4^@1OAG*qzCkUgUI zijXa^T{MQHGy5aMLYkz<)CgrV>Jet@$V~Gs(c9|U8~!y)pJJn$m}cXQIgnxlx~iZ9 zRjP3j=*B6(z?;F%s4aCp({q@x$*TNvo?c3PwX5*tjR7@BL~6&OU?(5FolkDE*Wyik z@O)qq+!amouKsVJBqA(m%15373FLIz@~eBPZx)~I(M{iB*zh*7ivaVO!@F zyMh&M-Dj-C6VJQXQS!VoR>C!gct|Dl92?z7!i({pm@nF*eua$h)kSE=0h`uZq^F6Z zDn#|v3Re#8-y1I&e0jJ;U+c}Z;%~oEEnh#RoXG6Rw$ZEe&A=kY;$Ph3^FWl~AO;tW zP#eY9H*YiI$yg+L5%U9a?P@&3OiA%PzSu*K`z)@Ume>z-+vONvUrN9iqMN~Hr8TLf zz~%Rlk$m$r+n#0rz%K3pWUsh^<>>uYHpGTfx9rnOO=2V${-8in^0_6!D#mrKhk}6# zAiDxLf^b<=Sb@*VLp7DCRAyPD&@Ih?fzJXOR`vA0E$q=hT>EQ3ANb^Ib8Lzojoa3X zmVE&3KdZ`$?<((gQocRm(83`Q1dnhU9f27<181$-miUjRuI)?A-Lh@VVJt@Peryq- zK1=e(7<1K%+7FrIMoGTk%-@Jt2|YlE(>&>DWWf#$7aI2LUP#W{cpLB-45Afd?QlEV-b!aS$kiETA0g_+uI=uSmB?BoQLX{dPK$;%mPuC|CT1HDYr`=S=Pbn`90eI|EV(aNcD&o&Ug%Prvg z=|C8ZVw@+Q9K<1ScdXnNO?uf}T<_oL?RBs$;NQN(wr7!;V>c|n9Qt``AUUYqjaS@@ zS}1r#3kTw~t6!TGDWoVNeY33UEXs%|B@Ygm1~_0?=^FEj=*!9XljTcx9N>rxwRx1ycUOt_?sq+*#0w=%D$JffI!#CCGt0mkin#HRACS3!v! 
zzdxbpeS>o-gva(FwDld?T=2Zu`U-NNu~-mrtj^S=0^l?&4JAw#U=Fe75>UZs%=JO?ed$%z$Zm!jGLMxW?L+$F+*nm2LgpeGRz{j%KjBC_=>j0sRNP zGQ5gvt9~d89FG{jL%4*Iba<^+-jSnZOz@yUMzLROgI4+&OUvv?dkkKjE5`){s zr%bRCjE;sV?uEx#)X%xn zWVzL^28YYZM(urz;3xMP39&zbT}$=P#dL=qH>Vap20lH$gWJEOCJI~7!eEyOv=IBX z_oss31zcNNu&^O_G62~}lqW&>tS|rwc>8XU0=tCLUvh`sj@_PRl-xs;KYgLD(YeyCPiOB)z)$GFBGeN z$o+K&3StuJl+_k7;GaGvGa8QFSiJXAism_7cIW$iK(Zt8Wan^LMt1esiXqK8=9$w` zv#ikQM!`4JE0Q#&>AfOC;a-Llk$9bdl|inZ$i}+}QW{ok%dMxZlOZP&%IDnB!ix=4 zU2UU95*l9no-+&moh(dYfbrr{NUYxv> zTjP#8!OKHnTTT2ekBud%6LeG5&3)%_tFbkzt0g#8S)7Gnq$v(JjNKz@sl=aGR%W8% z73J2XLu0pzFyr~>XyMP^pBsn=%&@lWjc_p%-Vo^F%ga$YNxfk5En`u)B;t|VezL0Q z^2lCh4>f}P{Ngo4l>r53wIk!TOhUplqSC3}QBsAymLhiu6t-(LQLsn$M6*FPKB$5s z{W87Z!Yx2`HMXu@5Jrk+)EWTd#}O_49uw`pSm}k1`ciGyM@LIKN)!pfYS>r2B9!BL zx&L`i6CTy4-uu`kuvYu!Z!!{N!1t)y%+6gq+iyzWUkQk(XylFI;0q;~1vk76HCB;l zP5kX1DSBz?3Z%G#)fwk~_a1G85uo2fp_Z`ss}Fmz(Xr{j(lzVJ{9x#3;lY5d3|z0@ zimXmaVMZ71%W0MHDW?m5UfCmc$Fj=9MRc-reTfW|5cn8bh{qdf46hV*6kpZ=qCBY* z)n1<_mbVmNS)xfgWJQ+quc_%u=(p~Q7R7YurU}OkfetJ|w|&`wFh;9$83S9ONUA`l z0k1MFDNTn!s^hoU%(r-=($v=tVBrj}&{E)&7w@5~iDHCgCuL7WH)~S~*1#5-!Ik3x z3DJCQJRFL-wIgY-=(LZXqy94XEsXRBstz)x4|@|>Ci#`uUcD9UO^QfZewWa|h;F6F zuP7@oP%l>>eoc@;u?4(SrgAC67sh#8AMZcLc$c@Tw~w*<42WO;bua^hfJ!9CH-yRX z?lX#(u1%V~cOtvL38Qa6VagYJZA3Oq-6R^WE3CvM8#1%YJ7dm)EpyAxLOiQ)s2W+I z8Z#NW?Gtca$e3Lr-*t^{$M3YpZAP;K7Ih%8sXH47?}~($V=oK);jd;&#dbY%HuLe^O|HZ2O*t)5qp8(uKpv6H*{*<8+pdH4l8* z2&OOQRLsU7yP|O~#uis7(+T^=Mz0WWyCq8t0lo7Vri?|xFpKwpl9CPIdqu|axY}1U z;&Smlhi_S>{@D+PAY~7D`Z*=SQVZ&6N2m6JT=|1Gi$YUnrV-wnUCT^X1+8$9P^2Lm zw7~4%T>~M5%e3y49S*P>t0L)MLZYR5n|^Qhy^A}2?(vVG1@P>$LmPwK8fEz#mEx?s z8y-?PD~kqqpUbvBle{Yf5XrR^aqNj^&0~jC(Tu`F@p*tgvyIKLnQu(;CmsKt#OQ|< z9VJLJOwQ%wcpyVn-uZ~69{mFE7peOw9Qs9MN3idsxBv&q9vYmLaG)^%$z=Q6xo9oe zvAEwny6pH#g8;iKOWU$$oJk)%+1~(r|43>h_ye{G$yTt{X8I$=*6nCMmri#)^c2Nh za~jy&f^+kXA)(%)0&?4-r=1{WweQUrEKNz#3tqEZxv92zr8Ql$$kY2>h?l~eJAXUc zBfz@kew{Vq5B_tRAMr4xbT+8wNV%1|jd?YUjO3q-`EhT=a^>1 
zK@u>_%{jbqeBlqJbo5>0$#9r-)9S%tst#NpU*a2+=V|QXB7FbTxLr!T{_$m-Iah$6pkQWxPi4Y!)n1Nr0ANjh{bJPrr4{nkQbv7yB ze6lRpnl|!-;ZhM#APDX=_ut`Bfd|inQBZ^T{mK$&Lini&JlN_S|UdIj|V-_+#cU#i%M z?}mMqZz1X0k@_RJnP!@4P9kdBWJi9cL*hEO4cW5UQ|Q&6a8<7}*o>;K@(AkaS|$Vd zY4v;`!lL{D$5@!~I{7`Wi?POA1J%P~xiVnSQL#9Ojhh+Ru*S>#LJ zUZOxftZI{AA>7d9l4L-)s3~Lh8@qs9SH6ruSmBw$%Dr?Gg^)+Qjy9&ugXYV|JD=2u zw5lz|ml*QBOYZNRrlHa4@iNt4y=7FJw+m|TAecl;!hJ{EKyLN3^g2J#nM}eYEp(u^ zxi2|tNL{b5$7Sv&Y+%U3?(zKEx>{Zxzrom^0`C}mRS%6~NEj^%sw5cgKOfFbSHy^8 z5*K4q-=}Gd@>f;SI^z3l1J%SZa!k&de@^`H3xXC;Cy|k=>5s(Ylfx6=M`qJNsvVQ% zj(Gg+ia$lq^i!W1>!#zQ)l<<#q&ef11cP>$ykID77{*Ikm^1Ht(P#M1Mm>q(z$xf< zVZrGAUh*FOFP9$eC*JwUkuXmxgnRzDUsoZOpB=)53_b5s&W!1l_TC(<8p`@XI~^yX zN?MyA@cN%6zh5S3bqWSkZLS=)E4KYGewPpY3|JTy#`vU3 zX?=q4#I9~{Dp0#V?O`v6ofE^s7tGMwPiSbGUx%Zdu7B{rYB*T6{ig{?PA>*2du%1A zrUh^&37U?pe+HQpyt5!BmZ!uRZ2g+{s3U&{G845*|6vZkBPE2&Bbi3Uy|6GC``F0R zTkom8+tF=NN;(l8m$P*g8MkhLAS@h-%O3uq3lF;hRO>4jv=U#dE&(ZE2Bu$Le(TN7vjtK-2~w1oxg$=u)-qA+pTExBzjkatezfzS zyiY%WT;j%ia>VrJJ2Hr%JHVRSn}l5>u5+U2lSV^ z6u9JXaljY&rFLVG zy2CWW8u8a4rY{BdXo}jeBXB36(60&QTlY-JNbdz;0PEe2iZ*c8-X12jnR%0dv4Hil zeUkEPt_@J{5J1g5PenAsNusfJg9@2J39a6N*q6j6-u_l|K54wK%oYK;iC-$M+<70= znvYAOecE)hA)&6${r~d6GT)$qrSe(ZUvna4We<88GMo=ST{d`sP2eazWQ+Ox>9G6? 
z>5~%oLg*FChw9Ww6UY9i5XsE@_dq0~Q#Zc3y!2DD|J|mK!;a=zT3|{HdyvpfTK|_u z{O`{FV>CZtrhK_#VFoVY|M2(!{SP~7qM%xqHj2da|I>&2uNM9PpX9TAen93}g%&%q zRQ&zH|JU#O-#^3O-wl)BbM4f1(n=V#0vR-j{N(pV+hASp!0l;$TU>kOC28Fb6L(h%8X_VJW$hQS>9csFQLq3(^m-jOPV_$Rz*oGhMp8_>MbrT4$dT^d1<%B`* zJS=9%>V`7s0EFb#6px)?TNz1DpV8NZAr`x-4^1dPdXs@qyx12E@6+mx0q}kEo~qeo zs83v@LbkR6l3|w-BJvWA0RJmZhIjB!{Q8jZy8rwSabChGR_uOm#H^C<f0v&QIOheo^VPVDXbcrVTUH$p7+c!kN4I%en{XAd^D`~KR!{3j*B zt6e}8b2$lI{E|rQ7u;*KJKy|8Xg&!%`Xcyl&Xx)NmR{z@lG?lj;g!8W2a_1UoQPqg z`|m>Yuyd}yJ47g66Q4)XGS(6E0V00q>S?$4THuuL2To}sUAEtGp8!v#pJrg7*uz2J z2sFu)pdKqDy;^w4u=ve0BT_B5t*EyTFbOzcsHpPW0YR`CIP^&{NYMkSPZ2>?WdJKf zhEnE|B`rLX3V3UH57LXU+?(WDiw{dlo<#%XynY^~LfA}5UAF6Euh#;9vk>GE(pQ`j z?p9r)hmIhlW|l(6XQ6@@&kzaA31~xSMOiKAA;+l4NKtT_OPQ5rkalU?{#5jL(6n%Jfn^FZoE9iRyb*8RYxe=4+H z!=`W4y6BUVs$s+yXqSF!^sH}0^-vo5Ix5?0G4fkQhB-1gxgn2UcK{x zE_rs2NS(A-@Bbnn7yfvA>Uz6|(^#D8W38!)M>dYd@tI*mTS&KW;NMR|cLfA> z9k+(DUB3!L;l#cQ;qG?s!y0J@&WlP$Yp56z(yg$YrW!^ab6tu4UQ^AJM%Na@y-`&r zE_R1?%8SUVmp|!|#^#py;Xrd;20V#g2UpKIZdPpR4Uis{=tIVMl)@&y-i1v&VmYuS zkUFr7d6qa|(~o;%4p6O{zgis~upW~j?pZiE8kXjfm{JjF&LxQaRO}p}n2s&YJ2<<^ zlKh0CTMQ9hQVJT^F>_lYgt_qdH>&pkDiQw*BP)sbTAVuw(e7D;<=6an>QQL%&Afc(ybN zkd`=~Nj-lC>r>#;5tkx~q|4>I>yl4wwStT1MX^nLuDdN79)`dhw?N%wciuG*G~qA% z)e31=hY6d&$vz4E`12cTmqcPD9|@-c-0-8@Dy;+KIB)`_NYetSx+8>$S?>#P_YT{U8$I468V zLi-|=rz`>8*4E=1$n-7iEU6`7^ZHbJZA z9P=bRS=ZaHN+qA{xg}_CDGcFEJ#OuQ=(^}d4Ip&VPIXTGlu`9Z9rSf+(aspnC}E!( z>rfa&8SA_!W?QAWEe)H!yD*puKUC*dhqNPXV`P;io#p3%B_fkP2R`;|fGI)V1UR~V z2c`vU))_ml;0DIEE*=yis_7!?br1kE5yG%w$qx*=6hwNN1YNWo7na%Yxr3kk{OTOd5|MdW;MfqSqY462`zjMUgo z8Bv3D*I9$u;3zc&tyM)M7av<|}P zU|(vSmwrmi7pY*S#T(|l*ZQihSSUtb3Ex}=sqOW4^@{ZIW8Qg zTb9bPTTg?CW@|?}yY6|CRwrBpp#MW-J!QBTUaX;iIS_s^e15)TyFH7puGFs*vUJ^D zee@5F^;$y79~$djlz(chog(*{A_FGY`w0%gk!QC&3!dCyL!Ghm4Kq27HI=hcLu8x$ z8usP$VXKNdvxbgtF#lJ)#BWzS(&Zm)()}@-MSCEmR&tK{@P?Bt1KVn(4-`E^SoD>A zY=K!H*_E^$N0Y3s(EO{)sV|=6SrI;jukp-VEvtl+AKBgY|6b2=E;YWaO~ccH{eiWQ ztzWI!wf?hT{rg&tP2#2zE~!FVXo+b-zt?r)XVINVEc;J~{B;Tsw}9;cbL9gsgWi+X-o 
z5McbWktzC9fjCfJYh?Y8@*2Vxyy?SaljH}TwbrbhXOO2t)&KXdK#&JvT0f*_TCggq zdg%o%i4T5HwaHksf^};JAF#3gT!PW!Q>t|F5@S7?ptXYrJGHchN?IrMp1{07`m0t0 zT$G0|Y2{?dr19ijLeUlWMSb7XK@{6oO|>u85Eh$a2j{rFDqF%e3z=~i);Ee)Z+^6P zAbS-_yKKl?V8xWStHpH_vWv1H(cgU4@DJ63Fyqmy*5e2+shzManoTPp+4Vu+n;a>@*# zt2N%Y8V5@8*%*S3kuwl6Hz>zW65{)z_k8$cw4{E?`>-QldIH!VPSG@k7(tJ>l7N2_ zgY1!wgP_2>=MbVl+?&A814fXAd!nxX&OeJv&nVuJfXx|e@i0rPb;)7$;u3Tz2_+LR z>g*_`FG@j;*v;V9wNlKh4yIr;txdl$Cq?eMJ-*cs;-b0)g%Q&a)S~SbulI^8YAis( zF#>p$e~~LkfBE<(&gp z8RFJ+ro3b$yz{d0tuwfZ9ISjlT;$nYrCtVew|6P7&iW%aqi=sq^r?LM z_d@2$SLEphdcH|JOq1VLA7<=LGS2V9PwUQ?L2wd_o!xMXp?5P^fuR)Vx+kT1OP zsrZjFi<-izrV)3~zcDA2e~J{Q?CzryV;F^B*Ig3Z+We%opBV3RaeEmsRe*w2U-)z= z>s?wrKZGMKw85wP%0QOLlwW1mty4LiqxgvYT%0NeUBxzBoS5pa?DzF|TT)rX7~<%A z_fmeQ55PLGzGl)^7u)CAK^wq%E%nA7ndYoxDZG+E5gDJV1laGx-cNR-W-xa%o^2k) z)LlL*c2fNfqLOi-J-B6isqtTM;{~qQr3?F2ER8sFXb#>w=~cycGGQCj|CT=m20BWO zhhdVkq0K_*Uo^g!VjYm}&$fuUs!$Gxhb9FT&2C$!kba+1iFy2{;f(rMvnuBxwhVo4 z0~i2E(RhooPR)ZWV`}h@DD#=+tZu#^ffC?686 zWJkS?ehanU1>T-^hETF#-!t27B`W0{o_y{>^yGMvCi|m7QK)HtiFc3sP3{OO3ORk| zqnDqKR7bGut&bTjOR_TyJYGmc44XL)o+d>%i`Gz-FG;l!dx`1qN9#cc7(gG2KSITWp{MHNzy?^Rd-WL%MSrk zig2}>EUeqv2u<60+hM7bBmUmjW?Z(B0F+85w}f#?;i?Gze{*IhVTX3Iq%Rin?ijeo zU!_@m;q*XC6~Ggwbcmb^qTqKSUe<4im)f`3E48N|o$`+3#eF3rhlVP5zebim$hmkv zTpyWpntMR#2^~EVP2vJd}?t@BB+@z=T{%1oUj~a@ed0CpFJklp!{YiWB?(tDgs)o`0&dK z+mR!u49%Doo{(2+k7e@Rmy0;CUk)Fgmh1l-{r%SPsb6tIHbj7)5>>Yb$4iO5WKV0c zF43+Xk=8-KE?!ML&fVR?dw)#YSb48Allo8PFv`HJADw=sMMhF=p~}dT=3qGRws1~u z?bYz@EqnFQ@3qQHcAv+(URBBpK7XpLNakn$1|MpDfe0Ol9?{X&*bTzfQPF|kjYW3A zq2d(E{;E=M!1BPKfc_n7i6sN}q2$tUU*ZxWp_`khY`#TBUzVvU_E-5ouVz!c3`z?N z-UPjhQoZLmZ8cDgVJk=)8S{1qJci;Od*v>q+EH53C${?>sh(|&cj409%G8tHD<#OZ zTRJzyEt>%MFfjEq1wzXH{f45(Va zq|p2E1b^2Shr1iZFKO{;6w|xGUf@vhV0P#}65|<8HFXcF!j&8gIY78wQm^#QbvrQ%xe;x_Kr*BdUSX=QCQ3 z$YMqqiz=>64RB=4>2J7$Yv$;j-YH3Km#pbXVrwWO?Vi2sMdWujCLk6Rtx2h;`n=>z zU28{G*I~>-Tyq!fi%{j>3bTgcTe9W7;3b{+Z^ZDDWU?dE(EmG4M<>@ zYz%P|ZTBYeHnY`IE8dvgG^Vdi7IB<+DhUjwM2315$=ZHXQ>k_4a%-*p 
zK0;zxWVi_=hV=emTXgVf&H>)8Rlf0`>Ris+z$9r%{{Eb|%(GJmX=C&T1Ut<*Rp9O9 z9jw*&^wB~A9J1N?!pv)`!`fg^z%;?02>jC)!zVYbm$MWl37%+pDJ`Hy$bZe5bIQD( zy?!b#8xzeM%7xe7Jf#L9a{pq`>wQpWCaEhw57L(7u}aPC!}f&7ZO^;xSgp=8S9rv~ zmY~|06KgsF2It!R$pP=Gn&|l%UPRy*2sEv@RfFsh3&MR8dd805#+47}Wz1~lU4fnU5mV=-LblNxS3m~+_QE1c7tY%CqnRes%I`OT z4J}G;l3R@1GHY$M$0M|`|J1!og3Z2Og3!IrnC7O3J|1GDwpfG#G5L>2_|*Xr|=R*%OnR49?eJdX$QeZoOu^i9HYC`pCf zz^5kUd6-y|ra>2#NI%7Ov?(s{rhV`a(-?%vqbqOPK2#JpmzGKKfQqrZy*}%OK^Xav zbMDadtMXcywog6VV3JihI2wj;f1lDVf1uw!LLGe#H@1!kq?JF&?E#C!=<1obN6Vcq z*#D3Ndc6JZIHBvaI(OsrRy{C+x%A88rT6Z;@Yv*16z}bWVH?ZT_}0?{MPU-)glF-$fqncy$DvqyIwTS0*lZiVcD>&ycOjK`1xs_{o;7h zD*4L@^=LKd1XfYmS$rA%9Mn>1420s#L7tA}XCAb>KU%G+=}h7EZ-HB^Dt${*+i%-Q z_u`-JQgV+ZL{-Y33Bc>?m<~HIRUobxU*vGYhD#S#1%eBJxl}~INsZ&B4>V6aY&|HBY(~oqV zdxMW+*zt$bRDD0knC8R8lH;dA2;Y1Ch<^xB)q@-jo+zFU;{QW^8g*|^Ky>m%OR=BR z7HRVVv1YH6AYPV=!jBCtKzbJq4I<9#=7aU>MV*A{>u~P$WvOr-E=ScAc|b=`39cgk z@G^MJ+oLnb7!Vn#;$R$QF$q<{4nu|oy~qsmhh*^KWGGg-{-Mdv#8v75$@X_1;81aO z=a52x%;IqN51GZ5i(*X>iD=|19ZsRyEur5uPEyHscGs!+BmApk5egzL8LXoKV?Uzg6(B_}np) z-UL$1_<+C6^$bH?R&gXfUZ(tcc)5O&*@N$Xl(1gndw8>mc9Vm@Mf((XF#9Y{l%vHT zYYM0yFv~5pIT90A!b(}7EtozY_}8d@CM=}P&S!g1dZefHptH`l&Gmj7o8@^&b=)L2 ze`C#z0fUJ+q3%SYI5y+pf7n_xcP<&>j7cnir5*V zjKB9h>RZ&^Z?ulBV@{ZJ!Z$SbwkFH)Z!w&>;yI^M*K&g%Fp}fB$F6)e^o9WqN`Sya zn>$bDO0Qe|UUcrrRES>6U(fGI{~$jIXxnOUpf?>x{Rkw4#mI!rfv# z`86Uc*E#tMy^?B4uk2?W37~X~=qTN<(8%frMjA12ZJ+ilh{6df(3U#S#Sv3qI@r|~ zNfh#QeW2#oO9YMHTVwV%XqNMpm1fv(1O_8y(06GEJohBy9%d}9CRTzDbAYGrV2c0J zLzZr2%^;1czIn|qFf{Is#=R0C(&l$cjasyyW#evriKq&`E)(*gE!eDE|+u&|imE6Ex*Is9(81>O^A0j@!13LhefKlCW*Z#ZZ_tXo8l1MHvqt&p1RKg#XR+;ukwl%{XdyT)&-u`#3CCjhl^CkWa5;5)+ zTbzTiy^B~fC^wwY4wpBtv16ggtYQ3R(uW1r2Zy%0LZz4orcol@WwW-^47YPFw3LzI ze%azq5b-jLTkaL%Q_Pa9h=@%9!ZS<9mN*N@cb`!zBxbgErPNQK1A1o3a;GgX%uz|H zUud1J4Qj??a?r4}>~YYFLYKc#|HFq_{p-;$?4>>Px0j=$QDRO2k1Rd!Vz%C6(QW>M zAt99#vnw4#^g?FCzN+xxJU}l-1t+{bB2|I^`bdB0oZs;9j^ArD2v(1~iXi`T0IRh` z`Bbj11Ta%K7=GPkKnR17+%`ik7-}@6(Bw 
zc1|-qzy7Sc^@Vc-75Ql)Bz|m1;;GZz*IurV8cC6O_c!kMI-@#%KVe{Tjj>zyV?bBP^Bn z2>uvfIjSo^obc8Szof!&bXQraT%ee6bZ^?ae{zUA>sdDvoS2X9#c<&NEh?2w?n)m~ z94=5J50D1|=obPK&W4!=Pdg3Dk{rHyWpHoI4&)J?=GdEP>1`%dBUEgafUV4}!7*h- zo|R0|9VHoyuX+xgAcr-fw}d@VNvEQ>{hTPm^c3IeN~HuE_Ft!Tl>E~Rpq&U^hp&PJ z?LFiNxp2Q;S&knx;#5WQ&lO4N?<(-6BNTZV{j0ETkRDB{R;3SFq7cyhNzRC#*JMP>cj_!k-Iy!>||%_m#2V_C=mFE8h(3c^AEQap2jIY|8He7+=l07YW)ip zIj4T@Ui*aI%NXJ`DipM2b8$i}!N1hiu!LU*;i1b)!hucr`~ig@4&hsr>YU|S?(Qho zd}40R0^)@3EOG8GUzQ!<8jT%5CwdGky@MApLn;01QL9{?^RiEA?!5tm^KkOD1+SV| zjD7R2OG%v5NAeGoQtJJ#%C9mir*p3@FhBi5Fm(P;BQHpOe}juUN1hQ)K$J@eNEut1 z#Xrc%hvgae$RXp$aOS2-(ip$Wv0D!0_=teS-~o^PKMDJH8#0$zu3{|M^LM0x5;M_ zUOorN6!!`%uJn&@rMAzQjCmh;#_1~?#J5%7l@Q;~4m$aHKM|}d9%95YF`d0BcHER7 zREe}?=^y_8C^e*4EUOXuT2cl%XJuNJH-m%GlV#$&!I_yX{$m<^kqx@wkcDiu##DNc zV6b@gt@0$+>BLvxpP{mofwrrTs`qazNggoNtaL@wB#ZrMYy`ozIEfH8)f8ddfT>b5ZTDOAW~%%Wfge8HwkM@y;{HKUGqx`` z%X}>OoW?%$07wn~(Dvc{|I+s9CoKHG*Y+KPsUdgK&oixNr8fDYeGtls5Fz;h5faV8 z8$zDIC}-k}>zinF*X zi}$Zjzh8O+x9?@rnxf~^i@r#;Kf$aNB4ziXD>$uq-eHXdpEPfRe_tyUjq_mcw54@M znMS+L`RMA4QmqMo);iRAKu;D;&lwKPDB1gjbm4mE*mOUMW5&?Lze&$BPDZa{NBeo$ z4>B*p<1J>CBQ~inXD!0#huHpa`i%RJ!0#anGDFy}8Z*8C&s)|o^2l06Sg9$7Dbh<+ zmf7wq`C3KGm{+<&{JyOZxx_JJ&ji=(qijVj+ZH^?9*DB`0_S#aqN=aW{YEd%$5gJi z)R>U|sbaI0jDN+l`2KQQ#}n;L~m&-w&x1DUDxL<5m=J)2siU)TnqrK>!y zOoj;GeB^~O|4sq}Yn|Pv$pMFk$n~|}`mC+hew(D>3e&mGl97|p zTBG@vExFj6%lX?v_c$(_OwuU<-AYPB`O>D?ai$@rrYmdpKG@F8BqCg5@Ceu{;02Di zf7QBZ1kr+xCfBbQ5h&tdFMxSONm~&KA0KMIvDChsyE!_s=JrH1e68mu+Q<+bDNB880?RbX&Y@ zUcU7V;94@u!WJy&pjHc);Uank4l?ud98Puean^5@^YbYU%k z)|XIL*CaRKG__p5y>aMJxWXE`r6>YaBn7Jv9chj;xutY<7wM*k-+0E71BnH%6ntls^BruTogVN0Fiw!u{ z6b_UpR=C^>Q|Ee+BVzB4Zh0HFo_LO0Z;RCI_)MM9j~|Ejyd1wuAFqICR`kBB1=Tpi zh90f<14>l0!GUOmD|woxT$FIdw>LR60!<1_Yb$z1W!9SGgJSPF<+k8z6=FrK(~Nz^ zL*uHHP0F8>G(HasWA=b6>H7A^kyC_LgQKBy>2BLAL4)sZcLxX15K`ChYo*EPX0z^$ za_GPs=%!~u7wGDDq8ek7At)>R{A`$(Y!+WC+^(~)h@WylL5_Y^QnEdOJj4dd9>9z zCR6h5&(aJaSG~1^W`si7~5%}89gX{RcN+4NH 
zG)r1T7BA*iSU~N2+EskEjM@Mqj(%vU@|(}XpULDqoQsHdoKkqNNFpRF;~F(N3|N1*D3$9s+-T{G25 zv*{;#UB}PH&8YHw51$@&j1;JIoB}e+8UBE+MR`yy>Z2rvSY;yNrfiUyCwh(g9T5p| ze3_#D@{&&d`aph7ijen6wbCp`AZZbf@cWFG_yggx!u2^u*F=Qkm-b1 zg~*uKe43&~uKyC)|8IieCV{g}@gFXp57C2S$FzqzKF#sQJ{28GJS`3?Up~IBpSGfT z^}5$0WkE6%HV)f?bx#T)TS^j;&Lr&>V~UkVl!W392RfKeCRhT zJsNSsP4`h?X-v#1p>0&Lw_Y4x^u-k;)kWS-yf9UOVDL7gpi}rwllfHnev{8RVB47a zr|L$R$g)ZpOAAE(J~g;lro7&FyKbU&mim3Gw@8}(jwe(Pv9k#I>~K1De-K-J*yNJy zw+2ortxDq=68K;p@7<1GT*tFuA)?SKIrX#6JT@E~;wk&!YL_Wl*Ni_RBTS=DE=C$s z=BGF8zR_N|`0!6QCH*YF?$0B>jBjV%hH0*^S#UXL-@AJsL-5W&K0J#OOzTZfmrrk^ zc&v@3ZW*c-z1bEsM3gu2Q@Zs^U)5C)Z#6#S^Ztc&!4$26SrbY@)@scd!{;Zuvl-Qp zOaCn#h#}<+_4PwO?VmDOQC_I{BCh8}mCaGN+*;SL!iF!UF1jl4yVn_Ieub-BpLfpH zoB%h`#T#pLsJA~U6G8Ve8s56x1mUVM5l079DY-qj+NY%#rB(j4SGw;eEpa3>%u!~E zjHe|wQ?);9dlLbLu@{Ge&> zMeSc3Oen`ug|sLhPm=0T*vnEK5oVWGH|8;D5y#CQ`OVAnr}mMtb7P~gw$FzvO(dEK z5?G-@R4%e{c=74 z#A{T?mN=$0q&sz3xNCTgd**?RnhN!7toq#NrHKbW9#tl+3qPr!KFkVfDu^2VefuHY zdTx26hl8olq3Qxh!AMxOnd(EVL70P&SkORxsAHxJJky+M`#SN6dsy3YGo#)Q$-cm; z0pkkLyn{TfBly_r`-kU5a|N!nI@fUQ3dL*swCGZdAci_U4ouQo_?zwRt+-)7r_)m=j0YX=6D2anZWv zOi)H8TuSL$tgy^lW89?dCoBzCDiL}u>6BntrMd;rZ0O&rr5gzqwzthn4s${Zt=pZe1R2x6Jm|dEw zvGJc}8+6RjNVQUv~8(FT46JuoFOYt|UcPSwT(oovvgap!4y+5tksH%)P9BMJ~w& z#xGAAREC+_M;^@9M#yv6X>P>!-H>eoKEl9z!vv?iSc!0C$(*iZ36TaBnoJ1!mr99g z5+!EyFPjv`MH-B{mtq~GUa_@m;l>kQZ-*RBOkJ*53s|1e)OjJf9HH_e`>fES+?Ue{ z@p!hC5fqQR6;48dqbwSFh+oz4i^jZ(A4N)5*`m!^JKO)2=CmVorglCer0R8U5&F19 zU_v)@+Q=b>Qd~RhOk?!E+(6yVOJwHk&%1N859^IJe!#omN9N(!%-NW$ldeTA*XKDY zU(z`nI_7WhP)S(j_pL#>c2C7Mp&4_cZHfNY8ql^%+$t@(0g8G3Ro>k;tt8ksA#{e| zm*NH-5~Wx&E)jdu4bwyfYw<7pAT<}D>Kls74%(R)-f=RN9y%>&Ia4~gg-4v7Q+6Fv z_SN*vW%6r}uh#aL2G>lii)%r}`kgfEItNN=*3r%Kr`z>3hx4<{IJ$)j@8pQzzoOvc zzvdSwg;f>4(6}zqXuPFM=en^rPkSQKO_hC{w1IxVK%&KZR6FqaH}t(>mRzN#OO)c$ zh2*A2FHDm(k;ynXc>8}(L=3@XXQT+f;jW5~Kz>1*ad%eIyY~-Jc|oSmj;O%Jx7)nB z=X{PU+BJTWOL8}1B&VqrR|O*<0@2nTjeFNpeoiV^aiOrw%NEPMj!i*RMVdSXzNov?l&f?3TGl|f@|zO%35XZLN^M&HReAjNz=Ch86bI~&XU 
zcy-d%ICO1}k6wf*xUEUZF^`G<^WX+A(+Xt-<~EcY7#5KxbjMu|`N31--(xEJXjz5V zkXzjEt4(+TxOfYWG-e7puWN0$HyEoIkT^_6Aa-E*AljEZO#n{YN+xkDq>ZT(m&=nv zyWKfi%h1wbMVm!GKSD{?z)T6>;Y((t$UgFIN6d$vADK!%`rz{P2Kohz1(OKq$wty4|GS+ zxr}>M`pmoDHq>q8=9z`+-gQrT>+>-`#X{xb6M_)tqA9BC`Mm9b+ckTx-BE`*qDxsD zO^y>*4S}HGP5K+&1+3j>jr)jSyC$*jJ@S|G@D5T2DjP!sipih%+>3_qezx6K}_zW#*M&_!r%t)bNEhP|6#u@`M>I%6 zxC7!Q!%Dzg?*>u@Iez&!zv;zK%lnUHOgPv4_=Nvl8SX4V9yAG+B1QS1V)=i|=70H2 zLBx%5j&>PXO8@oD`EEB-^-?jHu zkBGYSFc7?|bgXJX56}ps`Y;0+6Ei@UB{wRoPK0QKEdY3I)f35(S3vd5ykWK$W&^6b zO!Zh64 zzxz2x4(BpI5Fx6!B7U1+y^graBPIHxRp5`faOpYbD*@Wcnm+dF{Or76fTL~=EHHw{ zW(~qV%mC@brdp-hIzokFbOToIOtTm2ENF!B`%p#U2VqIoN(E}?&`6}Ze4{#P{_|G8 z>!V@zJSm?zi`tw4`jLFqG7*wwHsc|REFF51H3VfnQz}6%Ylki9Kfb$y`WQx21W!Hm z5$_De&y=?h?SK~SasX%!pNNw@9`QM}91^2GkGk%5Lr`tFLm|>-$3Mz=MTHo|?sM*n zmHTPifu87?u1KS1h~^PxVLs^5EkNDFw2^j4iw*@og&52NJySGheERAJW4`%xWubHu zzm#ULA-;Lunj%5G!E}Yi?9}o0^tixqX_bDh?fs7fiQ9nU%%gn{*zhtufiiUec}*)) z2|mc6mGO2aiE*1x!=qN$tx4wXzQAhx#7AAOZ2TI|wC3$!yc@ z^`|65v1vfxsw&j1vz4064)2o5a5~+Qidlq2VA><*yZv!jxPt68GD>D&7oriT0o-m}~k=3{e0KRmHq@^GkCL*5NgkL+mgnS<(n2kcK9R%rhbp?SC>newD{4}%K_ z=f>DaV5bHHA@>ynLd+I3oE%`*9?$EmHSCFWh!Wb4cihM+bU6Ig=#&EdpK7HN zxNCsJU+;Jdqal#1kl2B9cYJ~p#%hBgyd(POf|B-U79bPdqC1?Xgz)vn@#fmfKEnJJ zq$|#t14lb4)i)2&pKpOM6Wb}{LU2z?slEd|g3y4q<-SED14K0 z`31IQdQE@xMoqA|YhuK5{pR_2g{gAV_8NE*pAliDuI%1xxs3r;CZpZEek`~Vaz_p6 zr3PK00Vd(h>+4xb{n7YR^#HMu8n}z=LXCInk`Ao`?>Wwd0?}P@&Dzga-LEp%;u90N z+4(w9wwJ+JX1PgXgfGE7ZSzH8=Pe>H;MQWuE_&)5csb4jMb^?BSlZPeQXNnj<#nQv zzK3@i0RhM!Y<=ziwPdONzTT}1zWJUjdADswl6twTiTwO9uqz6@ofMw0@bU{1qq3Sk zz@3U*{pUsTHwRA$&CeAd?X9BTc*U5UMwd%&P+c!uxxUtlX-LMskI_l}3e+k|+i#d4 z$9jXcF(u|og&OOH^#|P(Q69kCuBi^B$eWm@nIFqp0|<+e(X*`$UwaED{#%}8BJZRo zh$$Rgx&_i}7;xZJWlQ+90!|;x0p3H&MnylhO#Gi4zeMi}5m{902I`OOUn2rW&qEQy z2mY^y;Gn+|J_O;hGHIGIn}o)~?o516C*@rMDYf^w5uVRl6w@_5D*HaZ84^(!7T&S1 zP4O7yiYFdZ$76iR;%bx|csq3XeA{fS6GWk|7gjH!2q=$_14J#MqTQ7$tP+5hGFkYG zv>P933L!Sbz*1!mzv~u&;6X|fQ(8HRV!KUpg;Bp>UXm&i@D?-f=7JY&Q9u@_=1Uj+`ox`N87?xLc# 
zTabXPI#TIa7*key0THa$5O$YN;IZphP4KYP22ab#0aPHG!-Jk=)oLLjmx}Bj?u@l~mgv?vW zx_u4hi9c~H*$#UerUA8r*^sv7P$Wei)oTN(xa`k>L?%Zn)rA0LYRY;4scAi6WfVkv z3jh+_Kx&fppDCX6@8)I2Ji^Mq)k4cs=>^+xkl(6WhK_M8>s2z8_cPXQg<{52;2Vz0 zdsAKZe?=9PYq*#jWHBU{D=uto$%mxr*s7XWj9$?j@(i8EPYsF^VkzQmr7vU&32%c( z!J@mqhIPq^>dHR>ijh=7meCyG`cEi$X=3iZxn`n#tS*~?^IvMY_e{km7au0E?%vN8 z6J*jG;NHyt9pVZUxiw%?-YiX57i?HC_@sv{Z6A@S4vqO)-3YP}z8@0nz>O#eT z4B64rAyA7Cbf#h#9J!KjP)TXl()0`fPX}U#F;rvZ3YEuYL^O1x>0)7K-v%U83#Ueq z16k+|p~!=%IBVB!#=a&)@)v)MoU37R&EQ1r_6CG8!uk! zV4R4(fh8&YOe5* zh1(1DY}h;)Ku;tW0_o^bqKrsEJ0OF=%?j!78lk%yW zo)|r8b%QF%<8RLCf3_X#$TxARq! z;?Q|7+S_?`ExU0nw_-7-QZTX^aGCwUIF(Rr;1ck5Fh_5JuI1hGGg*v2P<(|k z7x7DeiE+bY(uy~UYVMV>8pw5y*a1V?!KLb*59YelmFCxnoILg-v;M27GbP*pBpY>Y zen=rLI;3$$yYXAQ)aseK1o7$XuU?IKl(1SRTG?T{BoDk zTUWDEb(|rV)HV&lQsP41ngMQtH)+Y3F3rc_bvxxTZg5MnmIuO4@d{)HL`IG!H1M8) z#7^gz&#N}Od07Bgx>|U!{uOE~pw!~_Rbe*pUW3WOm4QacH<)1>Zg|C8%J?iv99o15 zn$-I?+|gNA^SnJmwM86YpiviNXrWpnYdsdBl?_cd$3>NKh1yNba>fj!|V!?Jcg5U-^UQivj%Cw5r~PGh`x^_BaKmojBJ#J zQV>Ys&Q|A)i~2}l+YtNFsq)SLqA&mFv*an{-%ABkMR$To-O29bB;zPISZo=ENdoyifBNG*!I%Lp)dnQ%QyRU)Le0J=<#(UGnzjoJIFu}KYK zD7_E`A!+bDL4m;y2jiZvGLQS5eKM&B!=Mo}^Or|waO*_E z3{%3^9O%3El=H=jQ}@`Q%xu~(I5Z5F?c zj$8Z63>SZHpsgA6^6s7oF*(&~46kzT;=M|bI<2Y2EIpAh-UY4l(dh40r-4MiYy0^| zmjV;~kE*3HV3-QS;h|r>ktebm7n!NoJI#td7H{aU4)&AnZ<3nf3W|ePs0iv8!mRo7 z%r0vQ)H1m$3Up%9g?}(A`So$8CsUz5Q_$%Hv0N64ID85ZDgmcWc++lui1HbeVJp(t z{J4R1nCeM`Ue)GcnkY_u`fdm^D&EVJUyPwZZ_VhI~ttr|Dj|Tj6e-}lhe&~=P9YX-u+mPPtWUO z4Fd;-dOXq@wbZICw#SuEEr74OTB21uA)FO<&jz>vobBiwu_(qwl+*V~%%xw-_FgRR z`)vAXZFn#!)BI|)oTx~wZGT+%v&6BWp*-eQe&UvHPDhlccH=sjG6MT5JVieHy#*uQ z`S(*NJ9WGKc3^mPnNN%mH6u(_%G}(A{qfG}^{lqRw4|`m`B1#!_Fz;~z2}byY_i$A zQmFjK4up%#_M35ld0bb66$!)Rq>P%v8jZTl<&r`%|E`G)N6^_(SAiz~oOV=k8uVQ8 zc^4jsEn5yJ^-R^W?zlJv+4SPJ8puja#wE4Z6>bp2BtUVEG8w4?5p-kG_#EWQG8!|r zv02b^zeWMvko1&NSsDxzfQK*;{CsBXr5x0&2#20M{Fk!#FUR*CcUs&Ro^lb1_D_l2 zDUcRXYc1=lqke91w{-(i#mBT9Z)_?_OMp4-&VsWF%kYg%NN8=1FP*N4k9hs(z6h@t 
zq;kK<@n3m@EVa(`91)2F_`=I4rJ9-z5oPIof3_oq7ETJ`Vu%1Q-^Zbq91aHt^WL>u z+`!W)4SzRMGw0`13Uel30PHsjb;uU38W)vOY%34Ddzq^u!@8>z1^Q# zxl`gr%V#HwGb-OnL9U`eJCH5@76ERRX%U;J$N6)*J|tJx$U*!-Bg8dckcWKYlA@C^ z#fb8;3=2QLyo)MsVo8mN_WcnsyRbuWsYW6)Ji<|6$^;*o6*!XqdvPgq`H6C9mq;C8 z(0rIGz>-C?%3{`J>jUBjt}ph_0A0VxW&)tr4ch0BmwoJ`RcWhtI824*t0x$VFa%`B zksCUoNaQIMq~a%bv&fmma9Zgpc-7YQFmAASu*hfAC0j?9cPRHC%KWYe_C}Mkcv00m zqoIyCCep`%bm8isi$L41tC2{b131VCUP9hh9@Sb_xznT*cx>j1!!F^o!b&Hf zv8psSfqQw}VCr0hljt)0P=k{lQGxQ?O-Bn*K1r5qI~L|aRElLnxL8Z@>7 zvg%fJK0g}UK3e+aQOD!+D+Bba*iv?~qs8VpRP zU;p#Ql>)_q^-wXmv4q^`fz_21lU4yQ0GZe~Mz$j&0ClE%eK_k<%rF`OUoiN21HP;W z!q|m~#o#@gG&zD^mEa)>u$k`x=iMRLjo;(BVJ7Vdmd&+Xn^P5eJm5#Q2v@MoHMzQu z0{}nUg2eMr+({mNR5;$yb8xfrZX79H1xr_X?W2wZO z>sn=Qfq-Z?d<{$<0&^Rd{OC0N@fcFqFW9g?3QSpbkoZqkfUzN$T9FJ!Hc&PFs7GK? zvwk_IC0o)2**2ht=R$Ve26%rzlIv)8DgF+i*umAOef2z^na?`ERh{56n=H+cFE&*P zpOlJLdL@CPtL+=g=s;7LzO$0 zDc{+qy&-_#@pc zX}yq*4bfRNn(soYI3vPw5dcyh6SR74MaH zN}!yqYKN0jmOwgc9d`gIui8k9_J?;O?|jdoNX@2<QYy_PRX`pvtKU=^7QG0u~>BOfN?xok@F#QFFDRG4L^Y@)82 zfB=f2z9@ArJvWI`!_h3!Inz8Yw?zPUemy0dnN!@yW^V8)ZC4cLTKfl;r&a^a_2%b= zmq$bMi^^A~Dz@s9e>c1N$H+s_3dul*w%@k`BirOIxEz`Fy*VCv>V?a`crx^g0+uvX z29@T3U)jg4-VE*`w|_vS={qLomZ%h604Oj95imE;+AjwZP8&$ifY>XkAG8ut zjIa&cJa+%mQ56}P-v{7X)B?P=L}%EMM*aU`?=J(YO1uAYSWpa9L=?)c? zZje^-(4fQtX;DF?R6<0$JES=@NFC{}gD4Zhg)l55=qjno#qz_g`{@IH3FLZ~;}Hy;0@! 
z+1GPoZM;@XwYT0ZWlgSut(;V!d(~A-|DzqIf9d4Bl2w4NvMlVJ!^jMV9A=`n)W;tb zO8BsaZ3R)EpUkvIi;Lgg=IZAD41pIdTBuJb8jr2FYWGn63KplC5~^-5VY5VrcV|&= zS`Bs}RmJ3Nd_nwV(Q0*X(Jj5mH(IkR`S(87FfBzC=FO;$nO_;mgeV$vh5A#dZ1a(_ z5<^WUJ55WX=BxiGu>bO<=NUo;C^+;>?|!l1S&ydNV6}4VCQ=o|NMbvVKDHvSN_)eN zlbBZHMe;+jJnlWcD-|;?N}iXnWrcw0<{Hbh*jr-b5}rTw?q(0?qJLFtnNz)Ag{tIX zG!|zJ4YeD;?fX#9wT9qqU49E)-2CHg(8#8d-=(tT5rT-aw1;RNYx_^qWd&_h*& z*JahR5{EE-Ea@4a0Ot-%i`uAAF%qv(-L3=z?1TOgxt3nXDCRudO23uMJY7(u=(rfx zs`kWBfH)wN@F(FaHd+O*`APH~_M;Xqq<;k3h2VTQw-3c{X$*CC=W(UUxO^4s2Djkd zd|R4{wOh?pOG`_%c;#^#<|p1iEQ8=1 z-C27GWD92G<__|hdVbjY%%26_gEdsd0ij=xZm8!jp-7R@>BC6azC~xbifqYpr<^{* zIDh$AiyLnWGD=r<+1^-5e~YATIsnIK=)cZKH|J33=0<4%XX=P4jD$y+sTEYwmOB{S z|4U74ZE7?{PWNVp>%t%m9LU3l^UbV|+wN`wr zP(%mZ<3{Jt&0%o#ry4Z`SOasnBD9|6iDVdMX{2lJoKe5{CWM?-Qy_8;M$Ziz*O43( z94GC|Mae~%H)wuatK6LfaoadTz=dm=)PilRdqEdYm^0`Hfo0@=iDQU&e7HMRZKgTG z`zJ8h9#3yJSV~vKTxNv{58gQl((h_yGjJaPoO8ze2xZ{gh-cV?f0EUsq|{#_Rg=|j z6lEDX{ zrh;H)L?lvgju*=^X^WzSd0&!TP$`yL3>9X&?5s~(@zv^(y%u#TLF7-*YYxro&HjJ%@p2ng^D-r3Cj%*Lr$m zd3vl=QU_jc9MYlm(ePVG0Q7Lb-T}*(`**53vvOynBVQW&LdobBdLv_>XMZKDtw+k7 z<{`-07A7*p>w-$#k-57S5{>WxrOcN;IV0?lF$4M%X!UAcsR6!^bb&iJ3>R!@)Ye_z zj)0VWA6)H<>d#mdBlwQI$N=Sy{9Bdo+YurVF36DFEUat!)+-$3t5hQQbB>J&LM<70 zgbk@7rFG2$vd>Qb*5uUI|1skbbt!i3gU7=Dqi4n&#&(#sTr`Wa_M-15FM`K2$!I9>~TDa(wZ zAfJkh^OHRlbB8jJXBV2jZrXVHa6WcoWgWmixj>>E2Q{!_SX_5`%zP|^+$({Cba~}< zEW$@eGZUBPTG@+@`J6>M&P|ixD3tlNhVN1gr!rw@ zy34Z*M~ee6(tb(2aaH??PA=nn-bdQC3Q{&CakO?*za)&#U{dpacwQ4`bM<9tbBpV4 zWBVx{HQ{jHI9l4nw95B1vsANIGj?(+Kx|Nh1C2&u<1zHP z+-wWo*xwE?h1f<;e2u2 zEp(T9P$1$ck`&iG?#mMss#dkbzKlbXmES5Lys{m) z(dT0)%^RZJ5a-|OCsUm^Y0+r12dP=}xzW36*V+oM(YWypN)x0L&mQqz)si$AR3bVB z0-dgppU#Y(ib)`X@2UO_ID*~w6VX9smJuJ`<0(1>L^UZ)UoM9QEuYZl^j--p_SoH- zhuzjL+GEFQJP)c2%th8VNY0-$v}P54`tUeeMEss8V5Ukh)_NP9*ClNL^X_1 zRwuTKhKMD#C5P!?-9w__o<;*rR$aAkxrRfg;#h6Ap=X1t`$_CNz9KX4#U8L4x(sT) z-^sV1R*hzmJP@dVz81!ddJM&ROy= zF)JFusaLI-gP!UY=q=U!hjlKF{`d`8XJ6Ar;AaBrYhT&|U;=JPuJ0C>3y73SQRM5p 
z96|9cT7L-U*svjbGOc4~$uNy}fTLWhS_W%OM~AD0$5EhGn6sgoZ>R~3jmLoLr&vN% zO$?i;T5a3|27Nl>7?rA5SNOtQpY6hks(J3tM<|p?W8cB3~a58k|}ptbAYZqJ34xXrXYRtl}~eb zMPU!)R#bcWsr`(p3Zi*vOJFO*KW>x^20alysliAfTaxhl`Om7mcd*@)eOH7wo={e9 zhT*m3E!#cW?i0ua0rAXPF)ckCsr=cprB>g1&hll|HK@6yx?uL(lQv8OC9$=tmXT&K zx(N$V2o8KwR-MOs7+&P=@j^|t$12j0qpFpY7|E@KfyWm&2;WT+jzhR-^H0D%-@}r| zQ6Rdpic$dk;!y*1&X=+{aAen)KuvSacZEw|l1H;oKjI5IHco24f4Cspsn zS&lDa-$wT)TdxLxt&Ql`2tt{tYTaS^&Oz57|2qdwaOxB^x9ladC4*|)#8-!4Af=lc z@l#Sb4+j^`_4j-!vR6l4)9twaXulT9VB6huCJoBOD!W@RFkmJ#*o z@S2u>i5J0xv^Fdf!#h-Hc9cAZGAi_*w!k?DxwIoH|KpXtL%6HZ@zPSbR~jJllf}?0 zaRuCOkC7BBmga_Idd+m;<+XFu+^VMD4^cIv^etV7Z@)iyn-T>C>%irV$C3hkN|aY% z(x6m$2E`S$4aF-^NH)3cZm+q&;*uf%gK>5vqCv_McQqmsX1b?7m~yPcp}c+6pk;9{ z|3Y*4$iY0fGAKnlSPHXmeYukV<;hFc=bRa=efMDw;c1CbT%At8L&;!QN#Q6xk?%OM zBJq%Q#+XcY9(St%UEKZgm)GKK{FzLzQ8wXq4{;x4Hz+Hf!l|v9Vk&bV-Rw@hp|>Z1 zD=c+LYGy+Lc1MrzD*Y%*sRj9FALsfTD(%XrXJ1g768)J4V6?O{3r4yDuA{Z1^a@W1 z9%5C=POG00~o(9L#@82%77Hm$=sJ*N)(28v&>vB z3~~M4PWOLaD=(tP81??Ayre141n9Ms`-@LEf~r&x`U_WM(r^j~$%Og$Wbba_Ci~kl z-#Yjpbq5fb7hD$ot5}f)%j<1on;9DV3*Pv}Yn-)tz7I8mFEJc3Z8W&nh4uRnv%J~x zC$r3><#-o6b$tJ&e%qNH#V%}bKIp-v^*@0w;6;j@5c++vzN~1)?1rrZcoA3+zQeSr zl0MA6I>9nwx&Uw3T9{V1_q_kblQ*&)sFuJpslB;6M;A~mZvtgZ#6rzQoz!!FQ(|v( zUDs`?*af-^QXZVbxttkXQquND7|Ri0#{GouoZQckuh1~28&Z_C!JguGUY_y;zv$z( zqeS8i?AZBW{hImnA>&#E(!X~~FO2-(nz;Y|zcQ#j^bUgbNm+tG)`p?^d|W6V|C#xS$iYVzp9=d# zGIv3NwO&VjUA0e|c%O2SFeoVA?y>>R?Kox!>Y&eQNs6B*;V1gGW+T=2D94^~#`0&e z>0`PNFx3%VWj=`Yd3aX6#t9pfQ1ztPkG3R$uFutiaXRPhsNfg9KU% zA{zh&KOmaYLIFAfWY{33e#r{dBnM7nmBMB~$W3Dv>Ht(F*_WWC$CF~#E=0y&Qu;a6 z_o@hp_6p?feN`jGW00b+YZw-dlu9&v5=|1gq^IMLXka6j+dOuQwgBCI!QpXd%2Gh@pBe+ZzBGm%H%wyvN;n^hq_AY)a~1t>dzb}Cd=D?>Jm#Mq$6sxWT`EX?_BG2 z=2)9c#n}51jy`+AR+q`vNVgT@)y5vT-~z*dRrOHRGnGB{aExSbq@AtTSF>3<(m?ph zHZy2Or)X~Gc8WYolk84VA=9H!!si+G)17tFI5fI8x9oN)H- zp7S&Fm&|yvHLnstnMg;h*qL_+#Hu@C+07=mC@gK-@^T_C-eilQ;THq+FLb-isy8ES z^XqeuS)^~_CaAZfXw74je-oN8mqSwjvpeT$eyVXRZ`J(OX#a5u+GGiEKNy92;T~$A 
zq9?M-2Vr)0_B<}|RCxxHD6?hktnGo&yP>Pj&$60jbxu_zDCu|z_8ogJp25GYN?}DBsK|e#Q9oh;PZf|7}OUW^d!Q z?82jaqiscnyCb7=&w1&$k`GxAI}=}3ckA`i8y1>)lguH?PAiMcCL4)g0VJ=U30xl+$^F^;c zgqH|E`VQQ)(IfT5G{4P-N`1gI?IV~Jh8H|`T^vDee06|xS=9h{zAofq&@~1KX{p}# zap%SZ%}WH=KRVcq-J-39rGK)>XJJw1aCeWcpxodX>xu}y=mI`VMv79k)7~j0U$lVt z_Pc{ehvAj5Gx5BIQLH^8;O@n>W0ZC4Lg?)@B@Aw^=)-&zu%A;#p44^u8nZ&T9$eFt zz(OOSVW~l`tE7+7R+c}luHH(0@KHd`*Bt-LVO}+8U&^9M?}upfQYNMrS-pG6^yS;W z7FH>v@1Up|!9gLc!f(UH-aU8ob&K`t>_9=Ghq~YYIW+##g7av44th}a+?WvH*q3(w zCinmxaTs*E%{!3&uPoJM+0_S7Q)N0r@`?0VVnFj7-t?Q~u;P4s%|JN8W1vfiPht_K zO$2bM!`B-1jJZHQLf}%`W!x}|0v&pXQl=ZN{{03`Ix*Rgz-cqfbhr&kI1fBlsqA8N zR1f6*9kWQx@)^#x#y|!+*APs7U~s|^>Oj>UB}OyhKFr;Ub>sXLFx_+%`y;+L>6xB( zOcVmn$qi;-yuSoJ-BUbqG-V0qUS1*l7q&rxiRd^>;2pIzeFJkIBWk<`zY4DA(gx*Q?7Kq^5ST6NcBQ@02z#%sPbFy+Z~3%m}# z^~v&KpAd8+d8oi*5?XXJ_Pm!!=Z2{3k5Y2IGhY9Q7y*)LgZ3U1xoA|uHU5}03l8Qt zZ$%<X_M75kF;2N`vd*y{Fp%gP`XBiW zp%(PtNx#GhM-;oNN|9E)v_A_zN6w5hi-2gN9D*C%UvfDj6CiEULqzU^m|cY$<7jWl zM4GL9(O_^cuHO9C)k!R&=<2YY(pp#a4v+Z+Ol7%;i20Ldx*DNOpq^`mk@u(lO`anI zj}$$%5O+qNgkomAr9C3*Ubx*{c<9lm8fzSK5F~SS)F5Z7bkMDfYCPO^PXn4-rNHPD z`S4tAoV+=b{ihw3>`TEJkk;#T>^&lbWIMDY@ zX&a$rQ`8-npB^EecL>uj=tdR*kEBBsEG6SfK`io?q9C_jZV}kn^N*wm_9MAR5nNwV z(l4MX66b1GS3;M!XH6~iQdLdE`;`8O(d^>0+{fyBR2vCKgs->KMOuj-={rdnuF zanMixHs`d^m8@jX?oD9l6(DNmvW_pG&3!)%hzNc!8(DLEwxPJkG1G8FOh0F#r&1+z z+CFad2~vn!Yb4Lg6VG-yfvDL(@?k@U=@!J=ylwKkH8{t`lInE`#}8VlSJ~RNhR~f* zwms1?%^hbUm(Pdiv$;JBz*M-1lWnU|4@xI?4;PxB&!jaXg@mXnC0zcnvpQ*<81@yJ zLv_+LxK^ZxLWH;wo;E|n{EB|$^qUrI?oFr2Bn8xCHLZ}1b2k7AGS3gf8NcDnE$Z0L z1w#}D(W!cLuNZ$5O<1X=eLFkHOT9ndYoPK|FY4S+;8Zx}%fOQMN&mI)9zuXcEx}t+ z+ji*DZ8CWqs74N0?3k$IJ}=J>botYWi|HiI)r8jkm#WRK4IM?OT9)VkPSvt%Wi=b< z^_ro4b7NxKCmT8s#p}2u0?CmMZ#JR+jPI#k4fe;wz^rGlyy$H6ZlAN6 zt8ZbQ0JiW+{^T~`3QJFtBnn}9e;~;ly=dhUixRLtY5;*8e0b>m>H9Ef3vp_LwGLuI z>0nQ0L&1q)wFt~O)>_;}s5F3nXaId1eMf0%>)z?YyE6^?Y9B>#yZpoVuGq7EPQcV` ziVR<{%*(20f1RkEXKWp$!1oO2zrHkSBJw_0tMih=8=fXn2vr;K9TwmDdJGZsukZOy z_w;^7W~Sv4mxTf;6s0@dtBTf~($*XWqSu2qCj6$8!Y&mGCY&A{)5+YH8PvE2Ar9RS 
z-kM6QIsG%YKh}7OJ?Jhhnvw4^Hv(Qj@X=;|K&}Dr-4VU`tYMt#hV1%}rA`3DrJV4* zAYLYsiR1q!NW0RvzL9b`!Pfp!a6F|k-2;g*qTtBuoasS#4h+J`n_+~!-5dP5HFCxP zHuW->9`MvfJIQF@?p}F8=)Q=ac~z9Fb0dXf7Tcb%XYXNI6$%aFtj^p)4m_cf{v~D? zU!s7JycUcQygvFPuiPe)KzJnn>-IzXIfkr=qbCkof*56oGS)(9HjqPwfm~0U;3@G5 z_4b<%`%7kV?RN469N6Ik#B1XxGcaSU#3~@IFU)d7t@;YFYuZ(nQw(loJPFnV*q%te z;srn@CA+1wMw9Pt+xj`q_IH>F>Q=axh$p$?FxoEoETw91V&gMH_9-pKKNjpDc)EZm zCZH?Uw`jj(Q%>!{?#u_dry1Ct*CWP>^0^GAx?adS@TCk6uN7yo77Rq;v)+MLV*x;w z)h8L2!e-zNunAmsHQG|MAjEbiDP3X4f3^T$)e(R)B74I3=FEY`xr-&Lau)jKF1?8g*Kg#WAN7)`v2{_C4JW|<5YPv-jDTKB~bj4|V z-5@Ya4=fvMR_~=J$WMcmUl)c#MqrZSXqC+fP#o$0v#1a;l0jBBiC1kk(ixsVDzMvhaQwrm`kyb4BIm8QO;o=e7=?qXfO8v$ zc&4DVLICqN=MW|UT$bcTwi8(OmF_kFGn-#~96eT#{S+)l`VCV{!hKAXLY-nU$|+%{}XkCn>|>58DIEHV*5KWst06)`D`=b`m-QisW`JV z)8z*gTI)h93}c!CicdY-!QTy`dC!zTh^mII$EMc%s2zwGO_v+q6MUvHZ4_4;7w3BQ zW7bAl3_d5;L`LBtZF|)|AvAs*=7GLHW0O85PVJj-7L)=#~&u^RkI-~4 zPQ3SRo$vcbhF?3*+a1k_FJf_d^5?Sq&kOR$a%k`0hu;nVR)3)t7W5uM6e*I%q2;ZU zLWr!q7_q}=p+ZGOi_4_~w&RT(W$Ck2KmP2HSD*!-zD$G-WITud^kiChCdgRN_P@7KTCdHkJkK*^La4sZUx1rr`jw%x~e?Y8!42<96Hs6iRF!2e%ej z3+(zJNYk6?&BS45vSt7-vq)ZP@TZ!Dd8-CSrHK6%<^+n{0%=tEp>mcSfN{fb&DXQb z7JXO?Tq2-mZU>6rq2d|f;#UHLPD;**>~1a1L-;%ze7DI%2(byi1RM;7 zI)QS>_UCvi(67kbxvkFhwM}mz65?at9Jt7vJ08bp=H=hn36Qquonk5VLWg zNhX@pY@V(_3vV}#u$9nHtn^hsp>dx=a8!N>p#7qJyCF!7@iy!Ft)zLgkcGoBl!J$eX~GAeI56=MWuuGP%kd%3SFsATm81VPqi6KCCxQ zgLU`3A?0s~(s2hskgGF8jff}IaDFJ?o48qp3gmS`L%@I!LrUOjj@4>;{2K*NW#Skt z$Dju}kzVkbn1q;c@Z__V9x6htR7%AXlGGE#%0fU6xxP@oQCZJE#R`>C^4IP{;v>@o zdC3}MPe4{~gIH|Ak!=g!P(^Vk@bB@OJW!K1TCqRx zpZxHBpg}agQ)Y;XwB3iWybun~qCxZY5|C-E4}d2%_SoZ~a@oPT{JG4c5j;!@rSdl> zBbox+S6@t)uU?;h;N&~#7H9U7_9$oVVGw1UJ!1)Pmkfcd*}@RGqgjOxIPd7+t0to4 z!z6(+z*YQiQy_!&XqBhCh?NUv2>**`^piLe<`#%saZ6a0b>d*ARj&5bTs}K9T#|RJ zJKxEsmZ$Amv9tY$X%{)EY9IetaOmJam1zwMwrt0^`IU;TP?v-7rUb^2*3ozq4#5t* zOqnS9qk_Rxk5Ivm{U)OGdchu3b@s5L4#qy!1)z8w;klB@a~GMXLFuU~4b59S_6xfq)LOcx zbaLg^1m{3Y_iK*Nf?wg~ml|Sz&7Q@?{WJt32>VUbZVhg(!kYiuS9wOmI|w!duK4Zr 
zEDTm@#tG#q`{rd~xgVCQ|LUbl{U_@9B?fl6&dE-ob|{D#r+2!z??EY6;iP}KL{>2R zBzDJS0$9Gn4(Us5tIo_TH-nv5NEZT4E7JfXL+MK3r+Y5M6h@O&xPmuWY6U zajXC;5!6HTfL-M&d_J4Bl5Z)Jc0)U-yC9mJC?#c+6Wx|L%V(oMZ-zt8G+LgadBhz^ zaE@4~%RTjQvA1TY^DEm3e2oGwHnsFh>3nc?V{V8jOjxdPMl~bjoZ^!@E!PmV*#8VK z)~ou8W-K8IdJ8*}z>!^3Xh}_VYAK?96|;sGMWBay7=9dNb?zuSGIi<(o5UrZ*Y^^F zH&|$RDMoAf0f))b<$L)sAwwkPq3d2#$gx5=1S6CY0aVZ2& z26AE%xk>e z>C1<4F@V{*#YQkGsXWK&a zdu5{JUX{yP)uHO4_L#JtmlljV323W7X2BT=BgswTEmWPgi|Za!v>lPNVx3+!TX=G@ zk>52yXY=T#E3tFJSAMv6*Rw+tHYQeimYNVn4#ST*iF+w3`g`87`vHhj7ze#ye%jZ0-+S_kzC|555h^7E@~Ulo^@drm@QcJ(Y_I0MdOBCjeA`27ddHv3vBgee5 z+Adzt>UX>K!&8l5AIonqcG-;>Rv+*|m0W$$6;y#lYrM@_am|b$O#qGJ*gR?6tCCa0 zoE-6~)GsfFn76o9Xr>qm7wPLs*UEmlSw*~i{Q4NP1uHTb%#z3G8xRIVO6jHP#kVP4 zKE2`{cS2smkuPF7m`|SvxTxwgPx80*_9<%@_8NM)iOWfv0+gNP0!R&_TG(#rWhaYK zZ?>z!J40o+Jq7bf@Xo8V(+$9hTKID3vPp5z+@mimoG_!zz~2kSD|G#FZ_eGjKTq6q zi@>8~K1{Dox<@x)@-fjY=xRT-yuK9QKF`-UQ3u9KfgJkDa&Q==(#>t()Mm2Nj|M5a zQZ7{nAaFm}wIu5HX=dJ#<9WP?@TfsmZUM}OLsE;7w4dxaAeCMra<=)zd>o=E`7YK_ z-&2bSp+mpVUGEu`!(8?}NlB&AbXBztnZz47fPk2-CJts`7>L%0Hrz5^uq#Ola!$i} z-*ZDI)3-k4>UImsWkQ7_vP2inS^G;t<*b=J zK80Er77hV3;07`ZJ~Jvs&CXc&Wkm{uDKPTkaSgW8!04yzjB{?mpDX470-ov1j{e{a zR_Z~~ukTK}>#fxw)3Ng$cOyd}^uQWdTrcZXp}r`#0R|zu`)Ie<`h`%`&T0!L`?@LK zAc7(yl%~gV9zBluHs4wvO^`}zn(v=wigjL-@GTSUI=uhuKrFX#t2)^33fW` za0myI$^#J{jX%YirdH<{qWY$rwrp(^f)Hz!;*3rY)d%MPIMEAN=(7iW(;b9DlO?cd zx84g`L`13n&^s1Nt5trh<1}w(?ejx&m{l}D^o*g=>J0`#?!`AW{DE#C?kF`_#XFhJ z4?!*eTiQHqnGTQkAN9ejkb)lM93^|(Ihig42Wo#5CU{bgVSx^P*EA8hD84gCOC1-^ z`7=^Btwkt{LZ{*oOg@?Iv278UMm2{C5n1@{_UaP73(S$QE_)r#2%8j;UzXhoM2gbf z?xzPp6H_bm6Ph=LA8@0daJA|H+IE3`Nq;a+$D}D$md}fqM%=48WQ_>e8(;iJ>AM3Z zoltqOD2Ru-KeNU^#<=8hq4DZeD_0KhQ9c!q7zsOaApoYBo-!EThf`MRZfd2(B3KI7 zJGcn+++Yv+5Rqp0b@t~8UtJf<*W#=T$WqV<>*O*}VrSwAXOwgoG~&8E21n`4L?%>` z<>(Jv=5Y-VDDN>O)IwLg@M$Wz?H)}hko)2)L6hc(UpbrF5;$I=aUld?ysm{-=;ue9 z@!;sT(R}{!lCZ<3K|8jB;KzuxVq`+Ig$Rh4*+LGh-r3vug1T&1$=cQv<7=2{(;}l z_U5&4Gla@{mSlAYBm#;{<+2V~Ky|qbs 
zeKR{lttFhFvoBWp*pJQ^fU95&RUKeTJ?L4_nRC0&Cyrzhq}OOr$lAjogy+3|zI{+7 z&xn9fuEW+pr8MZGDf1D<4Hz?odwQI1*q?vo^;`c7-BtmFak;*^%8zl)sY5B3rL@PvO<>|qm7F~ z+;)iB+YAi5i1)vZ+jM|-o-fSK^jzWP)%=#{RHy7d6j}}SBCf}~S$o)L+&MB9zGnfG zB0*!xF^`K?h&?mh8>5KPj$J9 zqE({=u}6C0{9dYc_Mo*dog#)TZZy;G?T)XbJfR(1Ct+){yR6@+U6}Nz!EZe+YY@na z1vEkAXZLn@ys(bv2NqnZqa(3^hRVgaT<27Oya~LJ&~KP-ucPV%c)1(%fQxoY%~HGg zUThbmf)ng;otC{Zh0}ut7WtJBrSYuZ&Oi85st{s1*VE9N*isdts#y^?-<5*Y}V7eI#-;JwuaS92^=I<}lz`fVs#vvCU;(G_{<;Z2rKUKM|u@*|q2A z&V0Lk?|X6agR?Z{7;+*!$*cMveC)IjL9{b`%V36h{TwLQ-L24q@$-+F4)R>}q~Q|4 ztj-My6PX}(#2r!)DU6UF^Kz4pp-2387z7qYG++!pwe>x8F%lDC0;xh;aby@yhiAI3 zE^1R%WmV=U!K5oPfEsVu^Yv3eFG4Fl3Prz9*vv4zqqiryPkMUOUIfdJA5IS_HL8m@ zCF(^kkf!YSL@rL?jVYDl6gU_y?(fQY?24 z|IbuduYGtS=_ec`#F4+f`Y~$sPex0^99nDDuD3toQ2x13kevYM&D8l!2EOl>CXZP|8XfVbYSsy+gtvWnEg9%0=JA!!mQ4R-2DR*)_Qrf z(e_8r{r`MG((BAQAamVduY{UEm^Zjk8=fA#{U+{h;11FrJf8D2BiY|S$H8$jQ*f;r zu7V4FTrZiAqFehxCwL6EZ010HP~ha+jhM9l@iYJarG&;|L?XZQ4RV>cg=|vt_9Qk1 zF_uCM5r)rdP~c%Oetfu#=~sUl9fbhYkKbQOfYJ~yXJ*`VhD}1|+9p`THy?p=D{p3e z)Qs%^-k$$fQ-A$0&w7@2U=)xEjNfXJ(QT2`#*zD(kRr(qdT%736G6GNfF?ojm#qD@ zYleHnbLzXD(Hg|%*8aS`_n>TW1ocx|-W7{Z40bf(zWSLoRQ_0aO&vYglj1wh{bU>$UJAZr=&DV+cj;O4 zZ~wpfRkG-}&SD!=bZf9@6$6a-&bVhF$6z1RK*wE>ZdwD`WQ))mZ7Fw_D1XfU{^N~& zAmQ$b)a{^|?`PL5ZU-QOZWl|gr;|+>%yNI22V|GO`hdwvO?5o^Ouw_4|JXNw@MW4h zYzHi&U(fRW@*fESr+VrMZ2nIb4S#*@Btnp&=|sk;TInBrivLqy#GF8=|I1;3H~r#| zq#*giY0K~I`hWFi9aO&z*DlM{wf~ndU4sQ0^6vpgzx)cwB5}fpP=mjE9TG9J;NVK@ zJlS^ZmyajqLvZaZvGP>^^U|wedY0m7dP-~kbMfB}S)8ajw<@J6`Ht|@LmwaF^E!^Y ztOaQ19?eXP%|3I(^xJc3nZ12|nK2cGI<6r;!ahtsKDmt*?k%+9!VJ*s4F|3Z-6ACE330D1b?4n$f1^7XtRMo!z z!{N;Ck$&&>$1xuxLT<+bH|q{bQhF5%$YBOnE1*A)nF3J|MALZEVf392`*m^z;0KVNvB7-m?ZxU-JVe;$PdM0}S0H_da)bR{Ix{xw|x zKliI!5N|BoUj#SXdsCBG&AeiWsWA$q7gCMNIUP#Fi- z;U#D_450U+1AaS@UAM3i;7L`KUBy!5!gUb3GVt?jtuB1v)Ge|OK~%<_&N|8N+Z%Ct z+Zm9v-n{?}G#MFLBSeq1l#2u3E}N~=(*%A0b3&OoGL*scW zjC$z?d-t^lKTtzGHv*1Ram9nOE3Lx=;!K9{5qkeq!1Cva1mM*1B{|I{$( zy(_$;JwH-jI^~v^n=4;wZ=4D&nx=C$?buDu(+f+t-_P)=P^=a7|G2D0cDXN{FH|<9 
zP|tSFyZ_iNJn!$T_oFoOF_ZAw@86xS)uxu2kZ=|BjG`y2#syWJV+aHht)@CPd-96d z)|V?M@1M|CdY%nQ%uS7rFF+q510%WZXL{MwUVnyz_Y#Zvh#$??=z&W^Ns^bs!oof` z7+63Y@*?oOLs#-vhAJUiFT9GeO^QXJeKZ|n<*!8^{S;?}WKdkyw*YH57c9jTSI2q) zK^Pj&+W85;&#QD^F?RxnThvMN7>jvwdDQ-N4W3Q47Y-zpTvwe}@F>oO0drae*y2=* z+N5n1_~vFXyM4muZQsowgXzfI_Tu$}HQmm4SyEPiEXNcH36s{2QToUh=|`##bAu>Z zbJ7mTtcW*9Pj^Zo8dJtS*G(>ThP{~e6V+J&xa?UPMOZ91<-lk{T-^DAw?9?KtI<8u+G<$XWDd6+G#Lxdw zf$AstaA&2U-RGw}{dclU?=+nVgCOdb0E?LhsqAcVzJblbF`q&t3Rf^OPu~p^!p?b! zc0e*(-ng#R%J%ro>}9@byK(F6&ohg*f`WpDPfe&2BG1I!32MbA=6 z!lTzLx;h*qHU}5WOkH46Y~T7Hi=ucBHxq+(;~!?Tr*c))5eekbq+7o-?{agbm4f9g zwU?jA?A{Y$0huL`!HqTh1Vu+@YlkEOh|Ist{>8O^*UYoX?2?RoK zW-TR~=A+A#KUkqLN2rCm&01j`Mf}!E!hT((-B&ER>M&e=?ImPl<1%%N>h(x2UKbWd z~tBp$&2fY$xl<&7<SB%nVGo{N-Z@2M-3*nALe)IGiqK89#Ickhc(i?c&2!q-}3|hpOGg z1~!FGszENIJnid%$>o%yibVg7cL%X@=ohjSUCEe3JHmO%rR zF-HR$7^VseqFVJ5@=lgKvN%ylZc1sZMsSwCY4yKNo zs7XqCLdbOWIU(Imy_rCdkVLqzlx@C-q1eG@!JqDC1iOSQTUxPymfYXSQ%os&sh*CL zbl+=J?yhiisVtJ+T!c4q|7e14uA~TWPa#xNO>nZ0!)A7})bGuHW`t6`LY@#g|J9-= zEo9PpwQB!3wZOe9W1`YZw8@N{)y-RD5Czo9+Z&xT;7!CH0EylJ%WvPm=F`J z3GtfoLLD$IDQMCFFB__t2k(odLs73~5Uz4-Yb!rbyWF{;D+!nu!A;(`dGN`I@5075 zPZ?ytUfkFXNi^IRL7x`}3s@KP7sl?0a~2YWl=O31{`@5gMAYB5XG{Ko3^j-mDQOfJ zM4CLa6M>OCCLXZlfi*d02Wd)b&&WqR!2#2;0i;KT8Yi1@6FypBECW(+J0S`&AHM?- z#Jl~G1*RRBVG7_!Gec;*C&0e{0&<#$f@qdU!3_HnL{Cb{u&k1+;Hi2iI@1o^mdNCz zd^_OJZ=3LT?kR=YvVwix3?Bl-ckWMqgPalj)kpL6i)H)JhXytAKQAQTooe)nR6s`5 zAhjjEu7ruo8E#`g)0_S-)7hZk$HE&VS_D>`T-V*r4-}d41RRA?99@axeAFWM+vmN_ z!~J$r!y>D%J-R*Yc?q_MAkdc3TiOD3$-5VQZxtMwE&*bcG{F1 zZe0`IfgumYIG+{Mr_bsA)YKs%@t89BKJo8w<8{vm9KMMJY;T4MSYYJ@L876c@nM+V zO$0@#zn$Vt$)g~9?`nA1pN6Kj$$jCgiV_vHWTAx!KFS)w82I@&96_g#S z%BN)-Aq`(K#SLj`zS6IH!aNWSsmX9bMXBHZPz^F3^`7$cZ6KL37_K&-AEiedmfe4A zs;rVhmJkW$=_?SF`3hHIar}{C(;y`GM+++%l#RuNJQRUTyMG2t@bfp~G9hK6oK3Jt z`s?<@ls|osFA0V6{4N|-b_Q~?#T^9U@)w(6DK^`P?3#fIcWP1l%)VTs1`_R32P=`D z%W6r5Yf1+$o2Ycs98KwD*y0h$6jFsCE_$f&;Sg#OJs*lr$49{~BKMxxdicih`?nhC z=&P*lb 
z_m=#^%MSnK|Gzh`T-T7Kdpg8toyK1-jy^gjWQEuaCJ^1ZUGtv+BJB%n!mXV^Yr4xW z|F-l0`Tzb%P_3?`GwV)yX;%sOis0t#_k;01(J(~?fA9S+2~D! zmzIbjfG+QoST6=fEpSi3WOOz})bc4SoP_Af5z6yOf!Y;JVQ^m(Xx_22Puv)_$66OS zKRTZ<-?XF7>PRWgR9Kz7k~`9dpMTnJ_t#zaFK9WfJ8YkyihN4%Kx-eF3*$@eR|mdx z+$rZg?|*P~S)Te02Fcr>=_s}t(;<%u#_kk=gv4k%#=W!)lKiya-j=%Kpw|FcVGz`S zYK6tc0`>&5AXHoIZ;li$>II@(yqrwY7~faxhPp;A)8cU}!$ceL0)-psxhihqjtdlffSi zzUWSXqmwr^I>BXTSFaL8+=2GI9KV-G60U?0SC^yq1x5wYOEh();DnpbIJ$eJ;@S?C zU!grh*yJpJ_==$!-4Bj-w!V2GgXZ-$no2kTky7AOYqp;v4v0$ zETEXEMXfA7W+cfe1LixPoslOWtZxi{`SLf&$SZM$hT~;&TmZ6|og-ne*JTKbya3uo zWM*R%7)041-qnXi3F&5Ea1cJLSio!|hD||`?x295RaIA~hampo-JOM92MU|jM`eaX zSa&fkOKMycNXBik%*hD+oe2G8u1VXufeN=$SI}22LX>?Y=rC|~@vGa*N~|7}!XI7& z_#NI>S?^Ev9zMgJ`Qeg90Bx@63wQXz(1q}}DuZ(8Fn44wOmb6ubS+K`8 zHs3>l?YN$*$2D!>>4&p*Ghe6e2>NHx3A;zlT&=Hevx{tQdm_Z;XExAsRg6aUtFzLw zpxn5d82bS>>3G$3%*qU<{noN$+F{PJ_USoruQo6E^Nmqde)GbGfGxQ}50889Yj?@v z&rRrW1#1ceG$V%YA>Dx$udSzcBhK<$>os#w3)x=>ZO&xa$5<$)_Ysz`4*&7KmF7jb zu_MUJk`XtI+eD`tR#u2+*ryv_gwW8$D6S2ZrMr7Y5W?Fwf(2pb>=3m+@dW5)xSn{x ztCI**n^uEO(hY|p>(nZ$tnm>t@l5fH+PI)7lI}hMzBY7*+Bov^@^X+ayA^@xXc0_M zf^&UqJ+M`V?IC5%vv1gTpJ_d|=$^T{s@3zV>yPX$bSaCdZR6dOjUWx0-3~DLz zubXB4hxfUO6;yn%Ikvbd7uSY9znv#-DRsgrtA}$60Qsy(?qV0~r|DiaPF`uBly_N} z+-7?@$@X$>9o%kM-9T)g6=_?{SaP)##D2whIUUV#AN4%Y{I->TI_jmIhhe9QIthGHM&s1H@XDgR$ zV68*@;%u`#x&9FKMGX~71Z4EgIXpFs77|vt)4AnCh#p3bIaQKul*HwImU1^PS)gvK zh~C)-H)KYBn^1UQ>z6**0Pb7sn^8uhFKXTk)a)G9rHW9Wmbz@88y7Q~rC*AL7I|)3 z2F?MIhPNd4^~{mm)scV#NJcBIm8+F6OqP%s?W)dEF1y2mj~OFfE+H9Y#P^g*oR2;x z8J(xTCN#C_nl=RAPj$z6*Kzl7NH#o&8RmcMFMj`0Jy~M8D7VL{=uGh{cuY+n+SniB z4oA?7S8f*lIk8PhXm2wYtI%ek`}Cxu+$*-@ku8#yDZLY)EHQx2M<&9=MMm^AwF2gy zuC;X*yRq@ddz_W=@88vh=a0ZJSEjvTNAneB}N2dNI#eBn7DSfvFm&DM3E`&E@$tc%~Tva+bV(5d%7wqPQ#P> zyfw=ahH4f2Zj8_fJ6Nhmg6rWmgGPZz!KLZb_=S@jF5VnHCS5Q>^roYz?zY=UK&80l z!oVb=qzDpBo6vFypB2dy+z)_&lbERgiVf-VT46&YrcLKu(9nEqI)0bwd;23WEgrkax>;k%H z(XDRmYZx;u_DM_L7j+@dSFFa1yWfTpPs&FPh6z=inhvq~=L(;FMM0)|x@ZCySfnBl z-Z0w@+5GI5eAbv1hZZyvJx+U=Rp#rKG&@=>O*AQei=ys}gJ6>-;HX49 
zRJP869GtPolX)3gEZI18lm|u(93`ihPpqtWyHhJldwajGPN3Ge5C!S6xO``&QJGLj zU88d^Y5eJ5@6JEnphT>WiXU>97MG>*(X(zsKPzd+q8oovU89{nF=5!ftAddQP4&d- znHsdu3P&m&(gmnto*}!lG|Wb3O+ZsMO`X4CE5%#;>ybl-5liKqnJLU~tvXVy6Fin= zl>vX_0d2&kjk$bwkMnEuO0y0SE^^yIy^gZyJkm5*!BZg{i=*nu+TdO}1nJPOS*+ee zcKU`r&Di*85rAy!uHdcxz41bbwAfy5(M!rEoWt8C=3QSM=v}^Lw@)BB$TNUcSHFWg z9q8iBnI3A<(EcDp2fB~;zF{lc6ybuuvV>XDJs-u>o}}-!6Ltc03NfazXEYYboPDhv zsTdv{z3-rahsbF;m%+Z?&8G=biP&`Vuu$QmSDsUvf{T5f_2&Ue7w=A2$zg)%^bJbS zye?c|sV2{{mE{Z54q8qDp+(%E5m0!0czX*AQud6ciDh?NZh5Oi#=k5Bc%$)Y-WiV&I-(XeF1{duZ zgHMS?pH_dbzpwP*=`6Q8r3nX_x=ok;pG-|mqEm+)UCV$MHAG1}1eQYL)a``^yd;cI zK+T_=mn?5yK@sjK(sg@Z_+TlrKMv++C2}c$f6Fv&?@CPZbllHT5mgdkkEjGOpExi~u+h?VHQ+?rt<7URLcnB#cn(plOfLD_P#GT{B^fu=yR!OHZ&MJ`D zJ@sGbJxQNf?6PP$(wDdod0b*C=X2bo$>H;B7HfoNa9gM&$Bfpr8A2rW;Tis9n23;3 zsOb)inIfkUPX0d|YXY^5aalo^An4mkQjFq}W*<$NBD7%GA04U%tdY)>=|2o3T8Cx9j#e?&kvU zLB3i9#FlEgNen67X_pI0NJy|&Oc96uODgxoy%mQ^@Lfkhs>Zv?JHZe}{(5ss3l&%n zqTR`kyXO@f6{7@mq2#K#Gr@f-hQ5MQKj#CdK~@|3k-=T;U1}#Gvu_^^+UBby1Co_q zUcJPhm<=8Bc*AM_yckE*D=wYUzSaP$1BgOwlNV@1Q=!98ycq9UhlN@tefZ|E-I}1F z%>JfnBmWqNX8aejKD+1ZS$!8!3Ur$^76p-d6b?5C$I z&KIS)tLdYyMCeDBrcMPF9Q z(09P3#%_Pf5gVqd@GtZieh%wFj->?;eAuz)$|VC}N?}=w3pyu0;GoZV1fe?2 ze1-jSRK&2(t45?$`VnaYv4&Ezn;RNGi{T**Z@3zU{`Ac_L@&%Ft0lVKt497=zU}ODlAc1?D9GyVmre9p1)San`^6#_azqu7?q^cUSe`rfg zVFFc^o;b&n5ioaRe%GevF)QUy{* zH?0#A&ew0l8O3eM@>?c*7r{sDtZeo$EQ{@@dFwXq*`LY^IK6z`xD-?O@H6$$jTtE2 zLm`Dtw1J3*6{+gzB-qEY9XtTta)VrAk`AX$d%$Hx6A!}@> zitb(xF3l3hf|b9BO|QvMuq$&?D^a|&qt?@Vb(kGi)({XR=vVP{tuKEZQX;ib`FcrQ zUW&t2?FTFBgvj!;gbhBLh>!6YqW-xe;04FoT!3v9NRO6i8-)AWQ z^xR3ErFtjbB=kh|Ry}eo6m6-7bV^wyGz3@UUbg;+FicAL1q2L)l3Z5#E)Hb^nS>wC zH5iY$skxnQ$c5(`lJQs^0mHC8SdK(iFGhXps55ifuQFpqVPxOLnzp&MHo^x!k`(p( zQKKEeeb0|r6)p8f`XZLe(OIdOUjvlOHVt9w9C_}d<2g{>Kl7=1DD+NQ-FU4-)b{EBQ z_S&Q8oZDbm(F5|OMRtr74pFMCAGTNOPC_F=DoN6MA$v_W=&dMf;nc)R#GU>f6*H`h75u^F7P0O$jNy)YB04VwYw#b*bGj;y!oguqQVw@c3 z2*??9*c0nGtUNgg_?E^o?)@v*P1xNPzd^t zW`rVlUkR5-%?8Dt@Kg4zXI!^-8k|8+#RbW1e%>qTaA;R0Y?@BfMUdPo-$V=&R=wi} 
zz>ayv6>7VM_n00&h4GmLbsuJ^Hdz!Y)qd787H2p8((hP9ZOv_hmBinxV)BS>2QCn% zd=-gp%iUeL<%{aZwxQIr^)Uu2b6`p~0(94L*Mkl@39>}&*{ywU&;pAuB2{}e-N%P6 zgt^aKo?i`6jNRT|tHT>$GwF07JRdH#51$?^EcMLwd$*D6>g>fQ4zwRz#$ zrXL?AR5%CAhZHvIU<%#fdOy0Povhb0hc^fY3V6gAc5*s6%kX1`B6Q}tK;}xu7cAm|ph6}?4-67u~IAITnYyI?+D4#1Z!t8@% zldlAdIQ*6DHIB{FZf^T@SX#&S`buetNT{70?X_-YeKml+C z+|Jj%k7s#a{yoyy2!1w#jez)sn6+t;M#@7BRGZrSjXpAT=yKz!z1(}C41veg zI}-%_G=tbXRsa2w^eH}ZR?^Eux8&8=-RfqR(KC%4@Q)oTrDYp1Hkat>>1$)u)zo@V z&${Alf4cQw($9JB5)jb$`P}zMtkrt@)#peD=(xTQjg74q+%4BWxOj#x|CRWsdby`% z{2?0R@%c|)l^#`A{SFKyS~ZMRFLf`?gu zmenOW@BjXH{U2V*dzavy?KHb;_HWFb|5V(;Xh#t|B?H1qEB{_b^bZS)kKl<$x#ZHP zF-Vz*y|({fKJedsc}(K>LLd4=DE7?1S2Olx#WBex*P+bE?S5XHll8E8x$Yf$u=?R23|_tqo=4dWHx8fBd3m9P|-Andk_9^XC4-$H{@qOG{U1 zRi1!w`Y`b1yOHkQ042vNaYzrLp7bR6?LT@V@l@&`zXx~c_EAtDehV{CrKRO%MZa_2 zn|4UKQX)u%Nm>My`LDf#OiWGXp!Ns8%55N_lH?ZYBHbdKla3@D7WI;D@*U>mUiGc) zzAZj%l+@!Vf$F$_kzLhizE`Bv{wZIR^>DxblV$IzrrHSni5kn+4Uw$@xvmT7RJ*7t zwS4KwQ&#<~?va*}ZDCXO+%^KA)v%>eR?$*b*ZfkwvYGhq%Dz5jGs3mMlo;_Tzptfs zQm=+^DQJn>C47qA<$A>{(&?7j1_^(T?f6Nhes%2>ic~L_S zy)yT&v>nvto2qZz^*LrShIt!?CgOQcn@Wuu3D5mx_UP_e>){4)|^V~kf8OJR4?6dkef)6#@Y5k{;t-9pC z#ZXP}ZI-Eug6;s^FVT41LY@GtSiYt6gg*xtRtZrn!RJ97M+mh-Kt#4y_mM33qgoC!MM~O+#YxP_l9;)(g!9%+6Xb?^F0hr(|;>TD5Ad*3EE*JUYjqBI1Poa#oXEmer`tunbH+@NByIz~vLuC0V zMy1y@@M&JxX9ti^(|1%6R*2)tPT1C8!XRVM}>z0-Tzh>`E zv0Lyx_f~KL5x%_vPCQGyzO4tcrO?`Axo~`F|5Xh*y>_o^v9dvX`Dz)?zqMhX-IYR( zVa!1xQUoX?8r0RI$rCt3XPk-BDh#t!kkl}Tn|5Qjqd_eUjxza~`PZEw6GsV4^BjwG zOB2{+mt&d!C1b$R9+_mF=|o-Z0$sfEOlN8cZmNG+8Ku{0kAx;wylj8> zzNM*VbE$HWYK-A3zHt$d+v}m+2GAF4;ge0XiMfNk{d9@o)nwpvre_=EqXtRk{ z173*Ex_zwJtmEQ3@R8{-P4^9)B35R)?gGby9Z@>h2hhb%E|*#lF<`1qN#I||1IyX6 ziJPHe{5(vxi#Orf<^U>%f0oNaNuM0F1}A~Y_!g#ou`GM6Vq<2PTp)rm34|*WS@CVt z0NM;>5>azJSOd7<82Y=83J|79rS}D+Xgs+3hZmVp>v~MtW$WqMmP7);HHDlvxTgb6 z3e~c;awqmn{_+t?&6o7-9g;m*6F@@czs`B#wo7XpDcEAAlSh!`;!IUp_wlATE5C4U-)Mmtbc+v3;t0J9& zqv|8NoFr#}?&piGtM@0os6&trHb@uM)0#qtO_aED&i=Sts_ns}M=!U18$Gsm;z678!uNy52jy#?aXVEG 
z_;e$$fM|ORIGpP2J2usQC{2|H3Rp}0_*-%tJkoM?LC#HP- zqmoiKM?>{lXiWn3yxCg?=nKTnG6Y_G@L>=5l~76h3>?jafCLMiL>E}u9_)+bt(Z80 zIb~oXEmW#>rF5g-&D1n){h}Vly}^%$ZadI-@TV?@WaqEu?a)PF0}w$h>SlKe+(LU8 z3$%d1%XqdsQ^~dJ_xSFOU*)WL_cHkWLLk+#=gf6IE2BGiUNSkfUQ5Ezlqc*(BOLSg zP{=uLp}gvPCGy;Ks5cC1NycSXcrEH9OlFbtl@Jh|>>KMFR5g70-cGThigNDUbt+eh z)sikK8Xg~QK;qBOKo9dyF|sjzh(_c zMZ5!ao>POes5E*=r&UOh7~)=-l%Sgs!K|vUZ(!DpW6M7PFgEI!%viNASzLiac8GIn zOO0aEl15$}Z&+&RlQ>SOk=wj0jZ_#v<-0v(;pzFxLcfR-l8pJN_ic9m66?-$Ud}@* z{$;6*JCj>X{$+-5?ccZ;+h?6_=Q0|Q7r+}xd=p6*Q778ppPmNEfVET1}W1ygg zGhwLcHO9szW`#ReGrPc%bjje15=hyRMzXhobr|(ooW~Hdevra3Xfc<(XinLdS?-%& zq3zYl(t@H6FL)<04{xpSu=GVgyQmIpl0syAWf1_o*8!&2h6T}+q8A8tZmxE#hx9zz zEC)xhlC2t4UOGjMqH~9@3(o{H*KPFV$4OmSNN)Esrl=9@m!)_q$kKv_^v~PuH!{H= z3#PwI!0WC($Xz7BD?|OFd~Qxbm?17jK*$qdk!iK@ePm>8LrgL?qEH+(isJ?7LSO<( zF|7UTDln2`wzPMCiV>yBbln14vExmLmON&EJyF{UQ4 zk6@yvR;-mWfkq-G#9%?f1Y8=>ti)`&%^myiTxb5j^q)o44_ zZO?o5zf_h&Ens95WMYZ6WNXBlH*FzHu_-uX=^*?fHQtI(FJJ20&t*f_?d0EE+b`Bx zt)_xNqi@bHT?J#&-HNk4*ih5Y%@oz22NPJVu+0|J8u&W5SD2AUPnDb6ZMa_@WeUGV z_E+@MGIV__t34~i{HUQ!%GnE7S5`9i5lz0i8XMkDnbuNgenr2aqxGFeb+%=1%3yMf z>4g3LZ>*!1ohYv1Q^khWed$cQ9?2daZ={jGwN<)YR=1U_E)(^sGum44caRol^afSk z5VC1c2eYAH5MaQwLsIZyL?6H7XKlGLetw5zBWj4D;5}w!`h=~pCxAUo>2nZ)si}w_ z?R>`)4u><%EP8#CU6k;|?QUIv@19U7AFz}r{Uo!*AZpM4l>np7PsC;zy>S9gD%!yV zWS6l&za2K%m`30w?dEk>8!blLV}Y;Qx}x(3jihiMYtCR{w(nBBE}{c9NI)P2;!uVU zdMvjsAqYAVRKn4=Z@BnDfnc*|?|AE?Vq-0Q-#y|1Z}n!N`?qbX)ov1%taQAxXHEZu zd<6SMnNqQsi~UpgNoVRWc;}fri@kTe);of|E(6=F8Cn1t3HO6L7meg5cL|){^EUV#p_`vUES@WEA){+4-K>z1f{jd8&h!WMp7$S?y!*T zyB{tXRWCU2XJ#(-x!1-h-y+~1D?hE~SXr!Rnw%Co8AHX+y*lQUs9*dns7g_0t_7vu zc@xdrlvWcU96j{ObuJb6(;!tfy=Zc5!FA|NyAJ!SfFp7>)HL>l_CM5?rbT3?*4~ms}`ep7D`>#&)~$} z(M$slA0{kpUn@S$MB!UXANjPt#C2x4%P)6LGzxca7N6_fs=FhnrGyLAk%@I~+18vC z;F^oD-zX8H8?3UAb@(oxDqn*O7yCeNppL1Vil%3~2c;_~`bB12FW&P(AyDb0Eu;EqN_D0fpREm;1E%|+^YTh{`FLdu|mJ5rlK zuscgB+0PHNy}~N#$f-Jo{@B24kk$u0xon3DP(Kdg%ET(UhY;;kgH)7T zK*Xb=>>|6Posx6!*Vg9fv)jCuVB7#nT#-moK`AwzqY0#)G>0lc*1z)<>PcO(wBNs2 
z;kPmGigr~XqgziIGp%e_eu+^83jDjFT*ufzLhEp1rI9yD!tyh_3a}v3I~d+(3WVU+ zc+mAqVYJHMU?`V&(> zlA=7S`%0{$8cMOq99t&9Ib%ocSTF|sEh!vF)T`Bu+;l4vluS++WbRj;z0SAeQb{c1 zq}A&DkS?^dEUPx_%KK?|x8Mb2@XGanRGYNw4 z(B)J|&YY2jqCe^5-9p2o#L-IeL(M3~PR4n%Y1Kj7mYJcnPh=~1RN=I;sWpZ9U`@nC zM0;s?S9*>|*ty)x-e%WvKRcE(Hl*id4F235BJl?YG*v_QFN}U%+WF3mO?7f+VJvtr zeYMgtnz!X~z5`U%u`MZj&qB#93H}>1@vBYL8U01h)U56~dTW8VL@#v&dv89jfStF^ zQmE7ew5Q-67U8#_xaSJn?IfaL7>u$d1^72dI@|Ks4vE+%TAJ?{?@pxM`ec}I-US_` z`8dh=u`JGWs1Rdb z;m0;e1$u*_SF*UHu;`9c(>6!?|L6i;6sokVXA4T{hl~H=)~f71KKd^415a zFi)?cMLk#;$sc;TjyY~t-+#Ts@w;8(^}14%pL?9H82ym3O&i0Glep|~CY6>C8|p?| zvt!&1HR3gL@BCZlgm9|50r&HR%w!i^_u9(RSI!Y5WkZvGw<${eh37*Tx|%6&&s&PT zB1P@E^Kh?bwd4vLtbHKW6g?WOvU8C|-7U%-?R-3+hZ8naCoO&s?4&PcVD9*X#Dkv| zS(Lb;vC+D?{p8Jh2kX>4w^EzZ^a%l}re0cIY%gk8-y-Nms&~pn0>uy5#e(NyQS75E z9A@Kh{u=dDtts3qr^^21#--j9(jkhl1BL-6=bhzR)Og6`6;Z=7Yjhy9SZ~}pL9@5f zukV)W=;)Z5vWnkVeWqq;XxQ2^H8F8x-B~yUGBu730Ub5&2dwwfqt%;p=XT zZES>Q`^F^3-*yJyEd2Zt!Z5 z&d>I-6b%SPzh;YfuOP=xJ4E{T8Qt_uQrI|HK2o<*qt@lU)#vTX@Wk(tQ?yL%C#2h% zdmnl@uK6c+RKJm5q#ACXaLP8;{>o=2_E|Wl&{Vm|^WGG<+zW@6sM?}F?83+}B?q&1 zoxHPW?SN_Dbg%eHH7z9_mh}Nb?T=zvlwYT#EkS}^l6y6d8WVQPCnt?oTplAdd)A%a zrbY+WK`vSq`|D;SojhvQ7Io7Kk8o`aRmz%`=Zg(f=_W7lTIiZ1?-A`0Ifw>}t7@Q8 z5=EMR%bE*J5+41h3gwkGH6>0H?&s>VZjqyS@@@K}%VLxm*JxD^*2p5QnV*CU;^EBk8%}s2mMEq^CjI<1F!Z-^fo2LTY;j z_oO%w+LIouYs#%UM9vaYv$h>1%0EDW?ERUcsxpVY2D-Muo@g(0Cjj8s{(Lj|wTQ0D zU44=0gGxqW|8N9xFnU4A*<2lU&Rie|y5DH1DZ+)#3&ghSgwG49*H}4pbcA|aq6noC zZEw{z7`P{u^dz+}*2gw`+t~D97-`K)F(Ks-&77kB-5|Lv!$Qbd!i71vYMV#;22|TF zKNP=>8fsqX(-KyUJLy{aX?!cYOx}5<^mKgw^zCnMrwxL4&nNCwO*9l0&dMupYPWvB zRpLWWtlgPlm$W;tRaxi0X6;Z5!MYKV@{>)VLHkfKgs3!8k{xJp4{5m`L|?d#(z9nT zDa8DkwiFkxsO#^U9KYBF5LN{i|EYYCNh~>b26N3CIk{?e`xxI$ICO^lYSbd|;im%? zgpu;Zxeo`&w)N1%2$?m^#s!J0a#1hJHDg+tO7Pmq?0N~^sL#0ILGzBJ7hw^z1nwSe zK$og>DDMLb|M?e}f~{(;S1X1pFq;8=@UZlH>LT)G!Y@c|vL(R~!2GJ%Iiwv>dvC(w zwCg-CBU1Deh!$2O!Ayk4$1Xdc`Msp^GenEZ#rwkUp;qoVZaMM!ha6?}tJOOiZH*b_ zjQh(L1?1atC&VOl%w5`qZEWuno)Xe03GmXH-xQNQ-pKfExT5{5H#Wp-k?+k0PP~l! 
znJQ``YlN*K9Q&mA9>ZG3<_P{xRf(X3D^E zNE_k*b|XER)-R6)9AyO!sILqQf;u{V8qrnJLo%e9%Aeca8ZiQ>p#D7(z*cEQNb4DC z`q21A3{0;}ap-gI^Vg6zQpX?1^!M!HeI$EZTEqC)kMBgwEmQ!_0##jW9D_SsyEcvwx+Jv*|jr6AL#dCuOay6(w~@wMy0pgtYh>kxlK*cHBxfL!5w(CFb6&F%rb_JW0o5cUO^$N|`eDiHa5ktf;N zVG^XUjR2mQuLvl0hX8|k6lV{}Rubrr?%A)pk6}6kb8aRWt)p~kX4z^PD>|2x;U>*L z>_xUpAAlxSihLCB)GyFMoe>j=Y5yen`0L^HIHZb9?_j#IPr3nH@eGSbj{M>YOhUSG zYf%$52|d$NUtO1Adsrbw%kxtEp@6Hmr@JWur*$Infsy{zKA`UR@j*{E_+bC|z) z4gcIAN-RBK9M0>isir5Yt}2;VIeVo;>Z~HPqK_NMWhJE$08r-dujIZI#iPfD^mAPi z27R;%3SeqWn1roXb)}>8p<0li7l9o4MRs6TgeZ~F<6~owQ1skuwbWyqV{~=4&GiF? zR)5}1_Yn}_^P4HceJvc!Q<&CJ9cvd_rOrgQ z*dt6gM(sK`%ZA5;^jk)GZmaRhX@cMQiNZ8gdpgk#tA-hVuaY|`T6CWmI1+(bl7ylXZ= z)!MACEN-GqHB1d%!uQnCn?#qoachB{bUORcyJ4JW{F8Ub;k71{5$VBGcO2K~V| z30hMk0bb(ICkw~6SSuD$GZIAc2J|KR=$|`pnx?Lw32#4)xtuxFiThKqAK3(p+?8_s;sx(trsITFYRwe4F+bZ#B^+DjbBc`Kc}6Z zGV$%9&g!%vr>?q0S0c~Z-$RoYs>&g!G-HBLcyr_)Za$t+cSkXE7gwA@RV*7eYT-P$ z6O5AZgfg=(Yn}C1YpJpc!kypm-nwbBQsM8Rc&C)rOdR{>#;(e%#4`uJ?0A>m8pM5m zbyO^uk2CY;xfW)@agD48?4U{&d7M928|Bbh^0)<;gV+u*Ou;p2)Q|NckHdcY+XeOF zthp!e5ob>k0PE6&YF^d66j%Bx98a3|gh9p=)BSeX?)M-6utxDL4_FNIu{w%yqrZP< zb2%18BESt&*02rb$<{-Q-$CVc0PRb+$XFKsD6)e-Bd=nL0jpg;H=`g0d@x;X5K8l0 zXTcq;p@}tET;fj4Bi|0cc z3W;eSOBrr6MhpH}MQGymaVaJ+;VUv?iaJQiESbx^t}k=00ah?LmyKtT_%I}4YQfA|5iq(0ZsMF)sK916JBG(`8Dw^@py{N)&3ThH< zWPrZ`vn-fWokmBbm$Dgb$f4pp_M}2!9#Xv+n%{S#K86;#9aYu!>?rNkEK8m1Vt!C} zVJ1rP)&hMJYl9Ao1!3+w(Q-DrZfa4HT{wSJY&;J!wLL3~uvR^?En_&6k22nR8f-DU z-hJ&G3!@_7Z>SZ_J0Dy_cU;lo?6CXhJUy?xk^E7;viN&v>ggS!QY)9Ncmel14RJn_ zmu1&2si@6vls-yfpy2%y-`F6}5qWW!SVW93>NR=)M;YDkU0gYt7=EPX7nK#N^3B9? 
zmju?;*(j_OgAG}@mjAwUn?AG6PhIsq=~ro?nxdHY`1MJMyy`+G+Ub1ZnG!y&(3lmg z3t1O|0q+q@{H6NX?CnebiK|)p?E3WoycYjWY2?!lAhvK(3u5>_AML$zoaH=a_k2FU z+i>^fU|N}f$zd&;m=8Svy~)4JXMrO`>zSy=!RTl0Udu?{qe?b{Q-VvYoLH%mrQ>ro zhz!|->_~?-T4o;bM$UU6jU+)7Fo*$>X;(?unPa;TKmVnq=)M5*p4kr%`e5<1BcxXsoe&y&!cNi;Av_@^1hsJBRyK~^x9UGp5vd3u*wR3E zxkXVZPAb4WO)=zvD>Ct!?a@euRYo4BQ)Un34U``;twG=DjxZ1oa&=4iKcoD@1h1N} zFHPDHIrWKrX7~!uXwzt2508wB1=SO)XKJ93PlcY8DuoLG z>*OfHL7Pyi%F}WK(&jI7L1#$liN>Y9av<@K1X&V&4=_@XCu+3B^Husu85}DIfszRz zcas)TKu1*rb{=?fLtrw@A?7c_hPO@bmFQn ziO&pq+t+9gpS>!Dgt?v6&O!a9DDp2;2@UcM0x1XUYfjdvhl9z{e@TUo`#{L69CXi8 zWW(5|f@z^PVL`_15XvbQ(xRn(9qC}?e+ugBIJxlkaO6gAvp4qIVOCbPYR`P8{>u9_ zb%uQ8NSj;@cBt^pz3g4or@qxb(%I_28xh$T#)BnZa-|rMclLUmp3^&oc^Y=A-GHx( zkmIm4Bqlvqk^d3NEB)7}X3xHdr+*7P4_b_KS*(E@^}?k)fwumL-&}|choQGXI{(-d zJP4|yCDD?FUl1>Ib+6nq0UHIO#@8l?y7*QWTJQ7Ktc2bK=MSe1#dSn^Egfm*jsPCbRn3JMJ$lSWQ^6geA8qK^c3U+jn7S2_W|K9 z0uspOVuh`4UWBymx4=7dUkbH@HV;UJ=R6nMqB&WIN?IH>A00}<396bX^*07Ut#a>%(4$5s^ zdz5PyL4N;unD<^y;?^NuKGV-l;eV-!;xk55W|Izz!V&Slfpt?Y@82)Q{Ljp}J}@Z7 zo{ZB_6HRUds_Eo1R+5gkwMPTG87b7$)HU!(_N)>zv$oGm*s2KE6@J*XbhRlw)jl$+ zVLq5zQM}V}jBo?mws9#0`(SH2oi8HRJIRmHqbZ&i&hN;%9>!TYuFfyvdqS9a{kL|Q zeS00O|BFHMPfu2fRN|5(Ih%?(#cGyVm#RPEhJDy)GC{adDVq_(Qwes}M zk2u}IBXDFj5R70?q?B5Iv-<04sZ?7WjqgmQ7%>`~$>ZHh!!Vl|M@m?D4&2-F)fXvv z&n?w{rj-%AymB31hdL7#^Cn0(?0Rf`wo402pWE9~q^09U21y`15bS}WR}u0~ z>I8w%a`(LFWk_ERLTD(!XSFtMGy__XzC)W3LO6kVDjP6#Dw!hCfVQ3f1c3IgcLJ|u znEy#PhX^#tWl(PbPac9CKpd>sw-&vP5TXeXA{}ZkyC_8x!J;L3O};`gLBgPRW0q@; z6J=C)yACY*#vh*TH9d1=D^waa`Hk-umz)Me8u&*folcuPx57|4XG>@SXb7Wsu-W8OrkwV%r>=K;-ju*yR;1<5a7C3e^f`0z$Nuf02+> z#fm!*A3oHuRWqSGs>^4#ptQieb zt;=_(^Y%&>RH&3H>0IQ&PMcJY6nJ`RJX5pYDV@^)8B|mBc!S2=0dPDc)vt|(tT^_& z5Z4NXEJwOUhJQBEpd{^4eRSF4I?{9h<4C7O3KY(l5060v@)KV-!K;mIv|nz_W+b;p zVIlGSDsJOGk>SADcDA*i#I#o3v)0zuQ(G6OAaNxO+9N{_nV{N{8>#g)0lzq#k)Li6 z5YVLp(eps^v#kPHL8IpuZ?$qJ%Fu4KtKF=kk5aSt zg(EJ{4wh)p1uHwz<}tw9ZMG+J>qMVA%c%6*ZM5@%xjkgMyK?J6SpXU2p}>vM0~!uH zbP3)(>0ovlJU-b8J9I~3Z(^akl8B6T&8n8NxK)DgZ#fY5y#$3Y#Pw-HEGc&dXnhs- 
zgte6+xrNeqg3nnUZG-2ql;3t#3S-#=8`;^@gDKeZ;ae0_f;Z&ws4$W$ejR@CFNbYu zIwfnIRh03(r3W-#JtLPS`=k(0t&8?ZeYo8PlGy3$+7U?YEa&@cIihoUHT&Qr{5)Qh znj^^K9L3*wdGJp=zFUrU-qfD$p;wvoKe2uT>=D82^UI~wQbBb|he6q$q458@{e65O z-I?)nZeeS$9Ww94SWc(wTzcuu3Q$JR^W_Wi#2y>+^eQ6)X31N<4n~ zG&5!H{f^bfU77PNIr}D6X&x>}$CS7Oyo+s53h?&ZL#beIJ%OIQ1BWUpTH0eB)y8RH zyN{-SwGtz0@v=QFd9C}|MFWWoIR zg*edutRJ$diKKR&sTq@HIF>9x>Q8x`gb|yZn)0U}pDqQ5F3C}?>?cVXrDTUqD*+9V z044<99|2qQZ_(6;VRENXyd^wCCn{`~4YZ%uJH>z@Pq*3dkfAogRaW-QS#(CnI6Cu) zdAZf?Klk(lWlZ$Gk?p$_HG@Vg5Tl|3T=&Pm87q8*)pa)zBys8Axrfs4Ou+__WGKEk znsO;JaMOuUs3W=#LE7(<2Q-qOk>&5BRa?^lc7dCsVI= zX(o43zieLeyV;aF_JLx%@jF!(5@9QAN{Sow8Fgi*D3ln%k4TB^#Z*r3 zwB574mMA#WGr+x*Mt*TfIEFsV-(?|czWOC}R>&E9SPk~Ir=9ui-b@skn}3*!Ph7oH z!@<6*vE#sdsOe>y1h@7{MMZV)2|Rd`?H$)#k&ZJd3CJQ16Nn*@O)2i(7qI}HTKOnP zxC-P6$Zc@W%iVi1#5;99?r?ds^p)SessTwd4m$w>N&*d{_;&BD9TJhUH0#O8_^(GC ze-fe4nx_#Y#)?Atv`55Mecp#j}3f{h0DT_=YVIgNcw_#9T%s&uEk=B1i3iQc?avQ z7qYzj{z@GZ(WTO{zKXT$(KvuwZohGY=8y24Wb|DeyUJV4RNTX9b;Yi1&alp z;WpId{NP1BRQBrV`EyDwDT#cF$ISwYC~^EI&Eek~h+XL)hW0x+*Hv||ctLBD)&+?~ zGzApXM~Wmp>xz0r#r^%-b$nEjcm@4~c#ox~2eW@rd2s&8Y5qh6eNVlDv%fhKNe1HR z^5KVI@tA$>XGVUmyA8}0-A+{5*L>DA09~pYT@Tr>#ShkC5o6rk)Q~clIQMOUocqoM#G{FX_Zk@hN<}i%0E(rMHf7{@#EP>~~ z#DGA}VsqNGxTC|V>(g55)tMY`hn4EeAR>~bD^2401Byudne@d~|5O8BVXGRe{H|Be zDSJlxOw>D-Ok|5vXIlo29@EU*9A07CzT0K;K-rx`U#pk$@r8G=I7JQ9X58}36E24Dl>TcQ0rgMr@zmSW~}?x<#W5_r0nTm7L8_fc;{l_VjB0B&h)0*T)1Bc&`@4D;KMB}r3N5j!Tjfr4r2GSeD;(qj`y$ObXyF$_`1DpkJhB zpGiz}#?(us6yYVtTk5+EqgeO7IAMz^0r-*fN0a%r#9Hj?vMcC-KW=;6Omw0+Wo6q5 zP?9z{@IC_M!|5aH(Giv}1r}DU2x1Z=&foWFn``YIA>Zd-h&f$GWmK(AOt6vEyXu)@ z(X0{E*5@*k+VbN>#l3UF^X>*{Iox&4d5!Mo>GVCUo@Uai@8=VwU!21I za_FrzXYh3uZ_6?nZ`=DX&$=X_zut}JUou#E=DxkTzAj-^v%Cd8`j9=`dJkbY`Iw%_ z*s@1$+UtqGke+FKocaA&%V<*%IH^4CPgq*APu_EZ{NO9EK?7)hn$0T3PAg)K@myUY zX;0jMclLQ%s5B|FSFlE}8qtRz(3Z}-IzyUtMUGla_c`&vLzdS?F~X+6HERDRWR8R=MJ zx5RxTPp7vsN>1UYt)Os`b{cW69bdL01ie|EI{BlPI!5%sp!?XhDKtJC=OIsV=(HNy7Ln}G}ti%=f?4q8?gZ_=1^7vKFW;SYg@K;S{`^ArB8^7>7UM6 
zzrU100u0XCB)0j!S09ipuOhD;)Ab%L(=lIqYM%TdERBOkMed5Qz@I_1OgFxnRaT|I z{hF12?LQaa9sI|Fynw`d{}}aNoafKfS^OSWs>|jKf+o5hg7o0uIa&+Vl4&Hj&LfBu zVX!%e>t!c~$#r^_B}5ZZCMG6;lcpA~AlS5i_WU!$Sz_FQ9G#nvzu$YDM%21lRNJq?Ig>74 zI>v!1i4FFIlZRvhFuyH~Z0aPO^70*P!NPA;rJ6_kMrrbMaK; zh|!Ax+aK#h`_9jkDW2=KdF=#=K+Z4g3nJOs*$YpME5HzATLpdW?`|Hc-?oZ%q?{Lr znd=bU{Sg6XroL`!L7i)Kq{xwpfB2J-XOVv^@9i3xSa<8-aXu*smL(?I`U}7jy~D-UKR# z!w>Q74-zIynELI(W)iu+)FsNgV7fViYv|VZN-w1a$ZHIFp!2u4g6@cKgz4$Oy%jok zdOzy__Eu1J-K{)>m7o`SSMc5KGLyAqRNAjM-sOMzq|)$qlBE>c7tV;GV-05eq55ai zP~GzP_b;fVRZ)w?=?DD2DR~lB@3-=JG^Y}dreL|V=kJxS)HuE~pp8V6ySLAE=iPGe zntZ#}8dM^AH>irS&L&$JFEM<~!`xpawmy^ar}XQ+a;3R8V)%I0U0;EEM6_{jxa7G+ zI{IQGs4cqaFURqY=XG{=etAqRb`r)btJN28DLjcaP1t81F4JUsS+%r6mR_kA>t?VXe0{30pKxbc~;-vjgAmBQBT{$x+YQ4=dhUFHoFAl;OCkE z<3n{XJ}q2WI=Qw<=`SCnZ+5J8X@wy{dL4!o%ddF3wv!A@=`q8 z_de90mfJCSy>yv5!A7hMn{z$kuYH~!UzD%^w|!o(F3*ML4mOQL3Iqi*IKkK5P85?1 zdrk7Y-K9)^Y=Zi2p4ixlo*LL5es7szUAV(ndEU`B7=B+$oq8aXEui0aO50d&MN<;SOi+R8IP%jTpZ*$zstggNp>e^Jg z&;xPxZ@8^Kgfl*vRGM>!G^(?nz7mF(AJo&N{(QP+S=I$u!+0Dic;kQ|+Ygw=-p%4R zs=c{XBlzgcpI1-D_%4SwLnpB9+xuIeZ^Z;nLE(C+Y$yVH(>Dj1AYr$hnAgt@=jJLK zF{sr_9U4Bmc~KR)qSuG(;KCWx@RO1O}AE9!*0 zme(`fug>(JpXtk7yfF3QOwn75Uuzfmou59Gi%VK%(^aSwdS2X|&iS)a?z zy&)5}Afgi!^?Vcr13YU}dvFTZe2 zm;6}9Mys!~3paPQ!#RNcFfT&&a-dPMuPQceV{m|wsq(7X+__;cd?@V`HL993fT9)376dMB)lAKYPY*OgincIjSpD zITN?~ICZ8U6EM?V)vb{`9A0>md~Q(eM(G@3;UP0rbqBR~B@`w`94NM79XE443&u2> zx!=T0KxGNJsbuHB^tY*c*^Y~A0P9`s=n^X{>lCQ_4_6^E_c4O>;oVXTM?@yx7A!zk z_~%tf-g-XQR+v=V&ytBql>y>-r}Em<9}kwH3-Jz^x7Um6IIl}=t*Af7L@+%a^)ZE3 z)+(j?P2t@!@WMDt*QCNGAy|tAi(^bmM4OJ1>ckJ zf<=Cf@ZpT^TY~_HzM2!&;_pk+IHj@yfL+SDjl0pEgXXNOZ}B{}3my#weEWjHoa$bx zpet38wU%iTJt}8zR_%Z_%U^hDa0w~4+QU+HV(dodb~aZ$(Lyb>txHL63E9t^fb)QH>eW6ITDSEr_Y;nIl{=4pL1`W#&NC2u(0^hUY#bL#Fh^OIUkUHe7vI6 zljQj{XMTF%Sz5biRrKCI+UtJS;zQxgLs-Y@UXh%x3U3#IA5%LAig?XYlLV(O;zsEq z<@Vh%54pD!4caJLYI2Ra-yC@iOYg(4bY(^Rgs)t=GH4L8`gH&|j!Zca3%hSAV73Hl z_UQuL4e^i{C?EA(9i&fLR{Z+L35rrr)yrfX{> 
z-rm~vLDvRVF-iMrV&vM+h;uZBzu?4s89Mc7-jP-}_dj<9WbAA?S+9!VMShTC1G#My zn1pJCo7M`n=2N9jPOSK!oPJ^anf=NUa~reC8|hvHNtsvm_d{-XMpGono2aA@IP10a z8M?RK-2CyG$eX<1$$6e6wYd%OuO@xF`qL!ag6>;ZHC{udKb@PuIwG9R0bE}Tgxi=~ zSO5Ilc+Cp_&G@nQ*IX+cO5;~WzaF8axtus!_cHBM99a5wDjm(odQN|N+W+v?i-xT1 z>^DXKXR}O0L>}A#?!4RJ_2+jPa5Cc9*?TG4zaX{U?f7YdjK4RV?%MuunzTkj|F1pg zYTn0h+_KqBxAtVM|G%Fve_Lh0{JvZDi@)Q?SwWGP7i$LWSy%)d*qsHe$a_M+#2H+k zat1gk?l*z)y4kfWOL}W14t=izrm3h6YlJO+y!&sYdiR^*?rkA6>+L?I-EHK_1~s$) z{)_s3u&Ee6_tieT`7WNX zJ^K66ZM`Bz>{rreurgiTzhr9@qp3*?@DfGoRDnzK+wTh9FYC{=%>|yITmAjovD&vc zH>=lKP2RFTZmCZB65Hj=_68*Q>wdfc?zYpwl?+m>-&gE7=*cB?N@PyT z?W~#G=I2f5yKR>l+nA7dv3U~kK!TaT7XDy#WbvU^kH z4$0sv?&Wt@8Ry^I+!`+~oxPB- zeg|lA<@dYg={$Zxdw+cQthqh!bgb5o6Lsse_y4Zl4IFkj0yF{GnF4izUumoEXZm;r@l61?fSX@c76Vr zdhp>~>20%g->)^^9~#zv_3=T^+5b1Li(4=yfA`l~WHoM_PVqSfn}~cvfSA)KcJr9QNSVnjWi{il%N0dZx~* z0X+Qj9<9gBvSb5? zi&C|N8k%`SvNtTA$G%xM{ts~V^|9NVdmR*6O^&XJdwgof5}nLey;CKHFBZ15F$U!Y zq$FSe3JKNT-sZ=?$`7jl1IKhOYlz(s?E?2xf4uWJ{chv3YkJz+#=j0Y2aDVXbw8IG zJdv8oYIP+gSL@|P)gQNl|IWU-wQ~2(y_wa|*#6r+y1nkAa4_VeJGA~xqY~(@XO29f zSQmxx#w83(W6pb46Ss^&q1WNK$gpoSjE3zDl2i7){3E>% zc*wEOJe!-ZS%6)zuUYPke=+60v;!Uvc^ov34cxuH-HxRi>!23Ks **The scope of "prompt" here**: In agent applications, "prompt" refers not only to the narrow system prompt, but also to all natural language assets that drive agent behavior—skill descriptions, rule specifications, sub-agent coordination instructions, tool usage instructions, etc. Their essence is natural language text interpreted by LLMs; as long as they influence agent decisions, they can be optimization targets for `AgentOptimizer`. 
+ +The module consists of four sub-modules, driven externally through a single entry point `AgentOptimizer.optimize`: + +| Sub-module | Responsibility | +|---|---| +| **Optimization Algorithm** | Reflection-evaluation-retention loop; the built-in algorithm is currently [GEPA](https://github.com/gepa-ai/gepa) (Genetic-Pareto, MIT License), extensible to other algorithms via `OPTIMIZER_REGISTRY` | +| **Evaluation Bridge** | Reuses `AgentEvaluator`, allowing the optimization process to share the same `EvalSet` and metric configuration with daily regression | +| **Prompt Management** | `TargetPrompt` unifies prompt field read/write; supports two sources: local files (path) and arbitrary backends (callback) | +| **Runtime Orchestration** | Resource scheduling, stoppers, atomic artifact persistence, SIGINT signal safety | + +`AgentOptimizer` redefines "prompt tuning" as an engineering problem that is **bounded, reproducible, and auditable**: + +| Dimension | Expression | +|---|---| +| Optimization Objective | `evaluate.metrics[]` — a set of numerical, repeatable evaluation metrics | +| Decision Variables | Prompt fields registered with `TargetPrompt` (one or more) | +| Search Process | Reflection-evaluation-retention loop driven by a reflection LM (see [§5](#5-how-gepa-works) for details) | +| Termination Conditions | 6 built-in stoppers + user-defined stoppers (see [§4.7](#47) for details) | +| Artifacts | `OptimizeResult` object + `runs/<timestamp>/` full audit directory (see [§8](#8-artifacts-and-directory-conventions) for details) | + +> **Prerequisite Reading**: [Agent Evaluation](evaluation.md) — Optimization is built on top of evaluation; this document assumes the reader understands the basic concepts of `EvalSet` and `metric`. + +--- + +## 1 What Is This / What Problem Does It Solve + +### 1.1 Problems Solved + +After agent applications enter business-critical paths, prompts (including all natural language text that drives agent behavior such as skills, rules, etc.) 
are among the most expensive assets to iterate: manual tuning relies on engineers' ability to summarize failure cases, and regression risks amplify rapidly after scaling; coupling between prompt fields on multi-sub-agent chains makes single-field optimization meaningless; model upgrades, tool changes, and scenario expansion all cause "yesterday's optimal" prompts to fail today. + +The `AgentOptimizer` module completely **engineers this iterative process**: + +- **Explicit optimization objectives** — crystallizes "what counts as good" into a numerical contract of metric + threshold, shareable across evaluation, optimization, and CI/CD +- **Algorithmic search process** — a reflection-evaluation-retention loop replaces manual trial and error; the process is replayable and the results are comparable +- **Multi-prompt joint optimization** — supports simultaneous optimization of multiple fields (e.g., router + worker + summarizer instructions, CLAUDE.md + SKILL.md), and uses GEPA's merge mechanism for cross-field search +- **Auditable runtime process** — each round's reflection input, candidate changes, evaluation scores, and acceptance/rejection reasons are all persisted to `runs/<timestamp>/`, supporting post-hoc traceability +- **Controllable and reversible results** — `update_source` determines whether to write back to source prompts; `TargetPrompt` provides atomic writes and rollback on failure; neither a partially completed disk write nor a second SIGINT will corrupt source files + +### 1.2 Relationship with the Evaluation Module + +`AgentEvaluator` and `AgentOptimizer` constitute the two ends of the **evaluation-optimization closed loop**: + +| Module | Role | Output | +|---|---|---| +| `AgentEvaluator` ([evaluation.md](evaluation.md)) | Measures current prompt quality | Pass/fail per case + each metric score | +| `AgentOptimizer` (this document) | Searches for better prompts based on measurement results | Optimal prompt + full optimization history | + +The two share the same `EvalSet`, the 
same metric configuration, and the same `call_agent`. One set of assets supports both daily regression (pytest running `AgentEvaluator`) and periodic optimization (a nightly window running `AgentOptimizer`, see [§4.6 CI Closed Loop](#46)). + +### 1.3 Applicable Boundaries + +The effectiveness of `AgentOptimizer` depends on three prerequisites: + +1. **Evaluation signals are sufficiently stable**. When the variance of the scoring itself is greater than the improvement brought by prompt rewriting, the optimization direction is unreliable. It is recommended to first run `AgentEvaluator` with `num_runs=3` to check the cross-run consistency of the metrics before starting optimization. +2. **Budget matches the search space**. A typical small-scale optimization is on the order of `max_metric_calls=30~60` (one case-level evaluation counts as one metric_call), 5~20 reflection LM calls, running 1~10 minutes, consuming tens to hundreds of dollars (see [§6 Cost and Concurrency](#6-cost-and-concurrency) for details). When the budget is significantly lower than this level, you should first complete baseline tuning on `AgentEvaluator`. +3. **Prompt has optimizable semantic structure**. Hardcoded prompts shorter than 20 characters, or prompts used only for placeholder concatenation, have too narrow a search space; GEPA reflection degenerates into synonym rewriting in this scenario. + +For scenarios that do not meet the above prerequisites, you should prioritize using [`AgentEvaluator`](evaluation.md) for continuous observation rather than starting optimization. + +## 2 5-Minute Quickstart + +Complete code and data: [`examples/optimization/quickstart/`](../../../examples/optimization/quickstart/). + +### 2.1 Example Task + +The agent in this example is an **elementary school arithmetic word problem solver**: it receives arithmetic problems described in natural language (e.g., "Xiao Ming bought 4 apples in the morning and 7 more apples in the afternoon. 
How many apples does he have in total?"), and outputs a numerical answer with units (e.g., "Answer: 11 apples"). + +The agent's behavior is driven jointly by two prompt files, which are the optimization targets for this session: + +| Optimization Target | Path | Role in Agent | +|---|---|---| +| **system_prompt** | `agent/prompts/system.md` | Role and response style definition (e.g., "You are a math teaching assistant, answer in clear Chinese") | +| **skill** | `agent/prompts/skill.md` | Problem-solving methodology (e.g., "First identify the problem type → set up equation → calculate → write answer with units") | + +Evaluation scores along two dimensions simultaneously; both must pass for the agent to pass: + +| Evaluation Metric | Type | Threshold | Scoring Method | +|---|---|---|---| +| `final_response_avg_score` | Text matching | 1.0 | Agent output must **contain** the reference text (e.g., "Answer: 11 apples"), case-insensitive | +| `llm_rubric_response` | LLM judge | 0.66 | An independent LLM scores against three rubrics and takes the mean: ① answer value matches reference ② reasoning steps are clear ③ answer has correct units | + +Dataset size: training set 5 cases, validation set 3 cases. + +### 2.2 Prepare Environment + +```bash +pip install "trpc-agent-py[optimize]" + +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +The `[optimize]` extra includes `gepa` (reflection algorithm implementation) and `rich` (terminal progress panel). 
+ +### 2.3 Directory Structure + +```text +examples/optimization/quickstart/ +├── agent/ +│ ├── agent.py # Defines create_agent() factory function +│ ├── config.py # Model / credentials read from environment variables +│ └── prompts/ +│ ├── system.md # Baseline system prompt (to be optimized) +│ └── skill.md # Baseline skill document (to be optimized) +├── train.evalset.json # 5 training cases (source of reflection minibatch) +├── val.evalset.json # 3 validation cases (full evaluation each round, decides whether candidate is accepted) +├── optimizer.json # Algorithm + metric configuration +└── run_optimization.py # Entry script +``` + +> Training and validation sets must be different files; the framework validates at startup that paths do not overlap. + +### 2.4 Core Code + +`run_optimization.py` consists of three segments, corresponding to the three core abstractions exposed by the optimizer. + +**Segment 1: `call_agent` — Business Bridge Function** (see [§3.2](#32-call_agent) for details) + +The signature is fixed as `async def(query: str) -> str`. The framework drives the agent to complete single inference through it; agents of any form (`LlmAgent`, HTTP service, subprocess CLI, etc.) are all accessed through this layer of bridging. + +```python +async def call_agent(query: str) -> str: + # Re-read prompt files each time → GEPA writes new candidates and they take effect immediately + root_agent = create_agent() + session_service = InMemorySessionService() + runner = Runner(app_name=APP_NAME, agent=root_agent, + session_service=session_service) + # ... send user_content, collect is_final_response events + return final_text.strip() +``` + +**Segment 2: `TargetPrompt` — Optimization Target Declaration** (see [§3.3](#33-targetprompt) for details) + +Registers which prompt fields will be read/written by the optimizer.
Each field corresponds to a local file (`add_path`) or a pair of async read/write callbacks (`add_callback`, used for arbitrary backends like remote KV). + +```python +target = ( + TargetPrompt() + .add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + .add_path("skill", str(SKILL_PATH)) +) +``` + +**Segment 3: `AgentOptimizer.optimize` — Optimizer Invocation** (full parameters see [§7.1](#71-agentoptimizeroptimize-parameter-table)) + +```python +await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(RUNS_DIR / timestamp), + update_source=False, + verbose=1, +) +``` + +| Parameter | Description | +|---|---| +| `config_path` | `optimizer.json`, defines metric / algorithm / stop conditions | +| `output_dir` | Artifact directory; created automatically if it doesn't exist, recommended to use timestamp subdirectory | +| `update_source` | `False` only produces `best_prompts/`; `True` writes back to source files after successful optimization (CI scenario, see [§4.6](#46)) | +| `verbose` | `0` silent / `1` Rich progress panel / `2` plus gepa diagnostic logs | + +### 2.5 Configuration File `optimizer.json` + +The configuration is divided into two sections: `evaluate` (evaluation, same source as the evaluation module) + `optimize` (optimizer-specific). 
+ +```json +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": {"text": {"match": "contains", "case_insensitive": true}} + } + }, + { + "metric_name": "llm_rubric_response", + "threshold": 0.66, + "criterion": { + "llm_judge": { + "judge_model": {"model_name": "...", "base_url": "...", "api_key": "..."}, + "rubrics": [ + {"id": "numeric_correct", "content": {"text": "Answer value matches reference"}, "type": "FINAL_RESPONSE_QUALITY"}, + {"id": "reasoning_clear", "content": {"text": "Reasoning steps are clear"}, "type": "FINAL_RESPONSE_QUALITY"}, + {"id": "units_present", "content": {"text": "Answer has correct units"}, "type": "FINAL_RESPONSE_QUALITY"} + ] + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": {"required_metrics": "all"}, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": {"model_name": "...", "base_url": "...", "api_key": "..."}, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "reflection_minibatch_size": 3, + "skip_perfect_score": false, + "max_metric_calls": 60, + "max_iterations_without_improvement": 8 + } + } +} +``` + +Key concepts used in this example: + +| Concept | Location in Config | One-Line Explanation | See Also | +|---|---|---|---| +| **metric** | `evaluate.metrics[]` | List of evaluation metrics; multiple can be stacked, each scored independently | [§4.5](#45) | +| **LLM judge** | `criterion.llm_judge` | LLM judge that scores according to rubrics; serves `llm_rubric_response` in this example | [§4.5](#45) | +| **stop.required_metrics** | `optimize.stop.required_metrics` | Framework-level stop: which metrics must all reach threshold before stopping | [§7.3.5](#735-optimizestop-section) | +| **reflection_lm** | `optimize.algorithm.reflection_lm` | Reflection LLM that reviews failed cases each round and generates new candidate prompts | 
[§3.8](#38-reflection-lm) / [§6.5](#65-reflection-lm-selection-suggestions-table) | +| **candidate_selection_strategy** | `optimize.algorithm` | Which candidate to pick as reflection parent each round | [§7.3.3](#733-optimizealgorithm-section) | +| **module_selector** | `optimize.algorithm` | Which field to rewrite each round in multi-field optimization | [§4.3](#43) | +| **reflection_minibatch_size** | `optimize.algorithm` | How many cases to sample from train each round for reflection | [§5](#5-how-gepa-works) | +| **stopper** | `optimize.algorithm.max_*` / `timeout_seconds` / `score_threshold` | Algorithm-level stop conditions, at least one must be set | [§4.7](#47) / [§7.3.3](#733-optimizealgorithm-section) | + +See [§7.3](#73-optimizerjson-configuration-items-table) for the complete field reference. + +### 2.6 Run + +```bash +python examples/optimization/quickstart/run_optimization.py +``` + +The terminal outputs in order: baseline evaluation scores → acceptance/rejection records for each round's reflection → final summary. Completes in 1~3 minutes under small-scale configuration. 
+ +![Quickstart Terminal Output Example](../assets/imgs/optimization_quickstart.png) + +```text +runs// +├── result.json # Complete run record (OptimizeResult serialized) +├── summary.txt # Human-readable overview (read this first) +├── run.log # Single-line status +├── config.snapshot.json # Snapshot copy of input configuration +├── rounds/round_NNN.json # Each round's RoundRecord +├── baseline_prompts/.md # Pre-optimization snapshot +└── best_prompts/.md # Best candidate after optimization (only if SUCCEEDED) +``` + +Key lines in `summary.txt`: + +```text +Optimization complete | status=SUCCEEDED | algorithm=gepa_reflective +pass_rate : 0.5000 -> 0.8500 (+0.3500, improved) +rounds : 3 accepted / 7 total +duration : 124.31s +stop_reason : required_metrics_passing +update_source : false +``` + +> **What is pass_rate?** +> +> pass_rate measures: **what proportion of cases your agent "got right" on the validation set**. +> +> --- +> +> **Step 1: Each metric independently determines pass/fail** +> +> Each metric has its own threshold. Score ≥ threshold means pass; otherwise fail. +> +> **Step 2: A case passes only when ALL metrics pass** +> +> Think of it like an exam with multiple subjects — you must pass every subject to pass overall. Failing any single subject means the whole case fails. +> +> **Step 3: pass_rate = number of passing cases ÷ total cases** +> +> --- +> +> **Walkthrough example**: Suppose the validation set has 4 cases, with 3 metrics configured: +> +> | | metric_A (threshold 0.8) | metric_B (threshold 0.6) | metric_C (threshold 1.0) | Does this case pass? 
| +> | --- | --- | --- | --- | --- | +> | case_1 | score 0.9 ✅ | score 0.7 ✅ | score 1.0 ✅ | **Pass** (all 3 met) | +> | case_2 | score 0.85 ✅ | score 0.4 ❌ | score 1.0 ✅ | **Fail** (metric_B not met) | +> | case_3 | score 0.6 ❌ | score 0.8 ✅ | score 0.0 ❌ | **Fail** (metric_A & C not met) | +> | case_4 | score 0.95 ✅ | score 0.9 ✅ | score 1.0 ✅ | **Pass** (all 3 met) | +> +> 2 passed out of 4 total: +> +> ``` +> pass_rate = 2 / 4 = 0.5 +> ``` +> +> --- +> +> **Back to the summary.txt above**: +> +> ``` +> pass_rate : 0.5000 -> 0.8500 (+0.3500, improved) +> ``` +> +> This means: before optimization the agent could only get half the cases right; after optimization it gets 85% right. An improvement of 35 percentage points. +> +> **Three related fields**: +> +> | Field | Meaning | +> | --- | --- | +> | `baseline_pass_rate` | Pass rate before optimization (scored with the initial prompt) | +> | `best_pass_rate` | Highest pass rate found during optimization | +> | `pass_rate_improvement` | `best - baseline`, the improvement gained from this optimization run | + +See [§8 Artifacts and Directory Conventions](#8-artifacts-and-directory-conventions) for the complete meaning of each field. + +### 2.7 Next Steps + +| Your Next Question | Jump to Section | +|---|---| +| What exactly are these API concepts? | [§3 Core Concepts](#3-core-concepts) | +| My agent isn't this kind of local LlmAgent, how do I integrate? | [§4 Your Scenario → How to Integrate](#4-your-scenario--how-to-integrate) | +| What exactly does each step of the reflection-evaluation-retention loop do? | [§5 How GEPA Works](#5-how-gepa-works) | +| Want to estimate LLM call costs / adjust concurrency parameters? | [§6 Cost and Concurrency](#6-cost-and-concurrency) | +| Want to directly look up parameters / configuration items? | [§7 Complete API Reference](#7-complete-api-reference) | + +## 3 Core Concepts + +> This section uses 8 concepts to establish a "mental model" of the optimization module. 
Each concept starts from "what does it correspond to in your work" rather than from type signatures. The introduction order is consistent with the appearance order of the three code segments in [§2.4 Core Code](#24-core-code). + +### 3.1 Module Overall Data Flow + +The optimization module's work loop: the user inputs 4 types of assets, and the module produces 2 types of results in the reflection-evaluation-retention loop. + +```text + +---> Evaluate candidate + | | + call_agent ---+ | v + | | Reflect on failures + optimizer.json ---+ | | + | | v ---> OptimizeResult + +------>| Write new candidate + runs// + TargetPrompt ---+ | | + | | v + EvalSet x 2 ---+ | Accept new best? + | Y:keep / N:drop + | | + +---------+ +``` + +Roles of the four inputs: + +| Input | Form | Role in the Loop | +| --- | --- | --- | +| `call_agent` | `async (str) -> str` | Passes query to business agent; optimizer samples behavior through this | +| `optimizer.json` | JSON configuration | Defines evaluation metrics (`evaluate.metrics`) and algorithm parameters (`optimize.algorithm`) | +| `TargetPrompt` | Multi-field prompt registration table | Declares which prompt files / remote configuration entries are optimization targets | +| `EvalSet × 2` | Two evalsets | Training set for reflection LM to see failure cases, validation set for scoring / early stop determination | + +Destinations of the two outputs: + +| Output | Form | Typical Use | +| --- | --- | --- | +| `OptimizeResult` | In-memory object returned by `optimize()` | Programmatic reading (baseline / best / each round details) | +| `runs//` | Audit directory | Manual review, CI parsing, re-run (see [§8](#8-artifacts-and-directory-conventions) for details) | + +### 3.2 call_agent + +**One sentence**: The "universal plug" for your business agent. + +**Why needed**: Your agent might be a local `LlmAgent`, might be a deployed HTTP service, might be a black-box CLI like `claude` / `codex`. 
The module cannot write adapters for every form; you only need to wrap "given a query → get the agent's final response" into an async function, and the module drives the agent to run evaluations through it. + +**How to use**: + +```python +async def call_agent(query: str) -> str: + # Your implementation: call local agent / HTTP service / subprocess CLI, all fine + # Key point: re-read prompt files each time (so GEPA's new candidates take effect immediately) + root_agent = create_agent() + runner = Runner(...) + return await run_and_collect_final_response(runner, query) +``` + +The signature is fixed as `async (str) -> str`: it cannot take additional parameters and cannot be synchronous. + +**When the framework calls it**: + +| Timing | Frequency | +|---|---| +| Baseline evaluation | Each val case × `num_runs` | +| Each round's minibatch evaluation | Each sampled case 1 time | +| Each round's candidate validation set evaluation | Each val case × `num_runs` | + +### 3.3 TargetPrompt + +**One sentence**: Tells the module "which prompt files are to be optimized", equivalent to an **optimization target registration table**. + +**Why needed**: In agent projects, prompts are usually scattered across multiple files or even multiple backends (system.md / skill.md / also placed in QCS versions); the module needs to know: **when a new candidate is reflected, where should it be written, and where should it read from when reading baseline**. `TargetPrompt` is this "address book".
+ +**How to use**: + +```python +from trpc_agent_sdk.evaluation import TargetPrompt + +target = ( + TargetPrompt() + .add_path("system_prompt", "agent/prompts/system.md") # File type + .add_path("skill", "agent/prompts/skill.md") # File type + .add_callback("rule", # Callback type (remote KV) + read=load_rule_from_kv, + write=save_rule_to_kv) +) +``` + +Each field `name` (e.g., `"system_prompt"`) will become, after optimization ends: + +- `result.best_prompts["system_prompt"]` — programmatic reading of optimal prompt +- `runs//best_prompts/system_prompt.md` — human reading of optimal prompt +- Elements in `RoundRecord.optimized_field_names` — see which field was changed each round + +**Two types of sources**: + +| Source | Applicable When | What the Framework Does | +|---|---|---| +| `add_path(name, path)` | Prompt is in local file | Write to disk using tmp + `os.replace` atomic write; multi-field failure rolls back source files | +| `add_callback(name, *, read, write)` | Prompt is in remote configuration center / database / git, etc., any backend | Calls your `read` / `write` async functions; atomicity is guaranteed by you | + +See [§7.2](#72-targetprompt-api-table) for the complete API. + +### 3.4 AgentOptimizer + +**One sentence**: The module's "power button". + +**Why needed**: You wouldn't want to manually write the whole process of "read config → validate inputs → run reflection loop → persist to disk → assemble result"; `AgentOptimizer` encapsulates this process into one call—you give it **inputs**, it returns **results**. 
+ +**How to use**: + +```python +from trpc_agent_sdk.evaluation import AgentOptimizer + +result = await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir="runs/2026-05-19T17-00-00", +) +print(result.best_pass_rate) +``` + +This module has only this one public entry point, **no other way to start optimization**. + +**What it does**: + +1. Loads and validates `optimizer.json` (throws error before running if schema is wrong) +2. Validates `call_agent` is async function / `target_prompt` has at least one registered field / training set ≠ validation set +3. Runs reflection-evaluation-retention loop +4. Persists artifacts to `output_dir/` +5. Returns an `OptimizeResult` object + +`optimize` has 11 keyword-only parameters in total; the 6 commonly used ones are in [§2.4](#24-core-code), all parameters see [§7.1](#71-agentoptimizeroptimize-parameter-table). 
+ +**`update_source` decision table** (key parameter shared by all §4.x scenarios): Determines whether to **write back** the optimal candidate to the source prompt files registered in `TargetPrompt` after successful optimization— + +| `update_source` | What to do after success | Effective Path | Applicable Scenario | +|---|---|---|---| +| `False` (default) | Only write the optimal candidate to `output_dir/best_prompts/` | You **manually** review → copy to online prompt file → takes effect on next call | Grayscale deployment, requires manual review, don't want optimizer to directly modify online files | +| `True` | Directly **overwrite** source prompt files with the optimal candidate | Business next call **immediately** uses the new prompt | Automated closed loop (e.g., night optimization task, see [§4.6 CI Closed Loop](#46)) | + +Regardless of which you choose, the business side requires **zero restart, zero code changes**—the way to perceive prompt changes is always "re-read file on next call". + +> Safety guarantee of `update_source=True`: Overwrite uses tmp + `os.replace` atomic write; if optimization is interrupted midway or by SIGINT, the source prompt file **will not be half-written**, preserving original content (see [§8.3 Atomic Disk Persistence](#83-atomic-disk-persistence-guarantee) for details). + +### 3.5 optimizer.json + +**One sentence**: A configuration file that tells the module "what counts as good" and "how to search". + +**Why needed**: Metric thresholds, minibatch size, reflection LM configuration, stop conditions... if these parameters are scattered in code, you need to modify code every time you run an experiment. After centralizing to one JSON file, tuning parameters = modify JSON, and reproducibility is also better (a copy of `config.snapshot.json` will be saved in the artifacts). + +**What it looks like**: [§2.5](#25-configuration-file-optimizerjson) already showed the complete example. 
Structurally divided into two sections: + +```text +{ + "evaluate": { ... }, # Same schema as AgentEvaluator: metric list + num_runs + "optimize": { + "eval_case_parallelism": 2, + "stop": { # Framework-level stop: which metrics must reach threshold + "required_metrics": "all" + }, + "algorithm": { # Algorithm-specific: reflection_lm / minibatch / 6 types of stoppers + "name": "gepa_reflective", + ... + } + } +} +``` + +**Division of labor between the two sections**: + +- `evaluate` section: **completely reuses** the evaluation module's schema. Metric configurations you wrote for evaluation projects can be directly copied over +- `optimize` section: **optimizer-specific**. Among them, `algorithm.name` is the algorithm selector; currently the only optional value is `"gepa_reflective"`, will be extended by [§9.2 Registering New Algorithms](#92) when new algorithms are added in the future + +See [§7.3](#73-optimizerjson-configuration-items-table) for the complete field table. + +### 3.6 EvalSet / EvalCase + +**One sentence**: Training set + validation set, format identical to the evaluation module. + +**Why need two separate files**: + +- **Training set**: The module randomly **samples** a few cases from it each round (`reflection_minibatch_size`, default lets gepa decide) for the reflection LM to see failure cases → used to "find improvement directions" +- **Validation set**: After each new candidate is generated, **run fully** on it for scoring → used to "verify whether the candidate is actually better" + +**Why must they be different files**: The training set determines what the reflection LM sees, the validation set determines whether a candidate is accepted. If the two overlap, it becomes "using exam questions for practice, then using exam questions for grading"—the resulting best_pass_rate is not credible. 
The framework validates at startup by comparing paths (`os.path.normpath(os.path.abspath(...))`) to defend against this, and directly throws `ValueError` if they overlap. + +See [Evaluation Set Writing Guide](evaluation.md#evaluation-set-evalset-writing-guide) for format and writing guidelines. + +### 3.7 OptimizeResult + +**One sentence**: The "complete output" after one optimization run, both the return value of `optimize()` and the content of `runs//result.json`. + +**Why needed**: After running optimization, you care most about three things—success or not / how much improvement / what is the optimal prompt. `OptimizeResult` packages them: + +```python +result = await AgentOptimizer.optimize(...) + +# 1. Success or not +if result.status == "SUCCEEDED": + ... + +# 2. How much improvement +print(f"{result.baseline_pass_rate:.2%} → {result.best_pass_rate:.2%}, " + f"+{result.pass_rate_improvement:.2%}") + +# 3. What is the optimal prompt +new_system_prompt = result.best_prompts["system_prompt"] +new_skill = result.best_prompts["skill"] +``` + +It also carries process data (what happened each round, reflection LM call count, total duration, etc.) for post-hoc analysis. + +**The 6 most frequently viewed fields**: + +| Field | Type | Meaning | +|---|---|---| +| `status` | `"SUCCEEDED"` / `"FAILED"` / `"CANCELED"` | Final state | +| `baseline_pass_rate` / `best_pass_rate` | `float` | Pass rate before / after optimization | +| `pass_rate_improvement` | `float` | Difference between the two | +| `best_prompts` | `dict[str, str]` | Field name → optimal prompt text | +| `rounds` | `list[RoundRecord]` | Each round's record | +| `stop_reason` | `Literal[...]` or `None` | Which stopper triggered the stop | + +See [§7.4](#74-optimizeresult--roundrecord-field-table) for all 22 fields (including `RoundRecord`). 
+ +### 3.8 Reflection LM + +**One sentence**: The LLM used internally by the module, which receives a set of failure cases each round and outputs improved prompt candidates; it is a separate configuration from the business LM used by your agent. + +Configured in the `optimizer.json::optimize.algorithm.reflection_lm` section, type is `OptimizeModelOptions`: + +```json +"reflection_lm": { + "model_name": "gpt-4o", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-...", + "generation_config": {"temperature": 0.6, "max_tokens": 4096} +} +``` + +See [§6.5](#65-reflection-lm-selection-suggestions-table) for model selection suggestions; see [§7.3.3](#733-optimizealgorithm-section) for complete fields. + +## 4 Your Scenario → How to Integrate + +| Your Situation | Section | Corresponding Example | +|---|---|---| +| Agent is an online HTTP service (FastAPI / Gin / self-developed interface) | [§4.1](#41) | `http_service` | +| Agent is a subprocess / command-line tool (`claude` / `codex` / internal CLI) | [§4.2](#42) | `blackbox_cli` | +| Agent is a multi-sub-agent chain (multiple sub-agents collaborate to complete one response), want to optimize each sub-agent's prompt simultaneously | [§4.3](#43) | `multi_agent_pipeline` | +| Prompts are not in local files, stored in remote KV / configuration center / database / Git, etc., any backend | [§4.4](#44) | `remote_prompt_store` | +| Single evaluation metric is insufficient, need to run multiple evaluation metrics simultaneously (e.g., answer accuracy + hallucination rate + style compliance rate) and fuse into a total score | [§4.5](#45) | `multi_metric_with_judges` | +| Want to integrate CI closed loop: run evaluation gate on PR, run optimization in night window and automatically write back new prompts | [§4.6](#46) | `ci_integration` | +| Optimization task has hard constraints (e.g., must complete within 1-hour window / cumulative calls not exceeding N / stop after consecutive no-improvement) | [§4.7](#47) | 
`slo_runtime_control` | +| Can already run through the basic process, want to further improve results (adjust GEPA candidate selection / Pareto frontier / cross-field fusion) | [§4.8](#48) | `advanced_strategies` | +| Other common extensions (connect Grafana / WandB, etc. for monitoring, custom stop strategy, use your own optimization algorithm) | [§4.9](#49) | (Multiple examples combined) | + +### 4.1 My Agent is an HTTP Service, How to Integrate? {#41} + +**Your situation**: The business agent is already online as an independent service (FastAPI / Gin / self-developed framework are all acceptable), hoping to perform automatic optimization on its prompts—but the service runs long-term and cannot stop, service implementation details are a black box to the optimizer, and prompts are usually injected in file form. + +**Integration model**: The optimizer accesses as a **pure client**, with only **one coupling point** with the service process—the prompt files on disk. + +```text ++-------------------+ HTTP request + query +-------------------+ +| AgentOptimizer | --------------------------------> | HTTP agent | +| (optimizer) | <--------- text response -------- | (no code change) | ++---------+---------+ +---------+---------+ + | ^ + | write new prompt candidate | Each request + v | re-reads prompt + +------------------------------------------------------------+ + | prompt files (on disk) | + +------------------------------------------------------------+ +``` + +The service process **does not need any code changes**, only needs to satisfy one convention: **re-read prompt files before processing each request**—so that the new candidate written by the optimizer takes effect on the next request. 
+ +**Integration in 3 steps**: + +**Step 1: Register `TargetPrompt` on the prompt files read by the HTTP service** + +```python +target = TargetPrompt().add_path("system_prompt", "service/prompts/system.md") +``` + +The second parameter of `add_path` must be **the exact file path that the service process actually reads** (not an arbitrary copy), otherwise the new candidate written by the optimizer will not be perceived by the service. + +**Step 2: Write `call_agent` as an HTTP client to the service** + +```python +async def call_agent(query: str) -> str: + async with httpx.AsyncClient(timeout=120.0) as client: + resp = await client.post("http://my-agent-service/chat", + json={"query": query}) + resp.raise_for_status() + return resp.json()["final_text"] +``` + +Modify the `json=...` field according to the actual interface payload schema of the business; adjust `timeout` according to the business's first inference latency (example default 120s). + +**Step 3: Call `AgentOptimizer.optimize`** + +```python +await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir=f"runs/{timestamp}", + update_source=False, # Decision table see [§3.4](#34-agentoptimizer) +) +``` + +**Pre-integration checklist**: + +| Check Item | Description | +|---|---| +| Does the service re-read prompt files on each request? | No → New candidates written by optimizer won't be seen by the service, optimization is ineffective. Need to add re-read logic in the handler | +| Does the optimizer process have write permission to prompt files? | No → Optimizer cannot persist new candidates | +| Are the prompt file paths seen by the service and the optimizer consistent? | Especially need to confirm in containerized deployment (mount path / symlink) | +| What is the service's 5xx behavior? 
| The service should not silently retry internally—this would mask the real failure rate, letting the optimizer see a false "high score" | + +**→ Complete example**: [`examples/optimization/http_service/`](../../../examples/optimization/http_service/) +- `service/server.py` — Demonstrates FastAPI service with prompt hot-loading (`/chat` rebuilds agent and re-reads `system.md` each time), can be used as a reference for business service transformation +- `run_optimization.py` — Client optimizer entry, includes pre-start service health check (fail-fast) + +### 4.2 My Agent is an External Command-Line Tool (CLI), Optimizer Cannot Get Its Code {#42} + +**Your situation**: The business agent is an external executable program—`claude` / `codex` / self-developed CLI, etc. Its source code, internally used LLM client, and runtime language are **completely black boxes** to the optimizer, but it reads several prompt files from a working directory at startup (typically `CLAUDE.md` + `.claude/skills//SKILL.md`). You hope to optimize these prompt files without modifying the CLI code or binding to any of its internal dependencies. + +**Integration model**: The optimizer calls the CLI through **subprocess**, and the **only coupling point** with the CLI is still the prompt files on disk—this is the same structure as §4.1's HTTP service, the difference is only replacing "HTTP request" with "starting a subprocess". 
+ +```text ++-------------------+ start subprocess + pass query +-------------------+ +| AgentOptimizer | --------------------------------> | External CLI | +| (optimizer) | <--------- stdout text ---------- | (no code change) | ++---------+---------+ +---------+---------+ + | ^ + | write new prompt candidate | Each startup + v | auto-reads + +------------------------------------------------------------+ + | prompt files (on disk) | + +------------------------------------------------------------+ +``` + +The CLI binary itself **does not need any modifications**, only needs to satisfy: **it loads prompt files from the specified directory on each startup** (most CLI tools are designed this way). + +**Integration in 3 steps**: + +**Step 1: Register `TargetPrompt` on the prompt files read by the CLI (use `add_path` multiple times for multiple files)** + +```python +target = ( + TargetPrompt() + .add_path("claude_md", "workspace/CLAUDE.md") + .add_path("skill_md", "workspace/.claude/skills/city-info/SKILL.md") +) +``` + +Each `add_path` registers one independent field; GEPA treats each field as an independently optimizable module and can optimize the fields separately or jointly (see §3.3, §4.3 for details).
+ +**Step 2: Wrap subprocess call + stdout normalization into `call_agent`** + +```python +async def call_agent(query: str) -> str: + proc = await asyncio.create_subprocess_exec( + "trpc-claudecode", "--print", + "--add-dir", str(WORKSPACE_DIR), # CLI loads prompt files from here + "--dangerously-skip-permissions", + query, # Pass query as argv, avoid shell escaping + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + env=_build_cli_env(), # Environment variables expected by business's own CLI + ) + stdout_b, stderr_b = await asyncio.wait_for( + proc.communicate(), timeout=90.0, # Prevent single CLI from hanging + ) + if proc.returncode != 0: + raise RuntimeError(f"CLI exited {proc.returncode}: {stderr_b[:400]!r}") + return _normalize_response(stdout_b.decode("utf-8", "replace")) +``` + +`call_agent` still has the standard signature `async (query: str) -> str` from §3.2; to the optimizer main loop, this `call_agent` is no different from "calling local LLM". `_build_cli_env` / `_normalize_response` are helper functions implemented by the business according to their CLI's characteristics (the former modifies/supplements environment variables to the form expected by the CLI, the latter normalizes CLI stdout into a stable string comparable for evaluation)—this framework does not prescribe their form, implement as needed. + +**Step 3: Run once to confirm baseline works, then hand over to GEPA reflection optimization** + +```python +await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir="runs//", + update_source=False, +) +``` + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Does the CLI re-read prompt files on each startup?
| No → New candidates written by optimizer won't take effect; evaluation between candidates is equivalent to running the same baseline | +| Does the CLI support passing query through argv / stdin / `--query xxx`? | No → Integration is not feasible (need to add this entry point to CLI first) | +| Is the CLI's average single-run latency known? | No → Cannot reasonably set `CLI_TIMEOUT_SEC` and `max_metric_calls` | +| Does the CLI process pollute shared disk state (other than prompt files)? | Yes → Evaluation is not reproducible; need `eval_case_parallelism=1` or independent workspace for each case | + +**→ Complete example**: [`examples/optimization/blackbox_cli/`](../../../examples/optimization/blackbox_cli/) +- `agent/call_agent.py` — Subprocess call + environment variable adaptation + stdout normalization engineering implementation, can be used as a starting point for integrating your own CLI +- `run_optimization.py` — Standard entry for dual-field (`CLAUDE.md` + `SKILL.md`) `TargetPrompt` + +### 4.3 My Agent is a Multi-Sub-Agent Chain, Want to Optimize Each Sub-Agent's Prompt Simultaneously {#43} + +**Your situation**: The business side has already orchestrated a multi-sub-agent collaboration chain. Each sub-agent has its own system prompt, and there are implicit contracts between fields (the output form of upstream sub-agent must match downstream expectations). Common symptoms during manual iteration are **"fixing A shows effect, but drags down B"**. You hope to **jointly optimize** prompts for all sub-agents, so that end-to-end metrics improve. + +**Integration model**: Register each sub-agent's prompt file as an **independent field** of `TargetPrompt`—GEPA treats each field as an independently optimizable module (component), selects 1 or more fields to write back each round according to `module_selector`, and the optimizer only looks at the end-to-end metric score as feedback. 
The chain code requires **zero modifications**; each sub-agent just needs to re-read its own prompt file each time it is called. + +```text ++-----------------------------+ select 1 field each round +---------------------+ +| AgentOptimizer | --------------------------> | prompt files | +| (multi-field TargetPrompt) | write back new candidate | (each sub-agent | +| | | has 1 file) | ++--------------+--------------+ +----------+----------+ + ^ | + | End-to-end metric score | Each call + | | re-reads prompt + | v + | +-----------------------------------------+ + +------------- | call_agent(query) | + | = Your multi-sub-agent chain | + | call entry | + | (sub-agent A → sub-agent B → ...) | + +-----------------------------------------+ +``` + +**Integration in 3 steps**: + +**Step 1: Register each sub-agent's prompt file as an independent field** + +```python +target = ( + TargetPrompt() + .add_path("agent_a", ".md") + .add_path("agent_b", ".md") + # ... one add_path per sub-agent +) +``` + +The key is the identifier of this field in reflection prompts / artifact filenames; it just needs to be readable by the business. + +**Step 2: Wrap the entire chain call into `call_agent`, and ensure sub-agents re-read prompts each time** + +```python +async def call_agent(query: str) -> str: + return await invoke_pipeline(query) # Your existing chain entry +``` + +Key constraint inside `invoke_pipeline`: **each sub-agent must re-read its own prompt file each time it is called**, otherwise new candidates written by the optimizer will not take effect. 
+ +**Step 3: Turn on multi-field related switches in `optimizer.json`** + +```jsonc +{ + "optimize": { + "algorithm": { + "module_selector": "round_robin", // Select 1 field per round in rotation, convenient for attribution + "use_merge": true, // Actively fuse after accumulating several single-field improvements + "max_merge_invocations": 3, + "reflection_history_top_k": 3 // Recommended to increase when multi-field rotation (default 2) + } + } +} +``` + +See [§7 Complete API Reference](#7-complete-api-reference) for the complete semantics and value mappings of each parameter. + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Does each sub-agent re-read its own prompt file each time it is called? | No → New candidates written by optimizer won't take effect; evaluation between candidates is equivalent to running the same baseline | +| Can end-to-end metrics reflect the joint quality of all fields? | No → Feedback signal seen by reflection LM is not real; recommend using `final_response_avg_score` to evaluate final response | +| How many LLM inferences does a single case go through? | Call volume multiplies by chain depth; need to correspondingly reduce `eval_case_parallelism` / `reflection_minibatch_size` to prevent rate limit | +| Do sub-agents need to be in the same process? 
| Not necessary—`call_agent` internals can be HTTP / gRPC / internal SDK / other orchestration frameworks; as long as it ultimately returns `str` | + +**→ Complete example**: [`examples/optimization/multi_agent_pipeline/`](../../../examples/optimization/multi_agent_pipeline/) +- `pipeline/orchestrator.py` — Multi-sub-agent chain implementation, sub-agents re-read prompts on each call +- `run_optimization.py` — Standard entry for multi-field `TargetPrompt` +- `optimizer.json` — Recommended configuration for multi-field scenarios + +### 4.4 My Prompts Are Not in Local Files, Stored in Remote Configuration Center / KV / Database {#44} + +**Your situation**: Business prompts are not in local files, but placed in a remote configuration center (QCS / Apollo / Nacos / self-developed KV / database / Git, etc.), and the business fetches and uses them from the center. The optimizer cannot directly access the file system—it can only interact with the remote through the business's own SDK. + +**Integration model**: `TargetPrompt` abstracts "where prompts are" into a pair of async functions `read` / `write`—the optimizer calls `read` to get the baseline snapshot, calls `write` to persist candidates; the remote backend form (KV / RPC / SQL / Git API ...) is **completely black box** to the optimizer. This is isomorphic to the structure coupled through local prompt files in §4.1 / §4.2, the difference is only replacing "read/write files" with "calling two async functions given by the business". 
+ +```text ++-------------------+ async read / write +---------------------+ +| AgentOptimizer | <-------------------------------> | Remote config | +| (optimizer) | (your own SDK / HTTP / RPC) | (KV / DB / Git ...)| ++---------+---------+ +---------+-----------+ + ^ | + | best_prompts/ persisted locally | Business calls + | | pulls config + v v + +-------------------+ +---------------------------+ + | output_dir/ | | call_agent internals | + | best_prompts/ | | Pull latest prompt then | + +-------------------+ | call agent | + +---------------------------+ +``` + +**Integration in 3 steps**: + +**Step 1: Implement a pair of async functions to operate remote prompts** + +```python +async def read_prompt() -> str: + return await your_config_sdk.get(key="system_prompt") + +async def write_prompt(value: str) -> None: + await your_config_sdk.put(key="system_prompt", value=value) +``` + +Signature constraints: `read: async () -> str`, `write: async (str) -> None`. Retry / idempotency / authentication are guaranteed by the business's own SDK. + +**Step 2: Use `add_callback` instead of `add_path` to register `TargetPrompt`** + +```python +target = TargetPrompt().add_callback( + "system_prompt", + read=read_prompt, + write=write_prompt, +) +``` + +`add_callback` and `add_path` are peers on `TargetPrompt`—multi-field can also be mixed (some fields in local files, some fields in remote configuration center). + +**Step 3: Write `call_agent` as "pull now, use now", call `optimize` as usual** + +```python +async def call_agent(query: str) -> str: + prompt_text = await read_prompt() # Pull now, ensure candidate writes take effect immediately + agent = create_agent(prompt_text) + return await runner.run_async(query, ...) 
+ +await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir="runs//", + update_source=False, # Decision table see §3.4 +) +``` + +The value of `update_source` is determined by the business side's prompt write-back strategy (see §3.4 decision table for details), the framework has no additional restrictions on it. + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Does the business side re-pull configuration on each call? | No → After optimizer writes new candidate, business cannot perceive it, reflection loop fails | +| Are both `read` / `write` async functions? | No → Error reported immediately when registering with `add_callback` | +| Is `write` idempotent (accepts repeated writes of the same value)? | No → May fail when automatically rolling back to baseline at finish, leaving remote contaminated | +| Does the optimizer process have write permission for this key / namespace? | No → `write` throws permission error, current candidate evaluation fails | + +> **Safe mode involving production prompts** (adopt as needed, not forced by framework): If the business side already has sandbox / production namespace isolation, you can let the optimizer only read/write sandbox keys, cooperate with `update_source=False` to let the optimizer automatically roll back sandbox at finish, the best candidate is only persisted locally in `best_prompts/`, then synchronized to production through the business's own approval flow. `examples/optimization/remote_prompt_store/` demonstrates this workflow. 
+ +**→ Complete example**: [`examples/optimization/remote_prompt_store/`](../../../examples/optimization/remote_prompt_store/) +- `store/prompt_client.py` — `read` / `write` async function definitions, core transformation point for integrating business configuration center SDK +- `run_optimization.py` — Standard entry for `add_callback` registration (demonstrates workflow using sandbox + `update_source=False` + manual approval) + +### 4.5 Single Evaluation Metric Is Insufficient, Need Multiple Metrics and Fuse into Total Score {#45} + +**Your situation**: Business launch has requirements for agent output in more than one dimension—answer must be correct (correctness hard constraint) + must not talk nonsense (hallucination rate) + style must comply with specifications (format / tone) + must not contain sensitive words (compliance)... Single metric cannot contain all, forcibly using a single composite metric means the feedback signal seen by the reflection LM is a mixed scalar, making it difficult to attribute directionally. + +**Integration model**: `optimizer.json`'s `evaluate.metrics` is a **list**—directly list multiple metrics, each scored independently, with independent threshold and independent configuration. Early stop determination declares which metrics must reach the threshold through `optimize.stop.required_metrics`; GEPA internally decides how to maintain the Pareto frontier among multiple metrics through `optimize.algorithm.frontier_type` to avoid "fixing A drags down B". The entire mechanism is purely configuration-driven—`call_agent` and `TargetPrompt` both do not need to change a single line of code for multi-metric. 
+ +**Configuration in 3 steps**: + +**Step 1: List all metrics in `evaluate.metrics`** + +```jsonc +{ + "evaluate": { + "num_runs": 2, // Smooth LLM output variance (>1 lets each case run multiple times and take mean) + "metrics": [ + { + "metric_name": "llm_final_response", // Hard constraint: is answer substantively equivalent to reference + "threshold": 1.0, + "criterion": { "...": "..." } // Complete fields see §7 / example + }, + { + "metric_name": "llm_rubric_response", // Soft constraint: multiple rubrics (format / style / units ...) + "threshold": 0.75, + "criterion": { "...": "..." } + } + ] + } +} +``` + +Each metric is scored independently and written independently to `metric_breakdown` in `result.json`, convenient for reverse-attributing which metric a certain evaluation lost points on. + +**Step 2: Declare early stop gate in `optimize.stop.required_metrics`** + +| Value | Semantics | Applicable Scenario | +| --- | --- | --- | +| `"all"` | Early stop only when all metrics reach threshold | All metrics are must-pass items | +| `["m1", "m2"]` | Early stop only when all metrics in the list reach threshold (other metrics still participate in evaluation but do not affect early stop) | Some metrics are reference observation items, not used as gates | +| `null` or `[]` | Does not participate in early stop, only controlled by algorithm-level budget / no-improvement / score_threshold | Just want to run out the budget and see results | + +**Step 3: Adjust `frontier_type` to a value that correctly handles multiple metrics** + +| Value | Meaning | Applicable | +| --- | --- | --- | +| `instance` | Maintain one best candidate per case | Single metric or no obvious conflict between metrics | +| `objective` | Maintain one best candidate per metric | Multiple metrics but small case count | +| `hybrid` | Maintain both case + metric two-layer frontier | **Real conflict scenario with multiple metrics** (recommended default) | +| `cartesian` | One best candidate per (case, 
metric) combination | Extremely complex / debugging use, candidate pool easily explodes | + +`hybrid` lets GEPA not lose the best candidate on another metric when improving one metric—the **safe default for multi-metric business**. See [§7](#7-complete-api-reference) for the complete definition of each value. + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Do the `threshold` values of each metric conform to business requirements? | No → Early stop determination is inaccurate; business-critical metrics may not have reached standard when optimization ends | +| Are only "hard constraints" listed in `stop.required_metrics`? | No → Soft constraint fluctuations will repeatedly interrupt early stop determination, wasting budget | +| Does `eval_case_parallelism` consider the concurrency of metric count × judge count? | No → Single-round LLM call volume explodes (N cases × M metrics × K judges × `num_runs`), easily hitting LLM backend rate limit | +| Is `num_runs` reasonable (default 1)? 
| Single LLM judge output has variance; recommend `num_runs=2` to let each case run twice and take mean to eliminate jitter | + +**→ Complete example**: [`examples/optimization/multi_metric_with_judges/`](../../../examples/optimization/multi_metric_with_judges/) +- `optimizer.json` — Complete configuration example with `llm_final_response` (multi-judge `all_pass` voting) + `llm_rubric_response` (single judge multi-rubric) + `frontier_type=hybrid` + `stop.required_metrics` list style +- `run_optimization.py` — Standard entry consistent with single-metric scenarios (multi-metric does not affect entry code) + +### 4.6 Want to Integrate CI Closed Loop: PR Gate + Night Optimization Auto Write-Back {#46} + +**Your situation**: You hope prompt engineering also follows the CI/CD process—each PR automatically runs evaluation gate (score below threshold means CI red light, preventing degraded prompts from entering main branch), while simultaneously running reflection optimization in a low-peak window to write back better prompts, and the next PR automatically uses them. **Using either link alone is not enough**: pure gate will not automatically make prompts better, pure optimization has no quality gate. + +**Integration model**: `AgentEvaluator.evaluate` (pytest runs PR gate) and `AgentOptimizer.optimize` (night optimization) share **the same set of assets**—the same `call_agent`, the same evalset (physically split into train / val two files to prevent leakage, logically one set of corpus), the same pair of prompt files. `update_source=True` is the key switch for the closed loop: after optimization succeeds (`OptimizeResult.status=SUCCEEDED`), the optimal candidate directly overwrites the source prompt files, and the next PR-triggered pytest automatically reads the new content. 
+ +```text + +-----------------------------------------------------+ + | Shared assets: call_agent + evalset + prompt files | + +------+----------------------------------------+-----+ + | | + Trigger: PR | | Trigger: Night window + v v + +---------------------------+ +---------------------------+ + | AgentEvaluator.evaluate | | AgentOptimizer.optimize | + | (pytest runs) | | update_source=True | + | | | | + | Score < threshold → Red | | Success → Overwrite | + | pytest exit != 0 → | | source prompts | + | Block PR | | Failure → Files unchanged| + +---------------------------+ +-------------+-------------+ + | + v + Next PR automatically + uses new prompts + (Forms "eval→optimize→eval" + evolution closed loop) +``` + +**Integration in 3 steps**: + +**Step 1: Extract `call_agent` into a module shared by evaluate / optimize** + +```python +# agent/agent.py (both pytest and optimizer import from here) +async def call_agent(query: str) -> str: + ... +``` + +**Why must share**: The agent used during evaluation and the agent used during optimization must be **equivalent**—otherwise "optimizer found a good prompt that evaluator cannot verify" or the reverse problem will occur. Sharing the same `call_agent` file is the most direct code-level guarantee. Any agent changes (model switch / temperature adjustment / output schema change) only need to be changed in one place. + +**Step 2: Write pytest entry for PR gate** + +```python +# tests/test_agent_quality.py +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator +from agent.agent import call_agent + +@pytest.mark.asyncio +async def test_agent_quality(): + await AgentEvaluator.evaluate( + call_agent=call_agent, + eval_set_path="data/val.evalset.json", + test_config_path="optimizer.json", # Reuse same metric configuration + ... 
+ ) # Framework throws AssertionError when score is below threshold → pytest red +``` + +Run in CI pipeline: + +```bash +pytest tests/ --junitxml=runs/pytest_report.xml +``` + +The `--junitxml` output is a standard format test report, parsed natively by mainstream platforms like GitHub Actions / BlueKing Pipeline / Tencent CI. When failing, the `AssertionError` message contains the failure details JSON for each case; when the CI platform displays the stack trace, it can directly see which case failed, what the agent actually output, and where the difference from expected is. + +**Step 3: Night window runs optimization + `update_source=True`** + +```python +# run_optimization.py (triggered by night cron) +await AgentOptimizer.optimize( + config_path="optimizer.json", # Same metric configuration as pytest + call_agent=call_agent, # Same call_agent as pytest + target_prompt=target, + train_dataset_path="data/train.evalset.json", + validation_dataset_path="data/val.evalset.json", + output_dir="runs/optimize_/", + update_source=True, # Key switch for CI closed loop +) +``` + +Safety guarantee of `update_source=True`: Source prompt files are only written back when `OptimizeResult.status=SUCCEEDED`; source files remain unchanged in other states such as failure / budget exhaustion. Overwrite uses atomic write (tmp + `os.replace`), midway exceptions / SIGINT will not corrupt source prompt files (see [§8.3](#83-atomic-disk-persistence-guarantee) for details). + +It is recommended to add `git diff --quiet agent/prompts/` at the end of the night script to determine if there are changes; exit directly if no changes; if there are changes, then `git checkout -b ...` + automatically open a PR—letting new prompts go through the standard PR review process instead of directly entering main branch. + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Is `call_agent` **the same code** shared by pytest and optimizer? 
| No → Agent for evaluation and agent for optimization are not equivalent; optimization direction and gate direction drift | +| Do pytest and optimizer use **the same metric configuration**? | No → "Evaluation can pass but optimizer sees low score" or the reverse problem. Recommend reusing through `test_config_path` in pytest for the `optimizer.json.evaluate` section | +| Is evalset physically split into train / val two files? | No → SDK `_validate_inputs` forcibly validates `train != val`, otherwise reports error fail-fast | +| Does the night script have `git diff` + automatic PR opening steps at the end? | No → Optimized prompts directly enter main branch, bypassing review; recommend always going through PR process | +| Is there a grayscale strategy for prompt changes ready? | When multiple business lines share the same prompt repository, recommend switching to `update_source=False` + business's own grayscale deployment tool | + +**→ Complete example**: [`examples/optimization/ci_integration/`](../../../examples/optimization/ci_integration/) +- `agent/agent.py` — `call_agent` shared by pytest and optimizer +- `tests/test_agent_quality.py` — pytest gate entry (called at PR stage) +- `run_optimization.py` — Night optimization entry (`update_source=True`) +- `ci/run_pr_check.sh` / `ci/run_nightly_optimize.sh` — CI pipeline shell entries + +### 4.7 Optimization Task Has Hard Constraints: Must Complete Within a Time Window / Cumulative Calls Not Exceeding N / Stop After Consecutive No-Improvement {#47} + +**Your situation**: Your optimization task runs in a constrained environment—CI pipeline must end within N minutes, LLM backend quota is calculated monthly and single run cannot exhaust it, should actively give up after several consecutive rounds without improvement. **Single stop condition is not enough**: only setting timeout may stop before budget is used up, only setting budget may run until the end of time. 
You need a multi-stop strategy of "stop immediately when any SLO triggers". + +**Integration model**: The `optimize.algorithm` section of `optimizer.json` provides 6 algorithm-level stop conditions with **OR semantics**—the run stops as soon as any one of them triggers (the in-flight round is allowed to finish first, see §5.4). You reverse-calculate each threshold according to your business SLO and enable multiple switches simultaneously. When optimization ends, the `OptimizeResult.stop_reason` field tells you which SLO triggered first, which helps with subsequent parameter tuning. + +**Configuration in 3 steps**: + +**Step 1: Select the stop conditions that the business cares about from the 6 types** + +| Field | Trigger Condition | Typical Business Scenario | +| --- | --- | --- | +| `timeout_seconds` | Wall-clock time exceeds N seconds | CI pipeline time window hard constraint (must end within N minutes) | +| `max_metric_calls` | Cumulative case evaluation count ≥ N | LLM backend quota hard upper limit | +| `max_candidate_proposals` | Reflection LM cumulative proposal count ≥ N | Limit reflection LM call budget | +| `max_iterations_without_improvement` | N consecutive rounds without best valset improvement | Actively give up when already converged or trapped in local optimum | +| `score_threshold` | Best valset pass_rate ≥ threshold | Already reached business goal, no need to continue | +| `max_tracked_candidates` | Pareto frontier candidate pool size ≥ N | Control memory and merge candidate space size | + +See [§7.3.3](#733-optimizealgorithm-section) for the complete definition of each field. **Configure at least 1**—otherwise the framework fails fast with an error at startup. 
+ +**Step 2: Reverse-calculate each threshold according to business SLO** + +```jsonc +{ + "optimize": { + "algorithm": { + "timeout_seconds": 90.0, // CI must end within X minutes → set X*60 / 2 to leave buffer + "max_metric_calls": 30, // LLM quota → reverse-calculate by "calls × single-run duration" + "max_iterations_without_improvement": 3, // Give up after 3 consecutive rounds without improvement + "score_threshold": 1.0 // Stop when business goal is reached + } + } +} +``` + +**Two key reverse-calculations**: + +| Item | How to measure | How to reverse-calculate | +| --- | --- | --- | +| Typical single-round duration | Run a baseline, look at `rounds[*].durationSeconds` in `runs/<run_id>/result.json` (take median) | `timeout_seconds` should be at least single-round duration × 2, otherwise the first round triggers stop and you cannot see optimization progress | +| Single-round metric_calls count | Same as above, compute `totalMetricCalls / totalRounds` | `max_metric_calls` should be able to run through at least `max_iterations_without_improvement` rounds, otherwise budget always triggers stop first | + +**Step 3: Decide whether metric thresholds participate in framework-level early stop** + +| Value | Semantics | +| --- | --- | +| `optimize.stop.required_metrics: "all"` or `["m1"]` | Metric reaching threshold also participates in OR trigger | +| `optimize.stop.required_metrics: []` | Only let the 6 algorithm-level stoppers decide | + +Choose according to business needs: +- **Care about whether metrics reach standard** (typical prompt quality optimization) → use `"all"` or specific list +- **Only care about time / call budget** (known to converge, purely capping resource usage) → use `[]` + +**`stop_reason` value reference**: When optimization ends, the `OptimizeResult.stop_reason` value can tell you the trigger—`score_threshold_reached` / `budget_exhausted` / `timeout_reached` / `no_improvement` / `max_proposals_reached` / `max_tracked_candidates_reached` / `user_requested_stop` (user 
actively triggers through `optimize.stop` sentinel file). + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Are thresholds all reverse-calculated through baseline measurements, not intuited? | No → Highly likely some stopper always triggers first (e.g., timeout triggers in round 1), and the other settings are effectively decorative | +| Does `timeout_seconds` leave buffer (≤ 50% of real business window)? | No → Under the framework's "complete current round then stop" semantics, actual termination time may exceed the configured timeout value, hitting the business hard deadline | +| Do single-round LLM calls have their own timeout (e.g., CLI / HTTP calls)? | No → If a single round hangs, the run-level timeout can only take effect after the current round finishes, so the run may seriously overshoot `timeout_seconds` (refer to the `CLI_TIMEOUT_SEC` pattern in §4.2) | +| Have you run a baseline in the test environment once to verify `stop_reason` is consistent with expectations? | No → You only discover that stopper behavior is inconsistent with expectations after going to CI, and cannot quickly diagnose it | + +**→ Complete example**: [`examples/optimization/slo_runtime_control/`](../../../examples/optimization/slo_runtime_control/) +- `optimizer.json` — Configuration example with all 6 stop conditions enabled (a real integration should reverse-calculate thresholds from its own SLO; do not copy the example values directly) +- `run_optimization.py` — After running, the `stopReason` field in `result.json` identifies the trigger + +### 4.8 Can Already Run Through Basic Process, Want to Further Improve Results (GEPA Candidate Selection / Pareto Frontier / Cross-Field Fusion) {#48} + +**Your situation**: You have already run through the basic optimization process according to quickstart, and can stably see score improvement from baseline → best. 
Now you want to understand several advanced switches of GEPA—`candidate_selection_strategy` / `frontier_type` / `use_merge` / `skip_perfect_score`—whether they are **actually useful on your task, whether they can squeeze out a few more points**. But running optimization once often cannot see the difference, because GEPA can converge to similar `best_pass_rate` on most tasks—**the difference is hidden in the arrival path** (round count / acceptance rate / whether merge triggered / reflection LM call count), not in the final score. + +**Integration model**: Use **A/B controlled experiment**—same business, same evalset, same `seed`, run two different `optimizer.json`: one is the current online configuration or default configuration (baseline), one is the advanced combination to be verified. After running, compare the two `result.json`, focusing on **multi-dimensional metrics** rather than single `best_pass_rate`. + +**Experiment in 3 steps**: + +**Step 1: Use current configuration as baseline, fix other variables** + +```jsonc +// optimizer_baseline.json +{ + "optimize": { + "algorithm": { + "seed": 42, // Fix seed to exclude randomness + "max_metric_calls": 30, // Keep consistent with advanced to fairly compare + "candidate_selection_strategy": "pareto", + "frontier_type": "instance", + "skip_perfect_score": false, + "use_merge": false + } + } +} +``` + +**Step 2: Write advanced configuration, only change the switches to be verified** + +```jsonc +// optimizer_advanced.json (only differs from baseline by a few switches) +{ + "optimize": { + "algorithm": { + "seed": 42, + "max_metric_calls": 30, + "candidate_selection_strategy": "pareto", + "frontier_type": "objective", // Change: from instance to objective + "skip_perfect_score": true, // Change: skip perfect score cases to save reflection calls + "use_merge": true // Change: enable cross-field fusion (only actually triggers in multi-field) + } + } +} +``` + +**Step 3: Run twice + parse `result.json` to output 
multi-dimensional comparison** + +```bash +python run_baseline.py # Produce runs/baseline_/result.json +python run_advanced.py # Produce runs/advanced_/result.json +python compare.py # Parse two result.json, output comparison table +``` + +Dimensions `compare.py` should focus on: + +| Dimension | Field (indexed by camelCase in `result.json`) | Interpretation | +| --- | --- | --- | +| Final quality | `bestPassRate` / `baselinePassRate` | End-to-end score improvement; two strategies converge closely on most tasks | +| Exploration depth | `totalRounds` / `roundsAccepted` | Acceptance rate (`roundsAccepted / totalRounds`) reflects frontier acceptance threshold | +| Merge behavior | `mergeRoundsTotal` / `rounds[*].kind` | Verify `use_merge=true` actually triggers merge | +| Reflection budget | `metricCallsTotal` / `proposalsTotal` | `skip_perfect_score=true` saves more obviously on large training set + high baseline start | +| `stop_reason` | `stopReason` | Which stopper triggered; cannot directly compare when advanced/baseline have different stop_reason | + +> **Pitfall reminder**: Fields in `result.json` are camelCase (`bestPassRate` not `best_pass_rate`). SDK uses snake_case internally, automatically converted to camelCase during serialization through pydantic alias. Index by camelCase when reading `result.json`. 
+ +**Expected performance of several advanced switches** (may not all hold on business tasks—use your own actual measurements as basis): + +| Switch | Expected Benefit | Applicable Prerequisites | +| --- | --- | --- | +| `frontier_type="objective"` (vs `"instance"`) | Higher acceptance rate / more aggressive exploration | Multi-metric scenario; may overfit train minibatch on small training set (< 10 cases) causing valset oscillation | +| `frontier_type="hybrid"` | Multiple metrics do not overwrite each other | Real conflict scenario with multiple metrics (see §4.5) | +| `skip_perfect_score=true` | Save reflection LM calls | Large-scale training set + high baseline start; few perfect score cases on small dataset, limited savings | +| `use_merge=true` | Cross-field fusion candidates | **Only actually triggers when multi-field (`add_path` ≥ 2)**; always 0 merge rounds in single-field configuration (`mergeRoundsTotal=0` is expected, see §4.3) | + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Do the two configurations only differ in **the few switches to be verified**, all others identical? | No → Comparison result contains confounding variables, conclusion is not credible | +| Is `seed` consistent between the two sets? | No → Difference may come from randomness rather than configuration strategy | +| Is `max_metric_calls` consistent between the two sets? | No → One set naturally has higher score with more budget, cannot attribute to strategy | +| Are you simultaneously focusing on **multi-dimensional comparison** rather than single `bestPassRate`? | No → Final scores of two strategies are close on most tasks, cannot see difference; difference is hidden in arrival path | +| Do switches like `use_merge` / `skip_perfect_score` make sense in your task structure? 
| Enabling `use_merge` on single-field task never triggers (harmless but no benefit); enabling `skip_perfect_score` on high-baseline task saves considerably | + +> With advanced configuration, **more complex is not necessarily better**. On many tasks, baseline configuration can already achieve reasonable convergence; advanced configuration only shows value in specific task structures (multi-objective, multi-field, large-scale training set, etc.). **Use data to decide, not intuition**. + +**→ Complete example**: [`examples/optimization/advanced_strategies/`](../../../examples/optimization/advanced_strategies/) +- `optimizer_baseline.json` / `optimizer_advanced.json` — Two configurations for A/B control (only differ by 3 switches) +- `run_baseline.py` / `run_advanced.py` — Two independent entries (keeping other variables consistent) +- `compare.py` — Standard template for parsing two `result.json` and outputting multi-dimensional comparison table + +## 5 How GEPA Works + +After running an optimization and watching the score increase from 0.4 to 0.85, you may still not know **what exactly the framework did along the way**—what data did it read? What did the reflection LM see? On what basis did it decide to retain or discard a candidate? When an SLO triggers, does it stop immediately or wait for the current round to finish? + +> **GEPA** (Genetic-Pareto) is a reflection-based evolutionary search algorithm ([gepa-ai/gepa](https://github.com/gepa-ai/gepa), MIT License). This framework wraps `gepa.optimize()` into `GepaReflectiveOptimizer` through `OPTIMIZER_REGISTRY`, and adds a layer of SDK adaptation (evaluation bridging, reflection feedback construction, stop determination, atomic disk persistence, etc.). 
+ +### 5.1 What Exactly Runs in One Optimization Round + +**First remember three roles**—all subsequent diagrams and tables revolve around these three: + +| Role | Who Is It | What It Does | +| --- | --- | --- | +| **agent** | Your business agent (accessed through `call_agent`) | Receives one query, outputs one response | +| **judge / metric** | Configured evaluators in `evaluate.metrics` | Score agent responses (0~1) | +| **Reflection LM** | LLM configured in `algorithm.reflection_lm` | Views failure case feedback → generates new prompt candidates | + +**Round 0**: Run valset with baseline prompt → get baseline score (your "starting line") + +**Each subsequent round (reflective round)** follows these 5 steps: + +```text + ┌────────────────────────────┐ + │ Candidate prompt selected │ + │ in previous round │ + └──────────────┬─────────────┘ + ▼ + (1) Sample minibatch → Randomly sample N cases from trainset + (N = reflection_minibatch_size) + │ + ▼ + (2) Run one evaluation → Write candidate to prompt file + → Call call_agent to run these N cases + → Metric scores, get failure cases + │ + ▼ + (3) Reflection LM → Feed failure case feedback to + generates candidate reflection LM + → It outputs new prompt text + │ + ▼ + (4) Re-evaluate + enter → Re-run new candidate on minibatch + Pareto frontier → Better than historical → enter + frontier, otherwise discard + │ + ▼ + (5) Check stop conditions → Any of 6 stoppers triggered → stop + → Otherwise enter next round +``` + +**Several key explanations**: + +- **"Evaluation" in step (2)** actually runs `len(minibatch) × num_runs × len(metrics)` LLM evaluations (see §6.1 for details) +- **"What reflection LM sees" in step (3)** determines rewrite quality—this is the content of next section §5.2 +- **"Pareto frontier" in step (4)** simply put is "retain the set of candidates that are not surpassed in all aspects"; specific granularity is controlled by `frontier_type` (see §5.3 for details) +- **"Stop when any triggers" in step 
(5)** has a subtlety: once triggered, the optimizer **waits for the current round to finish before actually stopping** rather than killing it immediately (see §5.4 for details) +- **Valset evaluation** is interleaved among the rounds (determined internally by gepa), used to calculate the "real score of current best candidate on valset", and is also the basis for stopper judgments such as `score_threshold` / `required_metrics` + +**Special case: merge round** + +When `use_merge=true`, a **merge round** is inserted every few reflective rounds: select two candidates from the Pareto frontier and fuse them into one new candidate ("take A's wording on field X + B's wording on field Y"). **Only meaningful in multi-field scenarios**—never triggers in single-field, `mergeRoundsTotal=0` is expected. See §4.3 for details. + +### 5.2 What Reflection LM Actually Sees + +The quality of the reflection LM's prompt rewriting **depends entirely on how rich the failure feedback it sees is**. If you only tell it "case_3 failed, score 0.3", it can only guess blindly; if you tell it "case_3 turn 2 agent should output `{"city":"Shanghai"}` but actually output `Shanghai`, rule requires case-sensitive exact match", it can make a targeted change to the prompt. + +`_AgentGEPAAdapter.make_reflective_dataset` renders a markdown record for each **failed case**, fed to the reflection LM. 
Each record field: + +| Field | One-Line Explanation | When It Appears | +| --- | --- | --- | +| `case_id` | Stable ID of the case (for reflection LM cross-reference) | Always | +| `score` | Aggregate score of this case (0~1, 1.0 = all metrics passed) | Always | +| `Case Body` | Markdown of failure scene: one segment per turn, containing user input, expected response, agent actual response, tool call trace, each metric's judgment (PASS/FAIL + score + failure reason) | Always | +| `Other Active Components` | What do other prompt fields NOT being rewritten in this round look like | When multi-field optimization—lets reflection LM see B/C status when modifying A, avoiding breaking upstream/downstream compatibility | +| `history_top_k` | Best agent responses for this case in history (sorted by score) | When `reflection_history_top_k > 0` | + +**Specific structure of `Case Body`**: + +```text +### Turn 1 +**User**: +**Expected**: +**Agent Response**: +**Tool Trace**: ← Only when tool calls exist + - tool_name(args) → response +**Verdict** (Turn 1): + [FAIL] metric_name: score=0.0000, threshold=1.0000 + reason: agent output not byte-equal to expected (case-sensitive) + · rubric[no_emoji]: PASS score=1.00 ← Only for LLM rubric metric + +### Turn 2 +... + +### Overall (case-level aggregate) ← When multi-turn or multi-run +... +``` + +**Failure reason synthesis for deterministic metrics**: When metric is an evaluator without LLM judge like `final_response_avg_score`, only outputting score+status, the framework will **automatically synthesize a failure explanation** (e.g.: `agent output not byte-equal to expected (case-sensitive)` / `expected substring not contained in agent output (case-insensitive)` / `JSON structural comparison failed`), letting the reflection LM directly see **why it didn't match**, without having to diff text to guess. + +> Want to see the full reflection prompt that the reflection LM actually receives? 
Set `verbose=2` when running optimization; gepa's internal logs will then include each round's reflection prompt text—reading it once gives you a good understanding. + +### 5.3 Actual Behavior of 5 Core Operators + +What the 5 most frequently asked-about switches in the `optimize.algorithm` section of `optimizer.json` actually do in the source code: + +| Operator | One-Line Function | Typical Motivation to Adjust It | Detailed Reference | +| --- | --- | --- | --- | +| `reflection_minibatch_size` | How many cases the reflection LM sees each round | Smaller saves tokens, larger gives reflection LM more complete view | [§7.3.3](#733-optimizealgorithm-section) | +| `module_selector` | Which field to modify this round in multi-field (`round_robin` rotation / `all` select all / `random` random) | Want clear attribution of each field's contribution → `round_robin` | [§4.3](#43) | +| `frontier_type` | Pareto frontier granularity (`instance` one best per case / `objective` one per metric / `hybrid` two-layer / `cartesian` Cartesian product) | When multiple metrics truly conflict → `hybrid` | [§4.5](#45) | +| `candidate_selection_strategy` | How to select parent for next round's reflection (`pareto` default select from frontier / `current_best` use current best / etc.) | Want to accelerate convergence or increase exploration | [§7.3.3](#733-optimizealgorithm-section) | +| `use_merge` + `max_merge_invocations` | Whether to enable cross-field fusion + upper limit on trigger count | **Only actually triggers in multi-field**—`mergeRoundsTotal=0` is expected in single-field | [§4.3](#43) / [§4.8](#48) | + +### 5.4 Stop Timing: Complete Current Round Before Stopping + +6 algorithm-level stop conditions (`max_metric_calls` / `timeout_seconds` / `max_iterations_without_improvement` / `score_threshold` / `max_candidate_proposals` / `max_tracked_candidates`) are **synchronously checked at the end of each round**—stop when any condition is satisfied. 
+ +**3 common pitfalls**: + +| Detail | Meaning | How to Avoid | +| --- | --- | --- | +| **Does not immediately kill current round** | When stop is triggered, it will not interrupt the currently running round; it must wait for the current round to finish before actually stopping | In SLO hard deadline scenarios, set `timeout_seconds` to about 50% of the real business window to leave a buffer | +| **Actual termination time often exceeds `timeout_seconds`** | Direct consequence of the previous point—especially obvious when stuck in a long round | Add your own timeout to LLM calls inside `call_agent` (refer to 90s timeout in §4.2 CLI) | +| **Priority when multiple stoppers trigger simultaneously** | `framework_stopper` (`required_metrics` policy) first; then take the first one in algorithm-level stopper insertion order | The `OptimizeResult.stop_reason` field records the trigger; check it after the run to see which stopper fired | + +**`stop_reason` value reference** (`OptimizeResult.stop_reason`): + +``` +required_metrics_passing ← framework-level (highest priority) +score_threshold ← Reached target score +budget_exhausted ← max_metric_calls +timeout ← timeout_seconds +no_improvement ← max_iterations_without_improvement +max_candidate_proposals +max_tracked_candidates +user_requested_stop ← User touched optimize.stop file +completed ← No stopper triggered, gepa naturally finished +``` + +### 5.5 A Special Case: FAILED + +Normally `OptimizeResult.status = "SUCCEEDED"`—gepa finished the loop (natural end / stopper trigger both count). 
But there is one special status worth your attention: + +- **`status = "FAILED"`**: gepa threw an exception while running (most common: training/validation set loading failure, `gepa.optimize()` internal exception, reflection LM call failure) +- **At this time `best_prompts` is forcibly set to `baseline_prompts`**—ensuring the artifacts you get **will never be worse than baseline** +- **`update_source=True` will not write back** source prompt files when FAILED (see §3.4 decision table for details) + +Another easily confused point is "finished running but no improvement": in this case `status` is still `"SUCCEEDED"`, but `finish_reason="no_improvement"`, and `best_prompts == baseline_prompts`—`summary.txt` will show `baseline → baseline` (neither degradation nor improvement). This is expected, not a bug. + + +## 6 Cost and Concurrency + +How many LLM calls does one optimization run require? Which knobs affect call volume, which affect concurrency, which affect both? + +### 6.1 Where LLM Calls in One Optimization Come From + +LLM calls are divided into two parts—**evaluation side eats the vast majority**, reflection side is just a fraction: + +**Evaluation side (agent + judge)**: Each of the following "run once" evaluations triggers LLM calls— + +```text +Run one baseline evaluation: Run valset fully once ← Starting point, 1 time +Each reflective round: Sample N cases and run once + re-run candidate ← Main cost +Specific reflective round: Re-evaluate current best candidate on valset ← Determined by gepa +``` + +Actual LLM call count triggered by each "run once" ≈ **number of cases × `num_runs` × (agent call count per case + total judge call count across all metrics)**. 
Among them: + +| Multiplier | Source | Typical Value | +| --- | --- | --- | +| Agent call count per case | Evalset data; accumulate by turn count in multi-turn conversation | Single turn = 1, multi-turn = N | +| `evaluate.num_runs` | Run each case several times and take mean to eliminate LLM output variance | 1 (default, saves) / 2~3 (recommended, stable) | +| Judge call count per metric | Depends on metric type: `final_response_avg_score` type deterministic matching = 0 times; `llm_judge` / `llm_rubric_response` ≥ 1 time (however many are in `judge_models` array) | 0~3 | + +**Reflection side (reflection LM)**: + +```text +Each reflective round: 1 time (generate new candidate prompt) +Each merge round: 1 time (only when use_merge=true and multi-field) +``` + +Reflection side call count is much less than evaluation side—usually 5~20 times for a complete optimization. + +### 6.2 What to Read from result.json After Running + +Fields actually recorded in `OptimizeResult` (camelCase indexed in artifact `result.json`): + +| Field | Meaning | +| --- | --- | +| `totalMetricCalls` | Cumulative case-level evaluation count by gepa | +| `totalReflectionLmCalls` | Cumulative reflection LM call count (including retries) | +| `totalTokenUsage` | Cumulative tokens for reflection LM: `{prompt, completion, total}` | +| `durationSeconds` | Total wall-clock duration | + +When needing to estimate actual USD cost on the business side, use `totalTokenUsage` × LLM backend unit price to reverse-calculate reflection side; agent / judge side is pulled from LLM backend usage records (API console / billing reports). 
+ +### 6.3 Multiplier Effect of 4 Commonly Used Knobs + +Sorted by "magnitude of impact on total call volume" from large to small—when encountering optimization running out of budget, adjust the ones above first: + +| Knob | Multiplies By How Much | Cost of Turning Down | Details | +| --- | --- | --- | --- | +| `algorithm.max_metric_calls` | **Hard upper limit on total call volume**—gepa stops when cumulative reaches it | Too small → Stopped by it in the 1st round; cannot see any score improvement | [§4.7](#47) | +| `evaluate.num_runs` | **Multiply by N**—run each case N times and take mean | LLM output variance directly enters score when 1 (same prompt gets different scores on two runs); recommend ≥ 2 | [§4.5](#45) | +| `optimize.eval_case_parallelism` | **Does not affect total volume**, only affects **wall-clock time** and **instantaneous QPS** | Higher saves time but easily hits LLM backend rate limit | [§4.5](#45) | +| `algorithm.reflection_minibatch_size` | **Multiply by a few**—how many cases the reflection LM sees each round; evaluation side also calculates by this number | Too large → Reflection prompt explodes LLM context window | [§4.3](#43) | + +### 6.4 Want to Reasonably Set Thresholds? 
Run a Baseline First + +Before setting thresholds such as `timeout_seconds` / `max_metric_calls`, **first run a baseline with default configuration**—read two numbers from the artifacts: + +| Value to Measure | How to Test | How to Use | +| --- | --- | --- | +| **Typical single-round duration** | `rounds[*].durationSeconds` in `runs//result.json` (take median) | `timeout_seconds` should be at least single-round duration × 2, otherwise stop is triggered in round 1 and you cannot see optimization progress | +| **Single-round metric_calls** | Same as above, `totalMetricCalls / totalRounds` | `max_metric_calls` should be able to run through at least `max_iterations_without_improvement` rounds, otherwise budget always triggers stop first | + +**Example**: Baseline run shows 30 seconds per round, 4 metric_calls per round, CI window 5 minutes—then `timeout_seconds=120` (leave buffer), `max_metric_calls=24` (enough to run 6 rounds for `max_iterations_without_improvement=3` to trigger stop). + +### 6.5 Single-Round Instantaneous LLM QPS Control + +Number of LLM requests concurrently sent in a single round: + +```text +Single-round instantaneous LLM QPS ≈ eval_case_parallelism + × num_runs + × (agent calls per case + all judge calls) +``` + +**Typical scenario estimation**: 3 judges + `num_runs=2` + `eval_case_parallelism=4` + 1 agent call per case + 3 judge calls → about 32 LLM requests per round instantaneous. When LLM backend rate limit is 30 QPS, this configuration will inevitably trigger rate limiting. 
+ +**Two parameters to control instantaneous QPS** (sorted by effect): + +| Parameter | Impact | Applicable | +| --- | --- | --- | +| `eval_case_parallelism` | Directly reduces concurrent case count | First choice for most situations; set to `1` for serial execution in scenarios with intensive single-case calls such as black-box CLI, multi-judge (see [§4.2](#42), [§4.5](#45)) | +| `num_runs` | Reduces repeated evaluation per case | Sacrifices some variance stability; recommend only lowering after confirming LLM output variance is small | + +### 6.6 Reflection LM Selection and Configuration + +The output quality of the reflection LM directly determines prompt rewriting quality. Configuration location (`optimizer.json`): + +```jsonc +{ + "optimize": { + "algorithm": { + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { + "max_tokens": 4096, // Reflection prompt is long, leave enough output space + "temperature": 0.6 // Between 0.6~0.8, let LM be creative + } + } + } + } +} +``` + +**Two suggestions**: + +- **Can be configured independently from agent / judge**—the `reflection_lm` section is independent, business can choose different model (avoid "self-evaluation" bias, or purely because reflection tasks require higher model reasoning power) +- **Token usage is truly recorded**—the `totalTokenUsage` field will accumulate actual prompt + completion + total token count for reflection LM; reverse-calculate USD by LLM backend unit price + + +## 7 Complete API Reference + +Reference manual section, organized by "what parameter are you looking for". 
**Each table has a "Required" column** with three levels: + +- **Required**: Not passed/not configured → fail-fast error at startup +- **Optional**: Can be omitted; uses default value when not configured +- **Conditionally Required**: Can be omitted when looking at the entry alone, but **must be configured when certain conditions are satisfied**—the conditions are written in the "**Condition**" note that follows the corresponding table + +All fields are based on actual source code (source file path annotated in each table header). + +### 7.1 `AgentOptimizer.optimize` Parameter Table + +Source code: `trpc_agent_sdk/evaluation/_agent_optimizer.py:AgentOptimizer.optimize`. **11 keyword-only parameters**—must be passed in `key=value` form, positional parameters not accepted. + +| Parameter | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `config_path` | **Required** | `str` | — | optimizer.json configuration file path | +| `call_agent` | **Required** | `async (str) -> str` | — | Business agent adapter function; signature fixed as "accept query return str" | +| `target_prompt` | **Required** | `TargetPrompt` | — | Register which prompt fields are optimization targets (at least 1, otherwise error) | +| `train_dataset_path` | **Required** | `str` | — | Training evalset file path | +| `validation_dataset_path` | **Required** | `str` | — | Validation evalset file path; **must be different from `train_dataset_path`** (prevent data leakage, framework will normalize paths before comparing) | +| `output_dir` | **Required** | `str` | — | Artifact directory; created automatically if it doesn't exist | +| `callbacks` | Optional | `Optional[Callbacks]` | `None` | Evaluator lifecycle callbacks (rarely used) | +| `update_source` | Optional | `bool` | `False` | Whether to write back to source prompt files after successful optimization (decision table see [§3.4](#34-agentoptimizer)) | +| `verbose` | Optional | `int` | `1` | Terminal output verbosity: `0` silent / `1` default 
Rich panel / `2` plus gepa internal log forwarding | +| `extra_stop_callbacks` | Optional | `Optional[Sequence]` | `None` | Stoppers appended at runtime (SLO monitoring / kill switch, etc.); ordinary callable displays as `stop_reason="completed"`, use `_LabeledStopper` wrapper or expose `.label` attribute when needing stable labels | +| `extra_gepa_callbacks` | Optional | `Optional[Sequence]` | `None` | Gepa event callbacks appended at runtime (e.g., forwarding to dashboard); need to implement `gepa.core.callback.GEPACallback` protocol | + +**Return value**: `OptimizeResult` (see [§7.4](#74-optimizeresult--roundrecord-field-table) for details). + +**Fail-fast checks at startup** (`_validate_inputs`): + +| Situation When Check Fails | Throws | +| --- | --- | +| `output_dir` is empty string | `ValueError` | +| `target_prompt` did not register any fields | `ValueError` | +| `call_agent` is not async function (including `__wrapped__` check, supports `functools.partial` wrapped async) | `TypeError` | +| `train_dataset_path` and `validation_dataset_path` resolve to the same file (compared after normalizing with `os.path.normpath(os.path.abspath(...))`) | `ValueError` (prevent data leakage) | +| `evaluate.metrics` contains `tool_trajectory_avg_score` or `llm_rubric_knowledge_recall`—these two require session traces / tool intermediate_data, which cannot be obtained in `call_agent` black-box mode | `ValueError` | +| `algorithm.name` in config is not registered in `OPTIMIZER_REGISTRY` | `ValueError` (message lists all registered algorithm names) | +| `use_merge=true` and `TargetPrompt` field count < 2 | `UserWarning` (not fatal, but `mergeRoundsTotal` will always be 0) | + +### 7.2 `TargetPrompt` API Table + +Source code: `trpc_agent_sdk/evaluation/_target_prompt.py`. A container for registering multi-field prompts, supports both file source and callback source forms. 
+ +| Method | Signature | Behavior | +| --- | --- | --- | +| `add_path(name, path)` | `(str, str) -> Self` | Register file source field; `name` must be unique; returns self for chained calls | +| `add_callback(name, *, read, write)` | `(str, *, AsyncRead, AsyncWrite) -> Self` | Register callback source field; `read: async () -> str`, `write: async (str) -> None` must both be async; `name` must be unique | +| `names()` | `() -> list[str]` | Return field names (in registration order) | +| `describe_source(name)` | `(str) -> str` | File source returns path; callback source returns literal `""`; unknown name throws `KeyError` | +| `read(name)` | `async (str) -> str` | Read single field | +| `read_all()` | `async () -> dict[str, str]` | Read all fields (in registration order) | +| `write_all(prompts)` | `async (dict[str, str]) -> None` | **Atomically write all fields** (see contract below for details) | + +**Atomicity contract of `write_all`** (from source code comments): + +1. **File source atomic write**: First write to `.tmp`, then `os.replace` rename (POSIX guarantees rename atomicity) +2. **Failure rollback**: When any file write fails, already successfully written files roll back to pre-call content, clean up residual `.tmp`, original exception normally re-raised +3. **Rollback itself fails**: Original exception is preserved through `__context__`, and `_RollbackError` is raised listing each field's rollback failure details—rollback is best-effort, one field's failure does not skip subsequent ones +4. **Callback source does not rollback**: After file source writes successfully, then run callback sources in order; when callback source fails, file source rolls back to baseline, but **callback source itself does not rollback** (idempotency is caller's responsibility) + +**Key validation of `write_all`**: The key set of incoming `prompts` must **exactly equal** the registered field name set, otherwise throws `ValueError`. 
+ +### 7.3 `optimizer.json` Configuration Items Table + +Source code: `trpc_agent_sdk/evaluation/_optimize_config.py`. pydantic schema, **supports both camelCase and snake_case keys**. Top-level structure: + +```jsonc +{ + "evaluate": { ... }, // Evaluation section (same schema as AgentEvaluator) + "optimize": { // Optimizer section + "eval_case_parallelism": 4, + "stop": { ... }, // Framework-level stop + "algorithm": { ... } // Algorithm block (including reflection_lm) + } +} +``` + +#### 7.3.1 `evaluate` Section + +Source code: `_eval_config.py:EvalConfig`. + +| Field | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `metrics` | **Conditionally Required** (see below) | `Optional[list[dict]]` | `None` | Metric array, each containing `metric_name` / `threshold` / `criterion`. **When `metrics` is configured, `criteria` is ignored** | +| `criteria` | **Conditionally Required** (see below) | `dict[str, Any]` | `{}` | Old-style shorthand: `metric_name → threshold` or `{threshold, criterion}` | +| `num_runs` | Optional | `int` | `1` | How many times to run each case and take mean (eliminate LLM output variance); `≥ 2` recommended | +| `user_simulator_config` | Optional | `Optional[Any]` | `None` | User simulator configuration (multi-turn scenarios; rarely used) | + +**Condition**: At **least 1** of `metrics` and `criteria` must be configured—when both are empty, `evaluate.get_eval_metrics()` returns empty list, and startup will report error due to no metrics. New integrations recommend using `metrics` (more structured), `criteria` is mainly kept for compatibility with old configurations. + +#### 7.3.2 `optimize` Section + +Source code: `_optimize_config.py:OptimizeConfig`. 
+ +| Field | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `eval_case_parallelism` | Optional | `int` | `4` | Case concurrency within same round (does not affect total call volume, affects instantaneous QPS) | +| `stop` | Optional | `FrameworkStopConfig` | `{required_metrics: "all"}` | Framework-level stop section (see [§7.3.5](#735-optimizestop-section) for details) | +| `algorithm` | **Required** | `GepaReflectiveAlgo` | — | Algorithm block (see [§7.3.3](#733-optimizealgorithm-section) for details) | + +#### 7.3.3 `optimize.algorithm` Section + +Source code: `_optimize_config.py:GepaReflectiveAlgo`. All adjustable parameters for the `gepa_reflective` algorithm. + +> **Hard constraint**: Among the **last 6 stopper fields** in the table, **at least 1 must be configured**—if all are left empty (default `None`), it will be rejected by `_require_at_least_one_stop_condition`, throwing `ValueError` fail-fast. This is why they are marked as "Conditionally Required". 
+ +**Basic fields**: + +| Field | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `name` | **Required** | `Literal["gepa_reflective"]` | — | Algorithm selector; currently the only valid value | +| `reflection_lm` | **Required** | `OptimizeModelOptions` | — | Reflection LM configuration (see [§7.3.4](#734-optimizealgorithmreflection_lm-section) for details) | +| `seed` | Optional | `int` | `42` | Random seed; keep it identical across both configurations when A/B testing | + +**Search behavior fields**: + +| Field | Required | Type | Default | Values and Description | +| --- | --- | --- | --- | --- | +| `candidate_selection_strategy` | Optional | Literal | `"pareto"` | `pareto` select from frontier (default recommended) / `current_best` use current best / `epsilon_greedy` exploration-exploitation / `top_k_pareto` random from top K of frontier | +| `module_selector` | Optional | `str` | `"round_robin"` | Which field to modify this round in multi-field: `round_robin` rotate in registration order / `all` select all / `random` random | +| `frontier_type` | Optional | Literal | `"instance"` | Pareto frontier granularity: `instance` one best per case / `objective` one per metric / `hybrid` two-layer / `cartesian` Cartesian product | +| `reflection_minibatch_size` | Optional | `Optional[int]` | `None` | Minibatch size for each round's reflection; `None` lets gepa decide | +| `reflection_history_top_k` | Optional | `int` (0~5) | `2` | How many historical best responses to give reflection LM for each case; 0 disables, upper limit 5 | +| `perfect_score` | Optional | `float` | `1.0` | "Perfect score" threshold (used with `skip_perfect_score`) | +| `skip_perfect_score` | Optional | `bool` | `True` | Skip cases that already have perfect score during reflection | + +**Multi-field fusion (merge) fields**: + +| Field | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `use_merge` | Optional | `bool` | `False` | Enable 
merge round; **only actually triggers in multi-field (≥2)**, never triggers in single-field and won't report error (only `UserWarning`) | +| `max_merge_invocations` | Optional | `int` | `5` | Upper limit on merge trigger count | +| `merge_val_overlap_floor` | Optional | `int` | `5` | Minimum val set case overlap count to trigger merge | + +**Performance fields**: + +| Field | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `cache_evaluation` | Optional | `bool` | `False` | Cache (candidate, case) scores; skip directly on repeated evaluation | +| `track_best_outputs` | Optional | `bool` | `False` | Track best output for each case | + +**6 stop condition items**—**configure at least 1** (OR semantics trigger): + +| Field | Required | Type | Default | Trigger Condition | +| --- | --- | --- | --- | --- | +| `max_metric_calls` | Conditionally Required | `Optional[int]` | `None` | Cumulative case-level evaluation count ≥ N → stop | +| `max_iterations_without_improvement` | Conditionally Required | `Optional[int]` | `None` | N consecutive rounds without best valset improvement → stop | +| `timeout_seconds` | Conditionally Required | `Optional[float]` | `None` | Wall-clock exceeds N seconds → stop | +| `score_threshold` | Conditionally Required | `Optional[float]` | `None` | Best valset score ≥ N → stop | +| `max_candidate_proposals` | Conditionally Required | `Optional[int]` | `None` | Candidate proposal count ≥ N → stop | +| `max_tracked_candidates` | Conditionally Required | `Optional[int]` | `None` | Pareto candidate pool size ≥ N → stop | + +**Condition**: At least 1 of the 6 items must be non-`None`, otherwise fail-fast at startup. See [§4.7 SLO Hard Constraints](#47) for details. + +#### 7.3.4 `optimize.algorithm.reflection_lm` Section + +Source code: `_optimize_model_options.py:OptimizeModelOptions`. Reflection LM connection configuration. 
+ +> **Only need to configure 4 in daily use**: `model_name` / `base_url` / `api_key` / `generation_config` (leave others as default). The 6 items marked "advanced" in the table below generally do not need to be touched. + +| Field | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `model_name` | **Required** | `str` | `""` | Model name (e.g., `"gpt-4o-mini"`); empty string equals not configured, will report error at startup | +| `base_url` | Optional | `Optional[str]` | `None` | Custom endpoint URL | +| `api_key` | Optional | `str` | `""` | API key (most providers must provide, otherwise will report error at call stage) | +| `generation_config` | Optional | `Optional[dict]` | `None` | Generation parameters; typical: `{"max_tokens": 4096, "temperature": 0.6}` | +| `provider_name` | Advanced | `str` | `""` | Provider name; empty / `"openai"` goes to `OpenAIModel`, other values go to `ModelRegistry.create_model("{provider}/{model}")` | +| `variant` | Advanced | `str` | `""` | OpenAI-compatible variant (only when provider is openai) | +| `extra_fields` | Advanced | `Optional[dict]` | `None` | Extra fields transparently passed to underlying model | +| `num_samples` | Advanced | `Optional[int]` | `None` | Number of samples | +| `weight` | Advanced | `float` | `1.0` | Weight (multi-judge scenarios) | +| `think` | Advanced | `Optional[bool]` | `None` | Whether to enable thinking mode | + +**Field values support environment variable expansion**—`"${TRPC_AGENT_API_KEY}"` will be automatically replaced. + +#### 7.3.5 `optimize.stop` Section + +Source code: `_optimize_config.py:FrameworkStopConfig`. 
+ +| Field | Required | Type | Default | Values | +| --- | --- | --- | --- | --- | +| `required_metrics` | Optional | `Optional[Union[Literal["all"], list[str]]]` | `"all"` | `"all"`: all metrics must reach threshold; `["m1", "m2"]`: listed metrics must reach threshold (other metrics still participate in evaluation but do not affect early stop); `null` or `[]`: disable framework-level early stop (rely only on algorithm-level stoppers) | + +**List form validation**: Metric names in the list must be findable in `evaluate.metrics[]`, otherwise `OptimizeConfigFile._validate_required_metrics_against_evaluate` throws `ValueError` at startup, error message lists "unknown metrics" and "available metrics" checklist. + +### 7.4 `OptimizeResult` + `RoundRecord` Field Table + +Source code: `trpc_agent_sdk/evaluation/_optimize_result.py`. This is the return value of `optimize()`, and also the content of `runs//result.json`. + +> **Important convention**: Both `OptimizeResult` and `RoundRecord` are based on `EvalBaseModel` (`alias_generator=to_camel`). **Python in-memory uses snake_case, all converted to camelCase when serialized to JSON**—use camelCase when indexing `result.json` (`bestPassRate` not `best_pass_rate`), common pitfall. In the table below, the "Field" column uses Python names (snake_case), switch to camelCase when reading JSON. 
+ +#### 7.4.1 `OptimizeResult` Top-Level Fields + +**Core result fields**: + +| Field (snake_case) | Type | Meaning | +| --- | --- | --- | +| `status` | `Literal["SUCCEEDED", "FAILED", "CANCELED"]` | Final status; when `FAILED`, `best_prompts = baseline_prompts` | +| `finish_reason` | Literal | `completed` / `perfect_pass_rate` / `no_improvement` / `error` | +| `stop_reason` | `Optional[StopReason]` | Which stopper triggered (see [§5.4](#54-stop-timing-complete-current-round-before-stopping) for details); `None` when FAILED early stop | +| `error_message` | `str` | Error message when FAILED (default `""`) | +| `algorithm` | `str` | Algorithm name (e.g., `"gepa_reflective"`) | + +**Score fields**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `baseline_pass_rate` | `float` | Pass rate of baseline on valset | +| `best_pass_rate` | `float` | Pass rate of optimal candidate on valset | +| `pass_rate_improvement` | `float` | `best - baseline` | +| `baseline_metric_breakdown` | `dict[str, float]` | Mean score of each metric for baseline | +| `best_metric_breakdown` | `dict[str, float]` | Mean score of each metric for optimal candidate | +| `metric_thresholds` | `dict[str, float]` | Threshold for each metric (copied from `evaluate.metrics[].threshold`) | +| `per_metric_best_candidates` | `dict[str, list[int]]` | Pareto frontier candidate index for each metric (0-based); empty = algorithm does not expose this information | + +**Prompt fields**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `baseline_prompts` | `dict[str, str]` | Starting prompt content (keyed by TargetPrompt field names) | +| `best_prompts` | `dict[str, str]` | Optimal candidate prompts; = `baseline_prompts` when `FAILED` (ensuring artifacts **will never be worse than baseline**) | + +**Round fields**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `total_rounds` | `int` | How many rounds were run | +| `rounds` | `list[RoundRecord]` | Each round's record (see §7.4.2 for details) | + 
+**Statistics and time fields**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `total_reflection_lm_calls` | `int` | Cumulative reflection LM call count (including retries) | +| `total_token_usage` | `dict[str, int]` | Cumulative tokens for reflection LM: `{prompt, completion, total}` | +| `duration_seconds` | `float` | Total wall-clock duration | +| `started_at` / `finished_at` | `str` | ISO-8601 timestamps | + +**Others**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `schema_version` | `str` | Default `"v1"`; bump when artifact schema upgrades | +| `extras` | `dict[str, Any]` | Custom business fields; optimizer does not read or write | + +#### 7.4.2 `RoundRecord` Fields (One Per Round) + +**Basic round information**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `round` | `int` | 1-based round number | +| `kind` | `Literal["reflective", "merge"]` | Reflection round / fusion round | +| `started_at` | `str` | ISO-8601 timestamp | +| `duration_seconds` | `float` | Wall-clock duration of this round | + +**Rewrite situation**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `optimized_field_names` | `list[str]` | Field names rewritten by reflection LM in this round | +| `candidate_prompts` | `dict[str, str]` | Full field content of this round's candidate | +| `accepted` | `bool` | Whether accepted as new best | +| `acceptance_reason` | `str` | Human-readable explanation of acceptance decision | +| `per_field_diagnosis` | `dict[str, str]` | Diagnosis text given by reflection LM for each field | + +**Scoring situation**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `validation_pass_rate` | `float` | Pass rate of this round on valset | +| `metric_breakdown` | `dict[str, float]` | Mean score of each metric on valset this round; empty = this round did not run valset | +| `failed_case_ids` | `list[str]` | Failed case IDs on valset this round | +| `failed_cases_truncated` | `int` | Number of failed cases cut off due to token budget | 
+| `train_minibatch_size` | `int` | Minibatch size of this round; 0 = skip, not sampled | +| `train_subsample_parent_score` | `Optional[float]` | Parent candidate's score on minibatch; `None` = not run | +| `train_subsample_candidate_score` | `Optional[float]` | New candidate's score on minibatch; `None` = not run | +| `skip_reason` | `Optional[str]` | Skip reason (e.g., `"subsample perfect"`, `"no proposal"`) | +| `error_message` | `Optional[str]` | Algorithm error message this round | + +**Statistical fields**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `reflection_lm_calls` | `int` | Reflection LM call count this round (including retries) | +| `round_token_usage` | `dict[str, int]` | Reflection LM tokens this round: `{prompt, completion, total}` | +| `budget_used` | `Optional[int]` | Cumulative used metric_calls | +| `budget_total` | `Optional[int]` | Configured budget upper limit (e.g., `max_metric_calls`) | + +**`extras`** (`dict[str, Any]`): Custom business fields; optimizer does not read or write. + +#### 7.4.3 `OptimizeResult` Utility Methods + +| Method | Behavior | +| --- | --- | +| `dump_to(path)` | Serialize to JSON file (`indent=2`, `by_alias=True`) | +| `OptimizeResult.from_file(path)` | classmethod, deserialize from JSON | +| `format_summary(*, output_dir, update_source)` | Generate human-readable text for `summary.txt` | + + +## 8 Artifacts and Directory Conventions + +Each time `optimize()` is run, the framework persists a complete set of audit artifacts under `output_dir`. All writes are **atomic**—SIGINT / process crash will not leave half-written files. 
+ +### 8.1 Directory Layout + +```text +runs/<timestamp>/ +├── result.json Complete OptimizeResult serialization (programmatic entry) +├── summary.txt Human-readable summary (see baseline → best at a glance) +├── config.snapshot.json Complete snapshot of optimizer.json used this run (reproducible) +├── run.log Single-line status, CI parsing friendly +│ +├── baseline_prompts/ Prompt snapshots before running (one .md per field) +│ ├── system_prompt.md +│ └── ... +│ +├── best_prompts/ Optimal candidate from optimization (one .md per field) +│ ├── system_prompt.md +│ └── ... +│ +└── rounds/ Complete RoundRecord for each round + ├── round_001.json + ├── round_002.json + └── ... +``` + +Role of each file: + +| File / Directory | When Written | What It's For | +| --- | --- | --- | +| `result.json` | Optimization ends (including failure) | Most authoritative artifact for programmatic reading. Complete `OptimizeResult` serialization (see [§7.4](#74-optimizeresult--roundrecord-field-table) for details). **Field names are camelCase** | +| `summary.txt` | Optimization ends (only on success) | Human-readable summary: `baseline → best` trend, metric breakdown, all best fields + character count, artifact directory index | +| `config.snapshot.json` | Optimization starts | Complete snapshot of `optimizer.json` used this run—reuse it later to re-run this exact configuration | +| `run.log` | Optimization ends | Single line: ` status=... algorithm=...
baseline=0.4 best=0.85 delta=+0.45 rounds=10 duration_seconds=120.5`; CI platform grep-friendly | +| `baseline_prompts/<field>.md` | Optimization starts | Content snapshot of each TargetPrompt field before running—**written regardless of `update_source` setting** (most important fallback artifact) | +| `best_prompts/<field>.md` | Optimization ends (only when result exists) | Optimal candidate prompts—when `update_source=False`, this is the most valuable artifact (awaiting manual review and synchronization) | +| `rounds/round_<NNN>.json` | Each round ends | Complete `RoundRecord` serialization (see [§7.4.2](#742-roundrecord-fields-one-per-round) for details); 3-digit zero-padded numbering for easy sorting | + +### 8.2 Sentinel File: Letting Users Actively Stop Optimization + +Source code: the end of `_optimize_gepa_reflective.py:_build_stop_callbacks`. + +During optimization, manually create an `optimize.stop` file under `output_dir`: + +```bash +touch runs/<timestamp>/optimize.stop +``` + +The framework detects this file at the beginning of the next round and stops (implemented by `gepa.utils.FileStopper`), with `stop_reason="user_requested_stop"`. **Typical use case**: the result is already good enough partway through the run, or you temporarily need to release LLM quota—this is more graceful than Ctrl+C because the current round completes and the artifacts on disk stay clean. + +### 8.3 Atomic Disk Persistence Guarantee + +**All artifacts are written atomically via tmp + `os.replace`**—POSIX guarantees that rename is atomic, so if the process is killed or power fails, `output_dir` contains either the intact old file or the intact new file, **never a half-written one**.
+ +Source code: Two utility functions in `_agent_optimizer.py`: + +- `_atomic_write_text(path, content)`: first writes to a temporary `.tmp` file, then calls `os.replace(tmp, path)` +- `_mask_sigint`: context manager that shields SIGINT during `_persist_artifacts` (so that a second Ctrl+C cannot interrupt the persistence performed in `finally`) + +**Source prompt file write-back when `update_source=True`**: uses `TargetPrompt.write_all`, which also guarantees atomicity across **multiple fields**—if any field write fails, every field already written is rolled back to its pre-call content (see `write_all` contract in [§7.2](#72-targetprompt-api-table) for details). + +> **Extreme fault tolerance**: If `os.replace` itself fails when `update_source=True` writes source files (e.g., target file's directory was concurrently deleted), the framework will **explicitly call `write_all(baseline)` to restore source files to pre-run content**, then re-raise the original exception—ensuring your application never ends up with a "half-optimized" source file. + + +## 9 Want to Extend Yourself? + +Source code main entry: `_optimize_registrations.py`. The framework supports three types of extensions through a **registration mechanism**, so there is no need to fork the SDK. + +### 9.1 Register New Algorithm + +Source code: `_base_optimizer.py:BaseOptimizer` + `_optimize_registry.py:OPTIMIZER_REGISTRY`. + +Write a `BaseOptimizer` subclass, implement `async def run(self, *, reporter=None) -> OptimizeResult`, and register it in `OPTIMIZER_REGISTRY`: + +```python +from trpc_agent_sdk.evaluation._base_optimizer import BaseOptimizer +from trpc_agent_sdk.evaluation._optimize_registry import OPTIMIZER_REGISTRY +from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult + + +class MyOwnOptimizer(BaseOptimizer): + async def run(self, *, reporter=None) -> OptimizeResult: + # Your algorithm main loop.
Base class has already injected: + # self.config - OptimizeConfigFile (including evaluate / optimize two sections) + # self.call_agent - Business agent adapter function + # self.target_prompt - TargetPrompt instance + # self.train_dataset_path / self.validation_dataset_path + # self.callbacks / self.output_dir + # self.extra_stop_callbacks / self.extra_gepa_callbacks + ... + return OptimizeResult(...) + + +# Registration: second parameter must be BaseOptimizer subclass, otherwise register() throws TypeError +OPTIMIZER_REGISTRY.register("my_own_algo", MyOwnOptimizer) +``` + +Business side usage: Change `optimize.algorithm.name` in `optimizer.json` to `"my_own_algo"`, the framework finds your class through `OPTIMIZER_REGISTRY.get(...)` at startup, instantiates it, and runs `run()`. + +**Note**: `GepaReflectiveAlgo.name` is currently `Literal["gepa_reflective"]`—**new algorithms need a new `pydantic.BaseModel` configuration class** (e.g., `MyOwnAlgo`), and modify `OptimizeConfig.algorithm` field to discriminated union (see `_optimize_config.py:OptimizeConfig` docstring for details). + +### 9.2 Register Custom Stopper + +Source code: `AgentOptimizer.optimize`'s `extra_stop_callbacks` parameter in `_agent_optimizer.py`. 
+ +Inject via `extra_stop_callbacks` at runtime—**no need to modify configuration file**: + +```python +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _LabeledStopper + + +class MySloMonitorStopper: + """Custom stopper: check external SLO monitoring system, stop when threshold is exceeded.""" + + def __init__(self, slo_client): + self._slo = slo_client + self.last_triggered = False + + def __call__(self, gepa_state=None) -> bool: + if self._slo.is_p99_breached(): + self.last_triggered = True + return True + return False + + +# Usage: +stopper = MySloMonitorStopper(slo_client) +result = await AgentOptimizer.optimize( + ..., + extra_stop_callbacks=[ + # Ordinary stopper: stop_reason displays as "completed" + stopper, + + # When wanting stable stop_reason label, use _LabeledStopper wrapper: + # _LabeledStopper(stopper, "slo_breach"), # But "slo_breach" is not in StopReason Literal, pydantic will reject + ], +) +``` + +**Interface contract** (see `_LabeledStopper`): + +- Must have `__call__(self, gepa_state=None) -> bool` method +- `True` means stop +- Should have `last_triggered: bool` attribute for `_classify_stop_reason` to read + +**Two behaviors of `stop_reason`**: + +- Ordinary callable / custom class: `stop_reason` displays as `"completed"` when triggered (gepa doesn't know why you stopped) +- Wrapped with `_LabeledStopper(inner, label)`: `label` must be a legal value of `StopReason` Literal (see `_optimize_result.py`); need to extend Literal type when customizing new label + +### 9.3 Register Custom Evaluation Callback + +Source code: `AgentOptimizer.optimize`'s `extra_gepa_callbacks` parameter in `_agent_optimizer.py`. + +Access gepa internal events through `extra_gepa_callbacks`—typical use: forwarding to dashboard / real-time monitoring metrics. + +```python +class MyDashboardCallback: + def on_proposal_end(self, *args, **kwargs) -> None: + # Report to Grafana / WandB / internal monitoring + ... 
+ + # gepa silently ignores missing methods; implement only the protocol methods you need + + +result = await AgentOptimizer.optimize( + ..., + extra_gepa_callbacks=[MyDashboardCallback()], +) +``` + +**Protocol constraints**: Each callback implements any subset of the methods in the `gepa.core.callback.GEPACallback` protocol (`on_iteration_start` / `on_proposal_start` / `on_proposal_end` / `on_valset_breakdown` / ...). **gepa silently ignores missing methods in callback**, so you only need to implement the methods you care about. + + +## 10 FAQ + +**Q: After a run, `bestPassRate` in `result.json` is the same as `baselinePassRate` and every `accepted` is false—is it a bug?** + +Not a bug. Optimization didn't find a candidate better than baseline—`status="SUCCEEDED"` + `finish_reason="no_improvement"` is the typical combination for this situation, and `best_prompts` equals `baseline_prompts`. Possible reasons: baseline is already very good, `max_metric_calls` is too small to reach an improvement point, training set and validation set have very different distributions, metric noise is too large (recommend increasing `num_runs`). + +--- + +**Q: `update_source=True` crashed during run, were source prompt files corrupted?** + +No. Two layers of protection: (1) When optimization fails (`status="FAILED"`), the framework simply doesn't call `write_all`; (2) Even if `write_all` itself fails, source files are atomically rolled back through tmp + `os.replace` (see [§8.3](#83-atomic-disk-persistence-guarantee) for details). + +--- + +**Q: Can I modify `optimizer.json` mid-run?** + +No. `optimizer.json` is loaded once at startup; subsequent modifications will not be read. The sentinel file `optimize.stop` is the only supported "runtime intervention" (see [§8.2](#82-sentinel-file-letting-users-actively-stop-optimization) for details).
+ +--- + +**Q: Can I run with a very small training set (< 5 cases)?** + +Yes, but effect is poor: (1) Reflection LM sees too few feedback samples, rewrite direction is unstable; (2) Small training set easily lets advanced configuration overfit (refer to [§4.8](#48)). Recommend at least 5~10 cases; consider manual tuning first when < 5. + +--- + +**Q: How to handle retries when `call_agent` internally sends HTTP / RPC?** + +Handle it yourself within `call_agent`. The framework does not do retries for business at LLM / service call layer—designed to keep `call_agent` as a black box. If the call fails, that case's evaluation score counts as 0, and the reflection LM will see the error message (refer to §5.2 Reflection LM feedback structure). + +--- + +**Q: Can multiple `optimize()` runs happen simultaneously, sharing one `output_dir`?** + +No. Multiple processes writing to one `output_dir`, atomic write constraint protects single files from being half-written, but **multiple processes overwrite files mutually**—`result.json` / `rounds/round_001.json`, etc. will step on each other. Use independent timestamp subdirectory for each run. + +--- + +**Q: When using black-box `call_agent` mode, can I use metrics like `tool_trajectory_avg_score`?** + +No. Black-box `call_agent` mode cannot obtain session traces / tool intermediate_data, the framework will fail-fast and reject at startup (see [§7.1](#71-agentoptimizeroptimize-parameter-table) startup check table for details). Switch to response-level metrics: `final_response_avg_score` / `llm_rubric_response` / `llm_final_response`. + +--- + +**Q: After running with `update_source=False`, source prompts are still in place, but `target_prompt.write_all` was called repeatedly during the process?** + +Yes. The optimizer main loop calls `write_all` every time a new candidate is generated to write the candidate to source files registered with `add_path`—this is to let the next `call_agent` call read the new prompt. 
**The `finally` phase will automatically `write_all(baseline_snapshot)` to roll back source files to baseline content** (source code: `cleanup_done` sentinel in `optimize` in `_agent_optimizer.py`). So after `update_source=False` finishes running, source files are **completely consistent with before running**—provided that `TargetPrompt.write_all` didn't throw an error during the rollback phase (in extreme cases when it throws an error, the framework will log a warning but will not affect `result.json` / `best_prompts/` artifact production). + +--- + +**Q: How to "re-run" last optimization result?** + +Re-run `runs//config.snapshot.json`—it is the complete configuration snapshot from last time. But LLM output has randomness, even with consistent configuration you may get different best_prompts; fixing the `seed` field can reduce (not eliminate) this randomness. Must lock seed when A/B testing (refer to [§4.8](#48)). diff --git a/docs/mkdocs/zh/optimization.md b/docs/mkdocs/zh/optimization.md new file mode 100644 index 00000000..2a24e1eb --- /dev/null +++ b/docs/mkdocs/zh/optimization.md @@ -0,0 +1,2038 @@ +# Prompt 自优化(AgentOptimizer) + +`AgentOptimizer` 是 tRPC-Agent-Python 的 **prompt 自优化模块**:它把 prompt 工程的迭代过程——失败案例分析、改写、回归验证、版本管理——整体托管为一条可复现的自动化流水线,把工程师从手工试错中解放出来。 + +> **这里 "prompt" 的外延**:在 agent 应用中,prompt 不仅指狭义的 system prompt,还包括所有以自然语言驱动 agent 行为的文本资产——skill 描述、rule 规范、sub-agent 协同指令、工具使用说明等。它们的本质都是被 LLM 解读的自然语言文本;只要会左右 agent 决策,都可以作为 `AgentOptimizer` 的优化目标。 + +模块由四个子模块组成,对外通过单一入口 `AgentOptimizer.optimize` 驱动: + +| 子模块 | 职责 | +|---|---| +| **优化算法** | 反思—评估—保留循环;当前内置 [GEPA](https://github.com/gepa-ai/gepa)(Genetic-Evolutionary Pareto,MIT License),通过 `OPTIMIZER_REGISTRY` 可扩展接入其他算法 | +| **评测桥接** | 复用 `AgentEvaluator`,让优化过程与日常回归共用同一份 `EvalSet` 与 metric 配置 | +| **Prompt 管理** | `TargetPrompt` 统一抽象 prompt 字段的读写;支持本地文件(path)与任意后端(callback)两种源 | +| **运行编排** | 资源调度、stopper(停机器)、产物原子落盘、SIGINT 信号安全 | + +`AgentOptimizer` 把"prompt 调优"重新定义成一个**有边界、可复现、可审计**的工程问题: + +| 维度 | 
表达方式 | +|---|---| +| 优化目标 | `evaluate.metrics[]` —— 数值化、可重复评估的指标集合 | +| 决策变量 | `TargetPrompt` 注册的 prompt 字段(一个或多个) | +| 搜索过程 | reflection LM(反思型 LLM)驱动的反思—评估—保留循环(详见 [§5](#5-gepa-是怎么工作的)) | +| 终止条件 | 6 种内置 stopper + 用户自定义停机器(详见 [§4.7](#47)) | +| 产物 | `OptimizeResult` 对象 + `runs/<时间戳>/` 全量审计目录(详见 [§8](#8-产物与目录约定)) | + +> **前置阅读**:[Agent 评测](evaluation.md) —— 优化建立在评测之上;本文假设读者已了解 `EvalSet` 与 `metric` 的基本概念。 + +--- + +## 1 这是什么 / 解决什么问题 + +### 1.1 解决的问题 + +在 agent 应用进入业务关键链路后,prompt(含 skill、rule 等所有驱动 agent 行为的自然语言文本)是迭代成本最高的资产之一:手工调优依赖工程师对失败案例的归纳能力,规模化后回归风险快速放大;多 sub-agent 链路上 prompt 字段之间的耦合让单字段优化失去意义;模型升级、工具变更、场景扩张都会让"昨日最优"的 prompt 在今日失效。 + +`AgentOptimizer` 模块把这套迭代过程**完整地工程化**: + +- **优化目标显式化**——把"什么算好"沉淀为 metric + threshold 的数值契约,可被评测、优化、CI/CD 共享 +- **搜索过程算法化**——反思—评估—保留循环替代人工试错,过程可重放、结果可比较 +- **多 prompt 联合优化**——支持同时优化多个字段(如 router + worker + summarizer 的指令、CLAUDE.md + SKILL.md),并通过 GEPA 的 merge 机制做跨字段搜索 +- **运行过程可审计**——每轮 reflection 输入、候选改动、评估分数、接受/拒绝原因都落盘到 `runs/<时间戳>/`,支持事后追溯 +- **结果可控可回滚**——`update_source` 决定是否回写源 prompt;`TargetPrompt` 提供原子写入与失败回滚,写盘半中断或 SIGINT 二次中断都不会损坏源文件 + +### 1.2 与评测模块的关系 + +`AgentEvaluator` 与 `AgentOptimizer` 构成**评测—优化闭环**的两端: + +| 模块 | 角色 | 输出 | +|---|---|---| +| `AgentEvaluator`([evaluation.md](evaluation.md)) | 度量当前 prompt 的质量 | 每条 case 的 pass/fail + 各 metric 分 | +| `AgentOptimizer`(本文) | 在度量结果上搜索更优 prompt | 最优 prompt + 全程优化历史 | + +二者共享同一份 `EvalSet`、同一套 metric 配置、同一个 `call_agent`。一份资产同时支撑日常回归(pytest 跑 `AgentEvaluator`)与定期优化(夜间窗口跑 `AgentOptimizer`,详见 [§4.6 CI 闭环](#46))。 + +### 1.3 适用边界 + +`AgentOptimizer` 的有效性取决于三个前提: + +1. **评测信号足够稳定**。判分本身的方差大于 prompt 改写带来的提升时,优化方向不可信。建议先在 `AgentEvaluator` 上跑 `num_runs=3` 观察 metric 跨次一致性,再开始优化。 +2. **预算与搜索空间匹配**。一次典型的小规模优化在 `max_metric_calls=30~60`(一次 case-level 评估算一次 metric_call)量级、reflection LM 调用 5~20 次、运行 1~10 分钟、消耗几美元到几十美元(详见 [§6 成本与并发](#6-成本与并发))。预算显著低于该量级时,应先在 `AgentEvaluator` 上完成基线调优。 +3. 
**prompt 有可优化的语义结构**。少于 20 字的硬编码或仅作占位拼接的 prompt,搜索空间过窄;GEPA 反思在这种场景下退化为同义改写。 + +不在以上前提内的场景,应优先选择 [`AgentEvaluator`](evaluation.md) 持续观察,而非启动优化。 + + + + + +## 2 5 分钟 Quickstart + +完整代码与数据:[`examples/optimization/quickstart/`](../../../examples/optimization/quickstart/)。 + +### 2.1 示例任务 + +本示例的 agent 是一个 **小学算术应用题求解器**:接收自然语言描述的算术题(如"小明早上买了 4 个苹果,下午又买了 7 个苹果,他一共有多少个苹果?"),输出带单位的数字答案(如"答案:11 个")。 + +agent 行为由两个 prompt 文件共同驱动,二者就是本次优化的目标: + +| 优化目标 | 路径 | 在 agent 中的角色 | +|---|---|---| +| **system_prompt** | `agent/prompts/system.md` | 角色与回答风格定义(如"你是一个数学助教,用清晰的中文回答") | +| **skill** | `agent/prompts/skill.md` | 解题方法论(如"先识别题型 → 列式 → 计算 → 写出带单位的答案") | + +评测从两个维度同时打分,两条都达标才算 agent 通过: + +| 评测指标 | 类型 | 阈值 | 判分方式 | +|---|---|---|---| +| `final_response_avg_score` | 文本匹配 | 1.0 | agent 输出必须**包含**参考答案文本(如 "答案:11 个"),大小写不敏感 | +| `llm_rubric_response` | LLM 裁判 | 0.66 | 由独立 LLM 按三条 rubric 打分取均值:① 答案数值与参考一致 ② 推理步骤清晰 ③ 答案带正确单位 | + +数据集规模:训练集 5 条、验证集 3 条。 + + +### 2.2 准备环境 + +```bash +pip install "trpc-agent-py[optimize]" + +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +`[optimize]` extra 包含 `gepa`(反思算法实现)与 `rich`(终端进度面板)。 + + +### 2.3 目录结构 + +```text +examples/optimization/quickstart/ +├── agent/ +│ ├── agent.py # 定义 create_agent() 工厂函数 +│ ├── config.py # 模型 / 凭据从环境变量读取 +│ └── prompts/ +│ ├── system.md # baseline system prompt(待优化) +│ └── skill.md # baseline skill 文档(待优化) +├── train.evalset.json # 5 条训练用例(反思 minibatch 来源) +├── val.evalset.json # 3 条验证用例(每轮全量评估,决定候选是否被接受) +├── optimizer.json # 算法 + metric 配置 +└── run_optimization.py # 入口脚本 +``` + +> 训练集与验证集必须是不同文件,框架启动期会校验路径不重合。 + +### 2.4 核心代码 + +`run_optimization.py` 由三段构成,对应优化器对外的三个核心抽象。 + +**第一段:`call_agent` —— 业务桥接函数**(详见 [§3.4](#34-call_agent)) + +签名固定为 `async def(query: str) -> str`。框架通过它驱动 agent 完成单次推理;任意形态的 agent(`LlmAgent`、HTTP 服务、子进程 CLI 等)都通过这层桥接接入。 + +```python +async def call_agent(query: str) -> str: + # 每次重读 prompt 文件 → GEPA 写入新候选立即生效 + root_agent = create_agent() + 
session_service = InMemorySessionService() + runner = Runner(app_name=APP_NAME, agent=root_agent, + session_service=session_service) + # ... 发送 user_content、收集 is_final_response 事件 + return final_text.strip() +``` + +**第二段:`TargetPrompt` —— 优化目标声明**(详见 [§3.3](#33-targetprompt)) + +注册哪些 prompt 字段会被优化器读写。每个字段对应一个本地文件(`add_path`)或一对异步读写回调(`add_callback`,用于远端 KV 等任意后端)。 + +```python +target = ( + TargetPrompt() + .add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + .add_path("skill", str(SKILL_PATH)) +) +``` + +**第三段:`AgentOptimizer.optimize` —— 优化器调用**(完整参数见 [§7.1](#71-agentoptimizeroptimize-参数表)) + +```python +await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(RUNS_DIR / timestamp), + update_source=False, + verbose=1, +) +``` + +| 参数 | 说明 | +|---|---| +| `config_path` | `optimizer.json`,定义 metric / 算法 / 停机条件 | +| `output_dir` | 产物目录;不存在会自动创建,建议用时间戳子目录 | +| `update_source` | `False` 只产出 `best_prompts/`;`True` 优化成功后回写源文件(CI 场景,详见 [§4.6](#46)) | +| `verbose` | `0` 静默 / `1` Rich 进度面板 / `2` 附 gepa 诊断日志 | + +### 2.5 配置文件 `optimizer.json` + +配置分两段:`evaluate`(评测,与评测模块同源)+ `optimize`(优化器专属)。 + +```json +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": {"text": {"match": "contains", "case_insensitive": true}} + } + }, + { + "metric_name": "llm_rubric_response", + "threshold": 0.66, + "criterion": { + "llm_judge": { + "judge_model": {"model_name": "...", "base_url": "...", "api_key": "..."}, + "rubrics": [ + {"id": "numeric_correct", "content": {"text": "答案数值与参考一致"}, "type": "FINAL_RESPONSE_QUALITY"}, + {"id": "reasoning_clear", "content": {"text": "推理步骤清晰"}, "type": "FINAL_RESPONSE_QUALITY"}, + {"id": "units_present", "content": {"text": "答案带正确单位"}, "type": "FINAL_RESPONSE_QUALITY"} + ] + } + } + } + ], + "num_runs": 1 + }, + 
"optimize": { + "eval_case_parallelism": 2, + "stop": {"required_metrics": "all"}, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": {"model_name": "...", "base_url": "...", "api_key": "..."}, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "reflection_minibatch_size": 3, + "skip_perfect_score": false, + "max_metric_calls": 60, + "max_iterations_without_improvement": 8 + } + } +} +``` + +本示例用到的关键概念: + +| 概念 | 在配置中的位置 | 一句话说明 | 详见 | +|---|---|---|---| +| **metric** | `evaluate.metrics[]` | 评测指标列表;多条可叠加,每条独立打分 | [§4.5](#45) | +| **LLM judge** | `criterion.llm_judge` | LLM 裁判,按 rubric 打分;本例为 `llm_rubric_response` 服务 | [§4.5](#45) | +| **stop.required_metrics** | `optimize.stop.required_metrics` | 框架级停机:哪些 metric 必须同时达阈值才停 | [§7.3.5](#735-optimizestop-段) | +| **reflection_lm** | `optimize.algorithm.reflection_lm` | 反思型 LLM,每轮检视失败用例并生成新候选 prompt | [§3.8](#38-reflection-lm) / [§6.5](#65-reflection-lm-选型建议表) | +| **candidate_selection_strategy** | `optimize.algorithm` | 每轮挑哪个候选作为反思 parent | [§7.3.3](#733-optimizealgorithm-段) | +| **module_selector** | `optimize.algorithm` | 多字段优化时每轮选哪个字段改写 | [§4.3](#43) | +| **reflection_minibatch_size** | `optimize.algorithm` | 每轮反思从 train 抽几条 case | [§5](#5-gepa-是怎么工作的) | +| **stopper** | `optimize.algorithm.max_*` / `timeout_seconds` / `score_threshold` | 算法层停机条件,至少需要设置一个 | [§4.7](#47) / [§7.3.3](#733-optimizealgorithm-段) | + +完整字段参考见 [§7.3](#73-optimizerjson-配置项表)。 + + +### 2.6 运行 + +```bash +python examples/optimization/quickstart/run_optimization.py +``` + +终端依序输出:baseline 评估分数 → 每轮反思的接受/拒绝记录 → 收尾摘要。小规模配置下 1~3 分钟完成。 + +![Quickstart 终端输出示例](../assets/imgs/optimization_quickstart.png) + +### 2.7 产物 + +```text +runs/<时间戳>/ +├── result.json # 完整运行记录(OptimizeResult 序列化) +├── summary.txt # 人类可读总览(首先看这个) +├── run.log # 单行状态 +├── config.snapshot.json # 输入配置的快照副本 +├── rounds/round_NNN.json # 每轮 RoundRecord +├── baseline_prompts/<字段>.md # 优化前快照 +└── best_prompts/<字段>.md # 
优化后最佳候选(仅 SUCCEEDED) +``` + +`summary.txt` 关键行: + +```text +Optimization complete | status=SUCCEEDED | algorithm=gepa_reflective +pass_rate : 0.5000 -> 0.8500 (+0.3500, improved) +rounds : 3 accepted / 7 total +duration : 124.31s +stop_reason : required_metrics_passing +update_source : false +``` + +> **什么是 pass_rate?** +> +> pass_rate 衡量的是:**你的 agent 在验证集上"做对了"多少比例的题**。 +> +> --- +> +> **第一步:每个 metric 独立判定达标/未达标** +> +> 每个 metric 有自己的阈值(threshold),分数 ≥ 阈值就达标,否则未达标。 +> +> **第二步:一条 case 的通过规则——所有 metric 都达标才算通过** +> +> 就像考试同时考语文和数学,两科都及格才算"通过",任何一科不及格就是"失败"。 +> +> **第三步:pass_rate = 通过的 case 数 ÷ 总 case 数** +> +> --- +> +> **完整示例**:假设验证集有 4 条 case,配了 3 个 metric: +> +> | | metric_A(阈值 0.8) | metric_B(阈值 0.6) | metric_C(阈值 1.0) | 这条 case 通过了吗? | +> | --- | --- | --- | --- | --- | +> | case_1 | 得分 0.9 ✅ | 得分 0.7 ✅ | 得分 1.0 ✅ | **通过**(3 个都达标) | +> | case_2 | 得分 0.85 ✅ | 得分 0.4 ❌ | 得分 1.0 ✅ | **失败**(metric_B 没达标) | +> | case_3 | 得分 0.6 ❌ | 得分 0.8 ✅ | 得分 0.0 ❌ | **失败**(metric_A、C 没达标) | +> | case_4 | 得分 0.95 ✅ | 得分 0.9 ✅ | 得分 1.0 ✅ | **通过**(3 个都达标) | +> +> 通过 2 条,总共 4 条: +> +> ``` +> pass_rate = 2 / 4 = 0.5 +> ``` +> +> --- +> +> **回到上面的 summary.txt**: +> +> ``` +> pass_rate : 0.5000 -> 0.8500 (+0.3500, improved) +> ``` +> +> 意思是:优化前 agent 只能做对一半的 case,优化后能做对 85%。提升了 35 个百分点。 +> +> **三个相关字段**: +> +> | 字段 | 含义 | +> | --- | --- | +> | `baseline_pass_rate` | 优化前的通过率(用初始 prompt 跑出来的分数) | +> | `best_pass_rate` | 优化过程中找到的最高通过率 | +> | `pass_rate_improvement` | `best - baseline`,本次优化的提升幅度 | + +各字段完整含义见 [§8 产物与目录约定](#8-产物与目录约定)。 + +### 2.8 下一步 + +| 你的下一个问题 | 跳转章节 | +|---|---| +| 上面这些 API 概念到底是什么 | [§3 核心概念](#3-核心概念) | +| 我的 agent 不是这种本地 LlmAgent,怎么接入? 
| [§4 你的场景 → 怎么接入](#4-你的场景--怎么接入) | +| 反思—评估—保留循环每一步具体在做什么 | [§5 GEPA 是怎么工作的](#5-gepa-是怎么工作的) | +| 想估算 LLM 调用成本 / 调整并发参数 | [§6 成本与并发](#6-成本与并发) | +| 想直接查参数 / 配置项 | [§7 完整 API 参考](#7-完整-api-参考) | + + + +## 3 核心概念 + +> 这节用 8 个概念建立 optimization 模块的"心智模型"。每个概念都从"它对应你工作里的什么"切入,而不是从类型签名切入。介绍顺序与 [§2.4 核心代码](#24-核心代码)中三段代码的出现顺序一致。 + +### 3.1 模块整体数据流 + +optimization 模块的工作回路:用户输入 4 类资产,模块在反思—评估—保留循环里产出 2 类结果。 + +```text + +---> 评估候选 + | | + call_agent ---+ | v + | | 反思失败 + optimizer.json ---+ | | + | | v ---> OptimizeResult + +------>| 写盘新候选 (内存返回) + TargetPrompt ---+ | | + + | | v runs/<时间戳>/ + EvalSet x 2 ---+ | 接受新 best? (审计目录) + | 是:保留 / 否:丢弃 + | | + +---------+ +``` + +四类输入的角色: + +| 输入 | 形态 | 在循环中的作用 | +| --- | --- | --- | +| `call_agent` | `async (str) -> str` | 把 query 透给业务 agent;优化器以此采样行为 | +| `optimizer.json` | JSON 配置 | 定义评测指标(`evaluate.metrics`)与算法参数(`optimize.algorithm`) | +| `TargetPrompt` | 多字段 prompt 注册表 | 声明哪些 prompt 文件 / 远端配置位是优化目标 | +| `EvalSet × 2` | 两份 evalset | 训练集供反思 LM 看失败案例,验证集供打分 / 早停判定 | + +两类产出的去向: + +| 产出 | 形态 | 典型用途 | +| --- | --- | --- | +| `OptimizeResult` | `optimize()` 返回的内存对象 | 程序读取(baseline / best / 各 round 明细) | +| `runs/<时间戳>/` | 审计目录 | 人工 review、CI 解析、复跑(详见 [§8](#8-产物与目录约定)) | + +### 3.2 call_agent + +**一句话**:你的业务 agent 的"通用插头"。 + +**为什么需要**:你的 agent 可能是本地 `LlmAgent`、可能是部署好的 HTTP 服务、可能是 `claude` / `codex` 这种黑盒 CLI。模块不可能为每种形态写适配器;你只需要把"给一段 query → 拿到 agent 最终回复"这个动作包成一个 async 函数,模块通过它驱动 agent 跑评测。 + +**怎么用**: + +```python +async def call_agent(query: str) -> str: + # 你的实现:调本地 agent / HTTP 服务 / 子进程 CLI 都行 + # 关键点:每次都重读 prompt 文件(让 GEPA 写入的新候选立即生效) + root_agent = create_agent() + runner = Runner(...) 
+ return await run_and_collect_final_response(runner, query) +``` + +签名固定为 `async (str) -> str`,不能多参数也不能同步。 + +**框架在三个时机调用它**: + +| 时机 | 频率 | +|---|---| +| baseline 评估 | 每条 val case × `num_runs` | +| 每轮反思的 minibatch 评估 | 每条抽样 case 1 次 | +| 每轮候选的验证集评估 | 每条 val case × `num_runs` | + +### 3.3 TargetPrompt + +**一句话**:告诉模块"哪些 prompt 文件是要被优化的",相当于**优化目标的注册表**。 + +**为什么需要**:agent 项目里 prompt 通常分散在多个文件甚至多个后端(system.md / skill.md / 还有放在七彩石的版本);模块需要知道:**反思出新候选时,应该把它写到哪里、读 baseline 时应该从哪里读**。`TargetPrompt` 就是这个"地址簿"。 + +**怎么用**: + +```python +from trpc_agent_sdk.evaluation import TargetPrompt + +target = ( + TargetPrompt() + .add_path("system_prompt", "agent/prompts/system.md") # 文件型 + .add_path("skill", "agent/prompts/skill.md") # 文件型 + .add_callback("rule", # 回调型(远端 KV) + read=load_rule_from_kv, + write=save_rule_to_kv) +) +``` + +每个字段 `name`(如 `"system_prompt"`)在你优化结束后会变成: + +- `result.best_prompts["system_prompt"]` —— 程序读最优 prompt +- `runs/<时间戳>/best_prompts/system_prompt.md` —— 人读最优 prompt +- `RoundRecord.optimized_field_names` 里的元素 —— 看每轮改了哪个字段 + +**两种源**: + +| 源 | 适用 | 框架做什么 | +|---|---|---| +| `add_path(name, path)` | prompt 在本地文件 | 写盘走 tmp + `os.replace` 原子写,多字段失败回滚源文件 | +| `add_callback(name, *, read, write)` | prompt 在远端配置中心 / 数据库 / git 等任意后端 | 调你的 `read` / `write` async 函数,原子性由你保证 | + +完整 API 见 [§7.2](#72-targetprompt-api-表)。 + +### 3.4 AgentOptimizer + +**一句话**:模块的"开机按钮"。 + +**为什么需要**:你不会想自己手写"读配置 → 校验输入 → 跑反思循环 → 落盘 → 拼 result"这一整套流程;`AgentOptimizer` 把这套流程封装成一个调用——你给它**输入**,它返回**结果**。 + +**怎么用**: + +```python +from trpc_agent_sdk.evaluation import AgentOptimizer + +result = await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir="runs/2026-05-19T17-00-00", +) +print(result.best_pass_rate) +``` + +整个模块只有这一个公开入口,**没有别的方式启动优化**。 + +**它做了什么**: + +1. 加载并校验 `optimizer.json`(schema 不对就在跑之前抛错) +2. 
校验 `call_agent` 是 async 函数 / `target_prompt` 至少注册一个字段 / 训练集 ≠ 验证集 +3. 跑反思—评估—保留循环 +4. 把产物落盘到 `output_dir/` +5. 返回一个 `OptimizeResult` 对象 + +`optimize` 共 11 个 keyword-only 参数,常用 6 个见 [§2.4](#24-核心代码),全部参数详见 [§7.1](#71-agentoptimizeroptimize-参数表)。 + +**`update_source` 决策表**(所有 §4.x 场景共享的关键参数):决定优化成功后是否把最优候选**回写**到 `TargetPrompt` 注册的源 prompt 文件—— + +| `update_source` | 优化成功后做什么 | 生效路径 | 适用场景 | +|---|---|---|---| +| `False`(默认) | 只把最优候选写到 `output_dir/best_prompts/` | 你**人工** review → 复制到线上 prompt 文件 → 下一次调用生效 | 灰度上线、需要人工审核、不希望优化器直接动线上文件 | +| `True` | 用最优候选**直接覆盖**源 prompt 文件 | 业务下一次调用**立即**自动用上新 prompt | 自动化闭环(如夜间优化任务,详见 [§4.6 CI 闭环](#46)) | + +无论选哪种,业务侧**零重启、零代码改动**——感知 prompt 变化的方式始终是"下一次调用重读文件"。 + +> `update_source=True` 的安全保证:覆盖采用 tmp + `os.replace` 原子写;如果优化中途异常或 SIGINT 中断,源 prompt 文件**不会被半写**,保持原内容(详见 [§8.3 原子落盘](#83-原子落盘保证))。 + +### 3.5 optimizer.json + +**一句话**:一份配置文件,告诉模块"什么算好"和"怎么搜索"。 + +**为什么需要**:metric 阈值、minibatch 大小、reflection LM 配置、停机条件……这些参数如果散在代码里,每次跑实验都要改代码。集中到一个 JSON 文件后,调参 = 改 JSON,可重现性也更好(产物里会保存一份 `config.snapshot.json`)。 + +**长什么样**:[§2.5](#25-配置文件-optimizerjson) 已经看过完整示例。结构上分两段: + +```text +{ + "evaluate": { ... }, # 与 AgentEvaluator 同 schema:metric 列表 + num_runs + "optimize": { + "eval_case_parallelism": 2, + "stop": { # 框架级停机:哪些 metric 必须达阈值 + "required_metrics": "all" + }, + "algorithm": { # 算法专属:reflection_lm / minibatch / 6 种 stopper + "name": "gepa_reflective", + ... 
+ } + } +} +``` + +**两段的分工**: + +- `evaluate` 段:**完全复用**评测模块的 schema。你给评测项目写过的 metric 配置,可以直接拷过来 +- `optimize` 段:**优化器专属**。其中 `algorithm.name` 是算法选择器,目前唯一可选值是 `"gepa_reflective"`,未来扩展新算法时通过 [§9.2 注册新算法](#92) 增加 + +完整字段表见 [§7.3](#73-optimizerjson-配置项表)。 + +### 3.6 EvalSet / EvalCase + +**一句话**:训练集 + 验证集,格式与评测模块完全相同。 + +**为什么需要分两个文件**: + +- **训练集**:模块每轮从中**随机抽**几条 case(`reflection_minibatch_size`,默认让 gepa 决定)给 reflection LM 看失败案例 → 用来"找改进方向" +- **验证集**:每个新候选生成后,在它上面**全量跑**算分 → 用来"验证候选是否真的更好" + +**为什么必须是不同文件**:训练集决定了 reflection LM 看到什么,验证集决定了候选是否被接受。如果两者重合,就成了"用考题刷题、再用考题判分"——拿到的 best_pass_rate 不可信。框架启动期会比对路径(`os.path.normpath(os.path.abspath(...))`)防御这一点,重合直接抛 `ValueError`。 + +格式与编写指引见 [评测集编写指南](evaluation.md#评测集evalset编写指南)。 + +### 3.7 OptimizeResult + +**一句话**:一次优化跑完后的"全部产出",既是 `optimize()` 的返回值,也是 `runs/<时间戳>/result.json` 的内容。 + +**为什么需要它**:你跑完优化最关心三件事——成功了吗 / 提升多少 / 最优 prompt 是什么。`OptimizeResult` 把它们打包: + +```python +result = await AgentOptimizer.optimize(...) + +# 1. 成功了吗 +if result.status == "SUCCEEDED": + ... + +# 2. 提升多少 +print(f"{result.baseline_pass_rate:.2%} → {result.best_pass_rate:.2%}, " + f"+{result.pass_rate_improvement:.2%}") + +# 3. 
最优 prompt 是什么 +new_system_prompt = result.best_prompts["system_prompt"] +new_skill = result.best_prompts["skill"] +``` + +它还携带过程数据(每轮发生了什么、reflection LM 调用次数、总耗时等)供事后分析。 + +**最常看的 6 个字段**: + +| 字段 | 类型 | 含义 | +|---|---|---| +| `status` | `"SUCCEEDED"` / `"FAILED"` / `"CANCELED"` | 终态 | +| `baseline_pass_rate` / `best_pass_rate` | `float` | 优化前 / 后 pass rate | +| `pass_rate_improvement` | `float` | 二者差值 | +| `best_prompts` | `dict[str, str]` | 字段名 → 最优 prompt 文本 | +| `rounds` | `list[RoundRecord]` | 每轮记录 | +| `stop_reason` | `Literal[...]` 或 `None` | 哪个 stopper 触发的停机 | + +完整 22 字段(含 `RoundRecord`)见 [§7.4](#74-optimizeresult--roundrecord-字段表)。 + +### 3.8 Reflection LM + +**一句话**:模块内部使用的 LLM,每轮接收一组失败案例,输出改进后的 prompt 候选;与你 agent 使用的业务 LM 是两套独立配置。 + +在 `optimizer.json::optimize.algorithm.reflection_lm` 段配置,类型是 `OptimizeModelOptions`: + +```json +"reflection_lm": { + "model_name": "gpt-4o", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-...", + "generation_config": {"temperature": 0.6, "max_tokens": 4096} +} +``` + +模型选型建议见 [§6.5](#65-reflection-lm-选型建议表);完整字段见 [§7.3.3](#733-optimizealgorithm-段)。 + +## 4 你的场景 → 怎么接入 + +| 你的情况 | 章节 | 对应 example | +|---|---|---| +| agent 是线上 HTTP 服务(FastAPI / Gin / 自研接口) | [§4.1](#41) | `http_service` | +| agent 是子进程 / 命令行工具(`claude` / `codex` / 内部 CLI) | [§4.2](#42) | `blackbox_cli` | +| agent 是多 sub-agent 链路(多个 sub-agent 协作完成一次响应),想同时优化每个 sub-agent 的 prompt | [§4.3](#43) | `multi_agent_pipeline` | +| prompt 不在本地文件,存在远端 KV / 配置中心 / 数据库 / Git 等任意后端 | [§4.4](#44) | `remote_prompt_store` | +| 单一评测指标不够用,需要同时跑多个评测指标(如答案准确率 + 幻觉率 + 风格合规率)并融合成总分 | [§4.5](#45) | `multi_metric_with_judges` | +| 想接入 CI 闭环:PR 时跑评测守门、夜间窗口跑优化并自动写回新 prompt | [§4.6](#46) | `ci_integration` | +| 优化任务有硬约束(如必须在凌晨 1 小时窗口完成 / 累计调用不超 N 次 / 连续无提升就停) | [§4.7](#47) | `slo_runtime_control` | +| 已能跑通基础流程,想进一步提升效果(调整 GEPA 候选选择 / Pareto 前沿 / 跨字段融合) | [§4.8](#48) | `advanced_strategies` | +| 其他常见扩展(接 Grafana / WandB 等监控、自定义停机策略、用自己的优化算法) | [§4.9](#49) | (多 example 综合) | 
+ +### 4.1 我的 agent 是 HTTP 服务,怎么接入? {#41} + +**你的处境**:业务 agent 已经作为独立服务上线(FastAPI / Gin / 自研框架均可),希望对它的 prompt 做自动优化——但服务长期运行不能停、服务实现细节对优化器是黑盒、prompt 通常以文件形式注入。 + +**接入模型**:优化器以**纯客户端**身份接入,与服务进程**只有一个耦合点**——磁盘上的 prompt 文件。 + +```text ++-------------------+ HTTP request + query +-------------------+ +| AgentOptimizer | ----------------------------------> | HTTP agent | +| (optimizer) | <---------- response -------------- | (no code change) | ++---------+---------+ +---------+---------+ + | ^ + | write new prompt candidate | 每次请求 + v | 现读 prompt + +--------------------------------------------------------------+ + | prompt files (on disk) | + +--------------------------------------------------------------+ +``` + +服务进程**不需要任何代码改动**,只需要满足一个约定:**每次处理请求前重读 prompt 文件**——这样优化器写入的新候选下一次请求就生效。 + +**接入 3 步**: + +**第 1 步:在 HTTP 服务读取的 prompt 文件上注册 `TargetPrompt`** + +```python +target = TargetPrompt().add_path("system_prompt", "service/prompts/system.md") +``` + +`add_path` 的第二个参数必须是**服务进程实际读取的那个文件路径**(不是任意副本),否则优化器写入的新候选不会被服务感知。 + +**第 2 步:把 `call_agent` 写成一个对服务的 HTTP 客户端** + +```python +async def call_agent(query: str) -> str: + async with httpx.AsyncClient(timeout=120.0) as client: + resp = await client.post("http://my-agent-service/chat", + json={"query": query}) + resp.raise_for_status() + return resp.json()["final_text"] +``` + +按业务实际接口的 payload schema 改 `json=...` 字段;按业务首次推理耗时调 `timeout`(example 默认 120s)。 + +**第 3 步:调 `AgentOptimizer.optimize`** + +```python +await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir=f"runs/{timestamp}", + update_source=False, # 决策表见 [§3.4](#34-agentoptimizer) +) +``` + +**接入前自检表**: + +| 检查项 | 说明 | +|---|---| +| 服务每次请求是否重读 prompt 文件 | 否 → 优化器写入的新候选服务看不到,优化无效。需要在 handler 里加重读逻辑 | +| 优化器进程对 prompt 文件有写权限 | 否 → 优化器无法落盘新候选 | +| 服务对 prompt 文件路径与优化器看到的是否一致 | 容器化部署时尤其要确认(mount 路径 / 软链) | +| 
服务 5xx 行为 | 服务内部不要静默 retry——会掩盖真实失败率,让优化器看到假"高分" | + +**→ 完整 example**:[`examples/optimization/http_service/`](../../../examples/optimization/http_service/) +- `service/server.py` — 演示 prompt 热加载的 FastAPI 服务(`/chat` 每次重建 agent 重读 `system.md`),可作为业务服务改造的参考 +- `run_optimization.py` — 客户端优化器入口,含启动前服务健康检查(fail-fast) + +### 4.2 我的 agent 是外部命令行工具(CLI),优化器拿不到它的代码 {#42} + +**你的处境**:业务 agent 是个外部可执行程序——`claude` / `codex` / 自研 CLI 等。它的源代码、内部用的 LLM client、运行时语言对优化器**完全黑盒**,但它启动时会从某个工作目录读若干 prompt 文件(典型如 `CLAUDE.md` + `.claude/skills//SKILL.md`)。你希望在不改 CLI 代码、不绑定它内部任何依赖的前提下优化这些 prompt 文件。 + +**接入模型**:优化器通过**子进程**调用 CLI,与 CLI 之间**唯一耦合点**还是磁盘上的 prompt 文件——这一点和 §4.1 的 HTTP 服务结构相同,差别只是把"HTTP 请求"换成"启动一个子进程"。 + +```text ++-------------------+ subprocess + query +-------------------+ +| AgentOptimizer | ------------------------------> | External CLI | +| (optimizer) | <-------- stdout text --------- | (no code change) | ++---------+---------+ +---------+---------+ + | ^ + | write new prompt candidate | 每次启动 + v | 自动读取 + +----------------------------------------------------------+ + | prompt files (on disk) | + +----------------------------------------------------------+ +``` + +CLI 二进制本身**不需要任何改动**,只需满足:**每次启动会从指定目录加载 prompt 文件**(绝大多数 CLI 工具都是这样设计的)。 + +**接入 3 步**: + +**第 1 步:在 CLI 读取的 prompt 文件上注册 `TargetPrompt`(多文件用多次 `add_path`)** + +```python +target = ( + TargetPrompt() + .add_path("claude_md", "workspace/CLAUDE.md") + .add_path("skill_md", "workspace/.claude/skills/city-info/SKILL.md") +) +``` + +每个 `add_path` 注册一个独立字段,GEPA 把每个字段视为一个独立可优化模块,可单独/联合优化(详见 §3.7、§4.3)。 + +**第 2 步:把 subprocess 调用 + stdout 规范化包成 `call_agent`** + +```python +async def call_agent(query: str) -> str: + proc = await asyncio.create_subprocess_exec( + "trpc-claudecode", "--print", + "--add-dir", str(WORKSPACE_DIR), # CLI 从这里加载 prompt 文件 + "--dangerously-skip-permissions", + query, # query 作 argv 直传,避免 shell 转义 + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + env=_build_cli_env(), # 业务自有 
CLI 期望的环境变量 + ) + stdout_b, stderr_b = await asyncio.wait_for( + proc.communicate(), timeout=90.0, # 防止单次 CLI 卡死 + ) + if proc.returncode != 0: + raise RuntimeError(f"CLI exited {proc.returncode}: {stderr_b[:400]!r}") + return _normalize_response(stdout_b.decode("utf-8", "replace")) +``` + +`call_agent` 仍然是 §3.1 那个标准签名 `async (query: str) -> str`,对优化器主循环来说,这一份 `call_agent` 和"调本地 LLM"是无差别的。`_build_cli_env` / `_normalize_response` 是业务按自己 CLI 的特性自己实现的辅助函数(前者把环境变量改写/补齐成 CLI 期望的形态、后者把 CLI stdout 规整成评测可比的稳定字符串)——本框架不规定它们的形态,按需实现即可。 + +**第 3 步:跑一次确认 baseline 通畅,再交给 GEPA 反思优化** + +```python +await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir="runs//", + update_source=False, +) +``` + +**接入前自检表**: + +| 检查项 | 不通过的后果 | +| --- | --- | +| CLI 是否每次启动都重读 prompt 文件 | 否 → 优化器写入的新候选不会生效;候选间评估等同于跑同一份 baseline | +| CLI 是否支持把 query 通过 argv / stdin / `--query xxx` 传入 | 否 → 接入不可行(需要先给 CLI 加这个入口) | +| CLI 平均单次耗时是否已知 | 否 → 无法合理设置 `CLI_TIMEOUT_SEC` 与 `max_metric_calls` | +| CLI 进程是否会污染共享磁盘状态(除 prompt 文件外) | 是 → 评测不可重复;需要 `eval_case_parallelism=1` 或为每个 case 起独立 workspace | + +**→ 完整 example**:[`examples/optimization/blackbox_cli/`](../../../examples/optimization/blackbox_cli/) +- `agent/call_agent.py` — subprocess 调用 + 环境变量适配 + stdout 规范化的工程实现,可作为接入自有 CLI 的改造起点 +- `run_optimization.py` — 双字段(`CLAUDE.md` + `SKILL.md`)`TargetPrompt` 的标准入口 + +### 4.3 我的 agent 是多 sub-agent 链路,想同时优化每个 sub-agent 的 prompt {#43} + +**你的处境**:业务侧已经编排好多 sub-agent 协作链路。每个 sub-agent 有自己的 system prompt,字段间还存在隐式契约(上游 sub-agent 的输出形态必须匹配下游期望)。手工迭代时常见症状是**"改 A 见效,但拖累 B"**。你希望对所有 sub-agent 的 prompt **联合优化**,让端到端指标上分。 + +**接入模型**:把每个 sub-agent 的 prompt 注册成 `TargetPrompt` 的一个**独立字段**——GEPA 把每个字段视为一个独立可优化模块(component),每轮按 `module_selector` 选 1 个或多个字段写回,优化器只看端到端 metric 分数作为反馈。链路代码**完全零修改**,每个 sub-agent 在每次被调用时重读自己的 prompt 文件即可。 + +```text 
++-----------------------------+ round-robin fields +---------------------+ +| AgentOptimizer | ---------------------> | prompt files | +| (multi-field TargetPrompt) | write new candidate | (one per agent) | +| | | | ++--------------+--------------+ +----------+----------+ + ^ | + | end-to-end metric score | 每次调用 + | | 现读 prompt + | v + | +-----------------------------------------+ + +------------- | call_agent(query) | + | = multi sub-agent pipeline entry | + | (sub-agent A -> sub-agent B -> ...) | + +-----------------------------------------+ +``` + +**接入 3 步**: + +**第 1 步:把每个 sub-agent 的 prompt 文件注册为独立字段** + +```python +target = ( + TargetPrompt() + .add_path("agent_a", ".md") + .add_path("agent_b", ".md") + # ... 一个 sub-agent 一个 add_path +) +``` + +key 是该字段在反思 prompt / 产物文件名中的标识,业务可读即可。 + +**第 2 步:把整条链路调用包成 `call_agent`,并保证 sub-agent 每次现读 prompt** + +```python +async def call_agent(query: str) -> str: + return await invoke_pipeline(query) # 你已有的链路入口 +``` + +`invoke_pipeline` 内部的关键约束:**每个 sub-agent 在每次被调用时必须重读自己的 prompt 文件**,否则优化器写入的新候选不会生效。 + +**第 3 步:在 `optimizer.json` 打开多字段相关的开关** + +```jsonc +{ + "optimize": { + "algorithm": { + "module_selector": "round_robin", // 每轮选 1 个字段轮换改写,便于归因 + "use_merge": true, // 累积若干单字段改进后主动融合 + "max_merge_invocations": 3, + "reflection_history_top_k": 3 // 多字段轮换时建议调大(默认 2) + } + } +} +``` + +各参数完整语义与取值对照见 [§7 完整 API 参考](#7-完整-api-参考)。 + +**接入前自检表**: + +| 检查项 | 不通过的后果 | +| --- | --- | +| 每个 sub-agent 是否每次被调用都重读自己的 prompt 文件 | 否 → 优化器写入的新候选不会生效;候选间评估等同于跑同一份 baseline | +| 端到端 metric 是否能反映各字段联合质量 | 否 → 反思 LM 拿到的反馈信号不真实;建议用 `final_response_avg_score` 评最终答复 | +| 单 case 经过几次 LLM 推理 | 调用量按链路深度倍增,需相应调小 `eval_case_parallelism` / `reflection_minibatch_size` 防 rate limit | +| sub-agent 是否需要在同一进程 | 不必——`call_agent` 内部可以是 HTTP / gRPC / 内部 SDK / 其他编排框架;只要最终返回 `str` 即可 | + +**→ 完整 example**:[`examples/optimization/multi_agent_pipeline/`](../../../examples/optimization/multi_agent_pipeline/) +- `pipeline/orchestrator.py` — 多 sub-agent 
链路实现,sub-agent 在每次调用时重读 prompt +- `run_optimization.py` — 多字段 `TargetPrompt` 的标准入口 +- `optimizer.json` — 多字段场景的推荐配置 + +### 4.4 我的 prompt 不在本地文件,存在远端配置中心 / KV / 数据库 {#44} + +**你的处境**:业务 prompt 不在本地文件,而是放在远端配置中心(七彩石 / Apollo / Nacos / 自研 KV / 数据库 / Git 等),业务从中心拉取使用。优化器无法直接走文件系统——只能通过业务自有 SDK 与远端交互。 + +**接入模型**:`TargetPrompt` 把"prompt 在哪里"抽象成一对 async 函数 `read` / `write`——优化器调 `read` 拿 baseline 快照、调 `write` 落候选,远端后端形态(KV / RPC / SQL / Git API ...)对优化器**完全黑盒**。这与 §4.1 / §4.2 通过本地 prompt 文件耦合的结构同构,差别只是把"读写文件"换成"调用业务给的两个 async 函数"。 + +```text ++-------------------+ async read / write +---------------------+ +| AgentOptimizer | <--------------------------------> | Remote Config | +| (optimizer) | (your SDK / HTTP / RPC) | (KV / DB / Git ...)| ++---------+---------+ +---------+-----------+ + ^ | + | best_prompts/ saved locally | 业务每次调用 + | | 现拉配置 + v v + +-------------------+ +---------------------------+ + | output_dir/ | | inside call_agent | + | best_prompts/ | | pull latest prompt & run | + +-------------------+ +---------------------------+ +``` + +**接入 3 步**: + +**第 1 步:实现一对操作远端 prompt 的 async 函数** + +```python +async def read_prompt() -> str: + return await your_config_sdk.get(key="system_prompt") + +async def write_prompt(value: str) -> None: + await your_config_sdk.put(key="system_prompt", value=value) +``` + +签名约束:`read: async () -> str`、`write: async (str) -> None`。重试 / 幂等性 / 鉴权由业务自有 SDK 保证。 + +**第 2 步:用 `add_callback` 而非 `add_path` 注册 `TargetPrompt`** + +```python +target = TargetPrompt().add_callback( + "system_prompt", + read=read_prompt, + write=write_prompt, +) +``` + +`add_callback` 与 `add_path` 在 `TargetPrompt` 上对等并存——多字段也可以混用(部分字段在本地文件、部分字段在远端配置中心)。 + +**第 3 步:把 `call_agent` 写成"现拉现用",照常调 `optimize`** + +```python +async def call_agent(query: str) -> str: + prompt_text = await read_prompt() # 现拉,保证候选写入立即生效 + agent = create_agent(prompt_text) + return await runner.run_async(query, ...) 
+ +await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir="runs//", + update_source=False, # 决策表见 §3.4 +) +``` + +`update_source` 取值由业务侧 prompt 写回策略决定(详见 §3.4 决策表),框架对它没有额外限制。 + +**接入前自检表**: + +| 检查项 | 不通过的后果 | +| --- | --- | +| 业务侧每次调用是否重新拉配置 | 否 → 优化器写入新候选后业务感知不到,反思循环失效 | +| `read` / `write` 是否都是 async 函数 | 否 → `add_callback` 注册时即报错 | +| `write` 是否幂等(接受重复写同一 value) | 否 → 收尾自动回滚到 baseline 时可能失败,遗留远端被污染 | +| 优化器进程是否对该 key / namespace 有写权限 | 否 → `write` 抛权限错误,当前候选评估失败 | + +> **涉及生产 prompt 的安全模式**(按需采用,非框架强制):业务侧若已有 sandbox / production namespace 隔离,可让优化器只读写 sandbox key,配合 `update_source=False` 让优化器收尾自动回滚 sandbox,最佳候选仅落本地 `best_prompts/`,再由业务自有审批流同步到 production。`examples/optimization/remote_prompt_store/` 演示的就是这种工作流。 + +**→ 完整 example**:[`examples/optimization/remote_prompt_store/`](../../../examples/optimization/remote_prompt_store/) +- `store/prompt_client.py` — `read` / `write` async 函数定义,是接入业务配置中心 SDK 的核心改造点 +- `run_optimization.py` — `add_callback` 注册的标准入口(演示采用 sandbox + `update_source=False` + 人工审批的安全工作流) + +### 4.5 单一评测指标不够用,需要多个指标并融合成总分 {#45} + +**你的处境**:业务上线对 agent 输出的要求往往不止一个维度——答案得对(正确性硬约束)+ 不能乱说(幻觉率)+ 风格符合规范(格式 / 语气)+ 不带敏感词(合规)……单一 metric 装不下,强行用单个综合 metric 的话,反思 LM 看到的反馈信号是混合后的标量,很难定向归因。 + +**接入模型**:`optimizer.json` 的 `evaluate.metrics` 是**列表**——直接列多条 metric,每条独立打分、独立 threshold、独立配置。早停判定通过 `optimize.stop.required_metrics` 声明哪些 metric 必须达标;GEPA 内部通过 `optimize.algorithm.frontier_type` 决定如何在多 metric 间维护 Pareto 前沿避免"改 A 拖累 B"。整个机制纯配置驱动——`call_agent` 与 `TargetPrompt` 都不需要为多 metric 改一行代码。 + +**配置 3 步**: + +**第 1 步:在 `evaluate.metrics` 列出所有 metric** + +```jsonc +{ + "evaluate": { + "num_runs": 2, // 平滑 LLM 输出方差(>1 让每条 case 跑多次取均值) + "metrics": [ + { + "metric_name": "llm_final_response", // 硬约束:答案是否与 reference 实质等价 + "threshold": 1.0, + "criterion": { "...": "..." 
} // 完整字段见 §7 / example + }, + { + "metric_name": "llm_rubric_response", // 软约束:多 rubric(格式 / 风格 / 单位 ...) + "threshold": 0.75, + "criterion": { "...": "..." } + } + ] + } +} +``` + +每条 metric 独立打分独立写入 `result.json` 的 `metric_breakdown`,便于反向归因某次评测在哪条 metric 上掉分。 + +**第 2 步:在 `optimize.stop.required_metrics` 声明早停门禁** + +| 取值 | 语义 | 适用场景 | +| --- | --- | --- | +| `"all"` | 所有 metric 都达 threshold 才早停 | 所有 metric 都是必须达标项 | +| `["m1", "m2"]` | 列表中所有 metric 达 threshold 才早停(其他 metric 仍参与评测但不影响早停) | 部分 metric 是参考观测项、不作为门禁 | +| `null` 或 `[]` | 不参与早停,仅靠算法层 budget / no-improvement / score_threshold 控制 | 只想跑满预算看结果 | + +**第 3 步:把 `frontier_type` 调到能正确处理多 metric 的取值** + +| 取值 | 含义 | 适用 | +| --- | --- | --- | +| `instance` | 每个 case 维护一个 best 候选 | 单 metric 或 metric 间无明显冲突 | +| `objective` | 每个 metric 维护一个 best 候选 | 多 metric 但 case 量较小 | +| `hybrid` | 同时维护 case + metric 双层前沿 | **多 metric 真冲突场景**(推荐默认) | +| `cartesian` | 每个 (case, metric) 组合一个 best | 极复杂 / 调试用,候选池容易爆炸 | + +`hybrid` 让 GEPA 在改进一个 metric 时不丢失另一个 metric 上的最佳候选——**多 metric 业务的安全默认**。各取值完整定义见 [§7](#7-完整-api-参考)。 + +**接入前自检表**: + +| 检查项 | 不通过的后果 | +| --- | --- | +| 各 metric 的 `threshold` 是否符合业务诉求 | 否 → 早停判定不准;优化结束时业务关键指标可能未达标 | +| 是否只有"硬约束"被列入 `stop.required_metrics` | 否 → 软约束波动会反复打断早停判定,浪费预算 | +| `eval_case_parallelism` 是否考虑了 metric 数 × judge 数的并发量 | 否 → 单轮 LLM 调用量爆炸(N case × M metric × K judge × `num_runs`),容易撞 LLM 后端 rate limit | +| `num_runs` 是否合理(默认 1) | 单 LLM judge 输出存在方差;建议 `num_runs=2` 让每条 case 跑两次取均值消除抖动 | + +**→ 完整 example**:[`examples/optimization/multi_metric_with_judges/`](../../../examples/optimization/multi_metric_with_judges/) +- `optimizer.json` — `llm_final_response`(多 judge `all_pass` 投票)+ `llm_rubric_response`(单 judge 多 rubric)+ `frontier_type=hybrid` + `stop.required_metrics` 列表式的完整配置范例 +- `run_optimization.py` — 与单 metric 场景一致的标准入口(多 metric 不影响入口代码) + +### 4.6 想接入 CI 闭环:PR 守门 + 夜间优化自动写回 {#46} + +**你的处境**:你希望 prompt 工程也走 CI/CD 流程——每次 PR 自动跑评测守门(分数低于阈值即 CI 红灯,阻止劣化 prompt 进主干),同时在低峰窗口自动跑反思优化把更优 prompt 
写回源文件,下一次 PR 自动用上。**单独使用任一链路都不够**:纯守门不会让 prompt 自动变好,纯优化没有质量门禁。 + +**接入模型**:`AgentEvaluator.evaluate`(pytest 跑 PR 守门)与 `AgentOptimizer.optimize`(夜间优化)共享**同一份资产**——同一个 `call_agent`、同一份 evalset(物理上拆 train / val 两文件防泄漏,逻辑上一套语料)、同一对 prompt 文件。`update_source=True` 是闭环的关键开关:优化成功(`OptimizeResult.status=SUCCEEDED`)后最优候选直接覆盖源 prompt 文件,下一次 PR 触发的 pytest 自动读取新内容。 + +```text + +-----------------------------------------------------+ + | Shared: call_agent + evalset + prompt files | + +------+----------------------------------------+-----+ + | | + Trigger: PR | | Trigger: Night + v v + +---------------------------+ +---------------------------+ + | AgentEvaluator.evaluate | | AgentOptimizer.optimize | + | (pytest) | | update_source=True | + | | | | + | Score < threshold -> Red | | OK -> overwrite prompt | + | pytest exit != 0 -> Block| | Fail -> keep unchanged | + +---------------------------+ +-------------+-------------+ + | + v + 下一次 PR 自动用新 prompt + (形成 eval->optimize->eval 闭环) +``` + +**接入 3 步**: + +**第 1 步:把 `call_agent` 抽到 evaluate / optimize 共享的模块里** + +```python +# agent/agent.py(pytest 与 optimizer 都从这里 import) +async def call_agent(query: str) -> str: + ... +``` + +**为什么必须共享**:评测时使用的 agent 和优化时使用的 agent 必须**等价**——否则会出现"优化器找到了 evaluator 验证不了的好 prompt"或反向问题。共享同一个 `call_agent` 文件是最直接的代码级保证。任何 agent 改动(模型切换 / temperature 调整 / output schema 变化)只需改一处。 + +**第 2 步:写 PR 守门的 pytest 入口** + +```python +# tests/test_agent_quality.py +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator +from agent.agent import call_agent + +@pytest.mark.asyncio +async def test_agent_quality(): + await AgentEvaluator.evaluate( + call_agent=call_agent, + eval_set_path="data/val.evalset.json", + test_config_path="optimizer.json", # 复用同一份 metric 配置 + ... 
+ ) # 分数低于 threshold 时框架抛 AssertionError → pytest 红 +``` + +CI 流水线里跑: + +```bash +pytest tests/ --junitxml=runs/pytest_report.xml +``` + +`--junitxml` 输出标准格式的测试报告,GitHub Actions / 蓝盾流水线 / Tencent CI 等主流平台均原生解析。失败时 `AssertionError` 消息里包含每条 case 的失败明细 JSON,CI 平台展示 stack trace 时可直接看到具体哪条 case 失败、agent 实际输出是什么、与 expected 的差异在哪。 + +**第 3 步:夜间窗口跑优化 + `update_source=True`** + +```python +# run_optimization.py(夜间 cron 触发) +await AgentOptimizer.optimize( + config_path="optimizer.json", # 与 pytest 共用 metric 配置 + call_agent=call_agent, # 与 pytest 共用 call_agent + target_prompt=target, + train_dataset_path="data/train.evalset.json", + validation_dataset_path="data/val.evalset.json", + output_dir="runs/optimize_/", + update_source=True, # CI 闭环的关键开关 +) +``` + +`update_source=True` 的安全保证:仅 `OptimizeResult.status=SUCCEEDED` 时才会写回;失败 / 预算耗尽等其他状态下源文件保持不变。覆盖采用原子写(tmp + `os.replace`),中途异常 / SIGINT 不会损坏源 prompt 文件(详见 [§8.3](#83-原子落盘保证))。 + +夜间脚本末尾建议加 `git diff --quiet agent/prompts/` 判断是否有改动,无改动直接退出;有改动则 `git checkout -b ...` + 自动开 PR——让新 prompt 走标准 PR review 流程而不是直接进主干。 + +**接入前自检表**: + +| 检查项 | 不通过的后果 | +| --- | --- | +| `call_agent` 是否被 pytest 与 optimizer **共用同一份代码** | 否 → 评测与优化的 agent 不等价;优化方向与守门方向漂移 | +| pytest 与 optimizer 是否使用**同一份 metric 配置** | 否 → "评测能过但优化器看到的分数低"或反向问题。建议 `optimizer.json.evaluate` 段在 pytest 里通过 `test_config_path` 复用 | +| evalset 是否物理拆为 train / val 两文件 | 否 → SDK `_validate_inputs` 强制校验 `train != val`,否则报错 fail-fast | +| 夜间脚本结束时是否有 `git diff` + 自动开 PR 步骤 | 否 → 优化的 prompt 直接进主干,绕过 review;建议永远走 PR 流程 | +| 是否准备好 prompt 改动的灰度策略 | 多业务线共享同一份 prompt 仓库时,建议改用 `update_source=False` + 业务自有灰度发布工具 | + +**→ 完整 example**:[`examples/optimization/ci_integration/`](../../../examples/optimization/ci_integration/) +- `agent/agent.py` — pytest 与 optimizer 共享的 `call_agent` +- `tests/test_agent_quality.py` — pytest 守门入口(PR 阶段调用) +- `run_optimization.py` — 夜间优化入口(`update_source=True`) +- `ci/run_pr_check.sh` / `ci/run_nightly_optimize.sh` — CI 流水线 shell 入口 + +### 4.7 
优化任务有硬约束:必须在某时间窗内完成 / 累计调用不超 N 次 / 连续无提升就停 {#47} + +**你的处境**:你的优化任务跑在受约束的环境里——CI 流水线必须 N 分钟内结束、LLM 后端配额按月计算单次不能跑爆、连续若干轮没改善应主动放弃别浪费预算。**单个停止条件不够**:只设 timeout 可能预算还没用完就停、只设预算可能跑到天荒地老。你需要"任意一个 SLO 触发就立刻停"的多重停止策略。 + +**接入模型**:`optimizer.json` 的 `optimize.algorithm` 段提供 6 种 algorithm-level stop conditions,**OR 语义**——任意一条触发即停止。你按业务 SLO 反推每条阈值,多个开关同时启用即可。优化结束时 `OptimizeResult.stop_reason` 字段告诉你哪条 SLO 抢闸,便于后续调参。 + +**配置 3 步**: + +**第 1 步:从 6 种 stop condition 中选出业务关心的几条** + +| 字段 | 抢闸条件 | 典型业务场景 | +| --- | --- | --- | +| `timeout_seconds` | wall-clock 超过 N 秒 | CI 流水线时间窗硬约束(必须 N 分钟内结束) | +| `max_metric_calls` | 累计 case 评估次数 ≥ N | LLM 后端配额硬上限 | +| `max_candidate_proposals` | reflection LM 累计提议次数 ≥ N | 限制反思 LM 调用预算 | +| `max_iterations_without_improvement` | 连续 N 轮 best valset 无提升 | 已收敛或陷入局部最优时主动放弃 | +| `score_threshold` | best valset pass_rate ≥ 阈值 | 已达业务目标,无需继续 | +| `max_tracked_candidates` | Pareto 前沿候选池大小 ≥ N | 控制内存与 merge 候选空间规模 | + +各字段完整定义见 [§7.3.3](#733-optimizealgorithm-段)。**至少配 1 个**——否则框架启动期 fail-fast。 + +**第 2 步:按业务 SLO 反推每条阈值** + +```jsonc +{ + "optimize": { + "algorithm": { + "timeout_seconds": 90.0, // CI 必须 X 分钟内结束 → 设 X*60 / 2 留缓冲 + "max_metric_calls": 30, // LLM 配额 → 按"调用次数 × 单次耗时"反算 + "max_iterations_without_improvement": 3, // 连续 3 轮无提升即放弃 + "score_threshold": 1.0 // 达到业务目标即停 + } + } +} +``` + +**两个反推关键**: + +| 项 | 怎么测 | 怎么反推 | +| --- | --- | --- | +| 单轮典型耗时 | 测一次基准跑,看 `runs//result.json` 中 round 的 wall-clock 时间 | `timeout_seconds` 应至少为单轮耗时 × 2,否则第 1 轮就抢闸看不到优化进展 | +| 单轮 metric_calls 数 | 同上,看 round 的 `metric_calls_in_round` | `max_metric_calls` 应至少能跑过 `max_iterations_without_improvement` 轮,否则永远是 budget 先抢闸 | + +**第 3 步:明确是否参与 framework-level metric 早停** + +| 取值 | 语义 | +| --- | --- | +| `optimize.stop.required_metrics: "all"` 或 `["m1"]` | metric 达 threshold 也参与 OR 抢闸 | +| `optimize.stop.required_metrics: []` | 只让 6 个 algorithm 级 stopper 决定 | + +业务诉求: +- **关心 metric 是否达标**(典型的 prompt 质量优化)→ 用 `"all"` 或具体列表 +- **只关心时间 / 调用预算**(已知必收敛、纯卡资源) → 用 `[]` + 
**`stop_reason` 取值参考**:优化结束时 `OptimizeResult.stop_reason` 值能告诉你抢闸者——`required_metrics_passing` / `score_threshold` / `budget_exhausted` / `timeout` / `no_improvement` / `max_candidate_proposals` / `max_tracked_candidates` / `user_requested_stop`(用户通过 `optimize.stop` 哨兵文件主动触发)/ `completed`(没有 stopper 触发、自然跑完);完整列表与优先级见 §5.4。
"optimize": { + "algorithm": { + "seed": 42, + "max_metric_calls": 30, + "candidate_selection_strategy": "pareto", + "frontier_type": "objective", // 改:从 instance 切到 objective + "skip_perfect_score": true, // 改:跳过满分 case 节省反思调用 + "use_merge": true // 改:启用跨字段融合(仅多字段时实际生效) + } + } +} +``` + +**第 3 步:跑两次 + 解析 `result.json` 输出多维度对比** + +```bash +python run_baseline.py # 产出 runs/baseline_/result.json +python run_advanced.py # 产出 runs/advanced_/result.json +python compare.py # 解析两份 result.json,输出对比表 +``` + +`compare.py` 应关注的维度: + +| 维度 | 字段(`result.json` 中按 camelCase 索引) | 解读 | +| --- | --- | --- | +| 最终质量 | `bestPassRate` / `baselinePassRate` | 端到端提分;多数任务上两套策略收敛接近 | +| 探索深度 | `totalRounds` / `roundsAccepted` | 接受率(`roundsAccepted / totalRounds`)反映 frontier 接受门槛 | +| merge 行为 | `mergeRoundsTotal` / `rounds[*].kind` | 验证 `use_merge=true` 是否真的触发 merge | +| 反思预算 | `metricCallsTotal` / `proposalsTotal` | `skip_perfect_score=true` 在大训练集 + 高基线起点时节省更明显 | +| `stop_reason` | `stopReason` | 哪条 stopper 抢闸;两套 advanced/baseline 的 stop_reason 不同时不可直接对比 | + +> **踩坑提醒**:`result.json` 中字段是 camelCase(`bestPassRate` 而非 `best_pass_rate`)。SDK 内部用 snake_case,序列化时通过 pydantic alias 自动转 camelCase。读 `result.json` 时按 camelCase 索引。 + +**几个高阶开关的预期表现**(业务任务上未必都成立——以你自己的实测为准): + +| 开关 | 期望收益 | 适用前提 | +| --- | --- | --- | +| `frontier_type="objective"`(vs `"instance"`) | 接受率更高 / 探索更激进 | 多 metric 场景;小训练集(< 10 case)下可能过拟合 train minibatch 导致 valset 震荡 | +| `frontier_type="hybrid"` | 多 metric 间不互相覆盖 | 多 metric 真冲突场景(参见 §4.5) | +| `skip_perfect_score=true` | 节省 reflection LM 调用 | 大规模训练集 + 高 baseline 起点;小数据集下满分 case 极少,节省有限 | +| `use_merge=true` | 跨字段融合候选 | **仅多字段(`add_path` ≥ 2)才会真实触发**;单字段配置永远 0 merge round(`mergeRoundsTotal=0` 是预期,参见 §4.3) | + +**接入前自检表**: + +| 检查项 | 不通过的后果 | +| --- | --- | +| 两套配置是否仅差**要验证的几个开关**、其余全部相同 | 否 → 对比结果含混杂变量,结论不可信 | +| `seed` 是否两套一致 | 否 → 差异可能来自随机性而非配置策略 | +| `max_metric_calls` 是否两套一致 | 否 → 一套有更多预算自然分数更高,不能归因到策略 | +| 是否同时关注**多维度对比**而非单一 `bestPassRate` | 否 → 
多数任务两套最终分数接近,看不出差异;差异藏在到达路径 | +| `use_merge` / `skip_perfect_score` 等开关是否在你的任务结构下有意义 | 单字段任务开 `use_merge` 永远 0 触发(无害但无收益);高基线任务开 `skip_perfect_score` 节省可观 | + +> 高阶配置**不是越复杂越好**。许多任务上 baseline 配置已能达到合理收敛,advanced 只在特定任务结构(多目标、多字段、大规模训练集等)下显示价值。**用数据决定,不用直觉**。 + +**→ 完整 example**:[`examples/optimization/advanced_strategies/`](../../../examples/optimization/advanced_strategies/) +- `optimizer_baseline.json` / `optimizer_advanced.json` — A/B 对照的两套配置(仅差 3 个开关) +- `run_baseline.py` / `run_advanced.py` — 两个独立入口(保持其余变量一致) +- `compare.py` — 解析两次 `result.json` 输出多维度对比表的标准模板 + + +## 5 GEPA 是怎么工作的 + +跑了一次优化、看着分数从 0.4 涨到 0.85,但你不知道**这一路框架到底干了什么**——它读了哪些数据?反思 LM 看到了什么?凭什么决定保留还是丢弃一个候选?SLO 触发时是立刻停还是等当前轮跑完? + +> **GEPA** = Genetic-Evolutionary Pareto,是一个基于**反思**(reflection)的进化搜索算法([gepa-ai/gepa](https://github.com/gepa-ai/gepa),MIT License)。本框架通过 `OPTIMIZER_REGISTRY` 把 `gepa.optimize()` 包成 `GepaReflectiveOptimizer` 接入,并补一层 SDK 适配(评估桥接、反思反馈构造、停机判定、原子落盘等)。 + +### 5.1 一轮优化里到底跑了什么 + +**先记住三个角色**——后面所有图和表都围绕这三个: + +| 角色 | 是谁 | 干什么 | +| --- | --- | --- | +| **agent** | 你的业务 agent(通过 `call_agent` 接入) | 接一条 query 输出一条答复 | +| **judge / metric** | `evaluate.metrics` 配置的评测器 | 给 agent 答复打分(0~1) | +| **反思 LM** | `algorithm.reflection_lm` 配置的 LLM | 看失败 case 反馈 → 生成新的 prompt 候选 | + +**第 0 轮**:用 baseline prompt 跑 valset → 得到 baseline 分数(你的"起点线") + +**之后每一轮(reflective round)**按这 5 步走: + +```text + +----------------------------+ + | Previous round's prompt | + +--------------+-------------+ + | + v + (1) 抽 minibatch -> 从 trainset 随机抽 N 条 case + (N = reflection_minibatch_size) + | + v + (2) 跑一次评估 -> 把候选写到 prompt 文件 + -> 调 call_agent 跑这 N 条 case + -> metric 打分,得到失败案例 + | + v + (3) 反思 LM 生成新候选 -> 把失败 case 反馈喂给反思 LM + -> 它输出新的 prompt 文本 + | + v + (4) 重评 + 入 Pareto 前沿 -> 新候选在 minibatch 上重跑一次 + -> 比历史候选好就入前沿,否则丢弃 + | + v + (5) 检查停机条件 -> 6 个 stopper 任一触发 -> 停 + -> 否则进入下一轮 +``` + +**几条关键说明**: + +- **第 (2) 步的"评估"** 实际跑了 `len(minibatch) × num_runs × len(metrics)` 次 LLM 评估(详见 §6.1) +- **第 (3) 步的"反思 LM 看到什么"** 
决定改写质量——这是下一节 §5.2 的内容 +- **第 (4) 步的"Pareto 前沿"** 简单说就是"保留各方面都不被超越的候选集";具体粒度由 `frontier_type` 控制(详见 §5.3) +- **第 (5) 步的"任一触发即停"** 有个细节:触发后**等当前轮跑完才真正停**,不是立即 kill(详见 §5.4) +- **valset 评估**穿插在中间几轮里发生(gepa 内部决定何时跑),用于计算"当前最优候选在 valset 上的真实分数",也是 `score_threshold` / `required_metrics` 等 stopper 的判断依据 + +**特殊情况:merge round** + +`use_merge=true` 时,每隔若干 reflective round 会插入一轮 **merge round**:从 Pareto 前沿挑两个候选融合成一个新候选("取 A 在字段 X 上的写法 + B 在字段 Y 上的写法")。**仅在多字段场景下有意义**——单字段时永远不触发,`mergeRoundsTotal=0` 是预期。详见 §4.3。 + +### 5.2 反思 LM 实际看到什么 + +反思 LM 改写 prompt 的质量,**完全取决于它能看到多丰富的失败反馈**。如果只告诉它"case_3 失败了,分数 0.3",它只能瞎猜;如果告诉它"case_3 第 2 turn 时 agent 应输出 `{"city":"上海"}` 但实际输出 `Shanghai`,规则要求 case-sensitive 精确匹配",它就能针对性改 prompt。 + +`_AgentGEPAAdapter.make_reflective_dataset` 为每条**失败的 case** 渲染一份 markdown 记录,喂给反思 LM。每条记录字段: + +| 字段 | 一句话解释 | 何时出现 | +| --- | --- | --- | +| `case_id` | case 的稳定 ID(用于反思 LM 跨条引用) | 总是 | +| `score` | 这条 case 的聚合分数(0~1,1.0 = 全 metric 通过) | 总是 | +| `Case Body` | 失败现场的 markdown:每个 turn 一段,里面有用户输入、期望答复、agent 实际答复、tool 调用轨迹、每条 metric 的判定(PASS/FAIL + 分数 + 失败原因) | 总是 | +| `Other Active Components` | 当前轮**不被改写**的其他 prompt 字段长什么样 | 多字段优化时——让反思 LM 在改 A 时看到 B/C 现状,避免改坏上下游兼容性 | +| `history_top_k` | 这条 case 历史上跑得最好的几次 agent 答复(按分数排) | `reflection_history_top_k > 0` 时 | + +**`Case Body` 的具体结构**: + +```text +### Turn 1 +**User**: <用户原始输入> +**Expected**: <期望答复> +**Agent Response**: +**Tool Trace**: ← 仅有 tool 调用时 + - tool_name(args) → response +**Verdict** (Turn 1): + [FAIL] metric_name: score=0.0000, threshold=1.0000 + reason: agent output not byte-equal to expected (case-sensitive) + · rubric[no_emoji]: PASS score=1.00 ← 仅 LLM rubric metric + +### Turn 2 +... + +### Overall (case-level aggregate) ← 多 turn 或多 run 时 +... 
+``` + +**对确定性 metric 的失败原因合成**:当 metric 是 `final_response_avg_score` 这类不带 LLM judge 的评测器、只输出 score+status 时,框架会**自动合成一句失败说明**(例如:`agent output not byte-equal to expected (case-sensitive)` / `expected substring not contained in agent output (case-insensitive)` / `JSON structural comparison failed`),让反思 LM 直接看到**为什么没 match**,而不必自己 diff 文本去猜。 + +> 想看反思 LM 实际拿到的 prompt 全貌?跑优化时把 `verbose=2` 打开,gepa 内部日志会附带每轮的反思 prompt 文本——读一次心里就有数了。 + +### 5.3 5 个核心算子的实际行为 + +`optimizer.json` 的 `optimize.algorithm` 段里,最常被问到的 5 个开关,在源码里到底干什么: + +| 算子 | 一句话功能 | 调它的典型动机 | 详细参考 | +| --- | --- | --- | --- | +| `reflection_minibatch_size` | 每轮反思 LM 看几条 case | 调小省 token,调大让反思 LM 视野更全 | [§7.3.3](#733-optimizealgorithm-段) | +| `module_selector` | 多字段时这一轮改哪个字段(`round_robin` 轮换 / `all` 全选 / `random` 随机) | 想清晰归因每个字段贡献 → `round_robin` | [§4.3](#43) | +| `frontier_type` | Pareto 前沿粒度(`instance` 每 case 一个 best / `objective` 每 metric 一个 / `hybrid` 双层 / `cartesian` 笛卡尔积) | 多 metric 真冲突时 → `hybrid` | [§4.5](#45) | +| `candidate_selection_strategy` | 下一轮反思的 parent 怎么挑(`pareto` 默认从前沿挑 / `current_best` 用当前最优 / 等) | 想加快收敛或加大探索 | [§7.3.3](#733-optimizealgorithm-段) | +| `use_merge` + `max_merge_invocations` | 是否启用跨字段融合 + 触发次数上限 | **仅多字段才真触发**——单字段下 `mergeRoundsTotal=0` 是预期 | [§4.3](#43) / [§4.8](#48) | + +### 5.4 停机时机:完成当前轮再停 + +6 种 algorithm 级停机条件(`max_metric_calls` / `timeout_seconds` / `no_improvement` / `score_threshold` / `max_candidate_proposals` / `max_tracked_candidates`)在每轮结束时**同步检查**——任一条件满足即停。 + +**3 个容易踩的细节**: + +| 细节 | 含义 | 怎么避雷 | +| --- | --- | --- | +| **不立即 kill 当前轮** | 触发停机时不会把正在跑的 round 中断;要等当前 round 跑完才真正停 | SLO 硬截止场景下,`timeout_seconds` 设为业务真实窗口的 50% 左右,留缓冲 | +| **实际终止时间常超过 `timeout_seconds`** | 上一条的直接后果——卡在长 round 里时尤其明显 | 给 `call_agent` 内部的 LLM 调用加自己的超时(参考 §4.2 CLI 的 90s 超时) | +| **多个 stopper 同时触发的优先级** | `framework_stopper`(`required_metrics` 政策)优先;其次按 algorithm 级 stopper 的插入顺序取第一个 | `OptimizeResult.stop_reason` 字段记录抢闸者,跑完直接看就知道是哪条触发的 | + +**`stop_reason` 
取值参考**(`OptimizeResult.stop_reason`): + +``` +required_metrics_passing ← framework 级(最高优先级) +score_threshold ← 达到目标分 +budget_exhausted ← max_metric_calls +timeout ← timeout_seconds +no_improvement ← max_iterations_without_improvement +max_candidate_proposals +max_tracked_candidates +user_requested_stop ← 用户 touch 了 optimize.stop 文件 +completed ← 没有 stopper 触发,gepa 自然跑完 +``` + +### 5.5 一种特殊情况:FAILED + +正常情况下 `OptimizeResult.status = "SUCCEEDED"`——gepa 跑完了循环(自然结束 / stopper 触发都算)。但有一种特殊状态值得用户关注: + +- **`status = "FAILED"`**:gepa 在跑的过程中抛了异常(最常见:训练/验证集加载失败、`gepa.optimize()` 内部异常、反思 LM 调用失败) +- **此时 `best_prompts` 强制设为 `baseline_prompts`**——保证你拿到的产物**永远不会比 baseline 差** +- **`update_source=True` 在 FAILED 时不会回写**源 prompt 文件(详见 §3.4 决策表) + +另一个易混点是"跑完了但没改善":这种情况 `status` 仍是 `"SUCCEEDED"`,但 `finish_reason="no_improvement"`,且 `best_prompts == baseline_prompts`——summary.txt 里会显示 `baseline → baseline`(没退化也没提升)。这是预期,不是 bug。 + + + +## 6 成本与并发 + +跑一次优化要多少 LLM 调用?哪些旋钮影响调用量、哪些影响并发量、哪些两者都影响? + +### 6.1 一次优化的 LLM 调用从哪来 + +LLM 调用分两块——**评估侧吃绝大部分**,反思侧零头: + +**评估侧(agent + judge)**:以下每个环节都会把一批 case "跑一遍"—— + +```text +跑一次 baseline 评估: valset 全跑一遍 ← 起点,1 次 +每个 reflective round: 抽 N 条 case 跑一遍 + 新候选重跑 ← 主要成本 +特定的 reflective round: 在 valset 上重评当前最优候选 ← gepa 决定何时跑 +``` + +每次"跑一遍"实际触发的 LLM 调用数 = **case 数 × `num_runs` × (每条 case 的 agent 调用数 + 各 metric 的 judge 调用数之和)**(与 §6.5 的并发估算口径一致)。其中: + +| 因子 | 来源 | 典型取值 | +| --- | --- | --- | +| 每条 case 的 agent 调用数 | evalset 数据;多轮 conversation 时按 turn 数累加 | 单 turn = 1,多 turn = N | +| `evaluate.num_runs` | 让每条 case 跑几次取均值消除 LLM 输出方差 | 1(默认,省)/ 2~3(推荐,稳) | +| 每条 metric 的 judge 调用数 | 看 metric 类型:`final_response_avg_score` 类确定性匹配 = 0 次;`llm_judge` / `llm_rubric_response` ≥ 1 次(`judge_models` 数组里几个就是几次) | 0~3 | + +**反思侧(reflection LM)**: + +```text +每个 reflective round: 1 次(生成新候选 prompt) +每个 merge round: 1 次(仅 use_merge=true 且多字段时才有 merge round) +``` + +反思侧调用数远少于评估侧——通常一次完整优化反思 LM 也就 5~20 次。 + +### 6.2 跑完后从 result.json 读到什么 + +`OptimizeResult` 里实际记录的统计字段(产物 `result.json` 里 camelCase 索引): +
+| 字段 | 含义 | +| --- | --- | +| `totalMetricCalls` | gepa 累计的 case-level 评估次数 | +| `totalReflectionLmCalls` | 反思 LM 累计调用次数(含重试) | +| `totalTokenUsage` | 反思 LM 累计 token:`{prompt, completion, total}` | +| `durationSeconds` | 总 wall-clock 耗时 | + +需要估算业务侧的实际 USD 成本时,用 `totalTokenUsage` × LLM 后端单价反算反思侧;agent / judge 侧从 LLM 后端用量记录中拉取(API 控制台 / billing 报表)。 + +### 6.3 4 个常用旋钮的乘数效应 + +按"对总调用量的影响倍率"从大到小排——遇到优化跑爆预算时,先调上面的: + +| 旋钮 | 乘多少 | 调小的代价 | 详细 | +| --- | --- | --- | --- | +| `algorithm.max_metric_calls` | **总调用量的硬上限**——gepa 累计达到就停 | 太小→优化第 1 轮就被它停;看不到任何提分 | [§4.7](#47) | +| `evaluate.num_runs` | **乘 N**——每条 case 跑 N 次取均值 | 1 时 LLM 输出方差直接进入分数(同 prompt 两次跑分不一样);建议 ≥ 2 | [§4.5](#45) | +| `optimize.eval_case_parallelism` | **不影响总量**,只影响**墙钟时间**和**瞬时 QPS** | 调高省时间但容易撞 LLM 后端 rate limit | [§4.5](#45) | +| `algorithm.reflection_minibatch_size` | **乘几条**——每轮反思 LM 看几条 case;评估侧也按这个数算 | 太大→反思 prompt 撑爆 LLM 上下文窗口 | [§4.3](#43) | + +### 6.4 想合理设阈值?先跑一次基准 + +设 `timeout_seconds` / `max_metric_calls` 等阈值前,**先按默认配置跑一次基准**——从产物里读两个数: + +| 要测的值 | 怎么测 | 怎么用 | +| --- | --- | --- | +| **单轮典型耗时** | `runs//result.json` 里 `rounds[*].durationSeconds`(取中位数) | `timeout_seconds` 至少设为单轮耗时 × 2,否则第 1 轮就抢闸看不到优化进展 | +| **单轮 metric_calls** | 同上,`totalMetricCalls / totalRounds` | `max_metric_calls` 至少能跑过 `max_iterations_without_improvement` 轮,否则永远是 budget 先抢闸 | + +**例**:基准跑显示单轮 30 秒、单轮 4 次 metric_calls,CI 窗口 5 分钟——那么 `timeout_seconds=120`(留缓冲)、`max_metric_calls=24`(跑 6 轮够 `max_iterations_without_improvement=3` 抢闸)。 + +### 6.5 单轮瞬时 LLM QPS 控制 + +单轮内并发跑出的 LLM 请求数: + +```text +单轮瞬时 LLM QPS ≈ eval_case_parallelism (并行跑几条 case) + × num_runs (每条 case 跑几次) + × (每条 case 的 agent 调用数 + 所有 judge 调用数) +``` + +**典型场景估算**:3 个 judge + `num_runs=2` + `eval_case_parallelism=4` + 每 case 1 次 agent 调用 + 3 次 judge 调用 → 单轮瞬时约 32 次 LLM 请求。当 LLM 后端 rate limit 为 30 QPS 时该配置必然触发限流。 + +**控制瞬时 QPS 的两个参数**(按效果排序): + +| 参数 | 影响 | 适用 | +| --- | --- | --- | +| `eval_case_parallelism` | 直接降低并发 case 数 | 大多数情况首选;黑盒 CLI、multi-judge 等单 case 
调用密集的场景下设为 `1` 串行执行(详见 [§4.2](#42)、[§4.5](#45)) | +| `num_runs` | 减少每条 case 的重复评估 | 牺牲一定的方差稳定性;建议在确认 LLM 输出方差较小后才下调 | + +### 6.6 反思 LM 选型与配置 + +反思 LM 的输出质量直接决定 prompt 改写质量。配置位置(`optimizer.json`): + +```jsonc +{ + "optimize": { + "algorithm": { + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { + "max_tokens": 4096, // 反思 prompt 较长,留够输出空间 + "temperature": 0.6 // 0.6~0.8 之间,让 LM 有创造性 + } + } + } + } +} +``` + +**两条建议**: + +- **可与 agent / judge 独立配置**——`reflection_lm` 段是独立的,business 可以选不同的 model(避免"自评"偏差,或者纯粹因为 reflection 任务对模型推理力要求更高) +- **token 用量真实记录**——`totalTokenUsage` 字段会累计反思 LM 的实际 prompt + completion + total token 数;按 LLM 后端单价反算 USD 即可 + + + +## 7 完整 API 参考 + +工具书章节,按"想找什么参数"组织。**每个表都有"必填"列**,三档含义: + +- **必填**:不传/不配 → 启动期 fail-fast 报错 +- **选填**:可不配;不配走默认值 +- **条件必填**:单看条目可不配,但**满足某条件时必须配**——条件写在条目末尾的"条件"列 + +所有字段都基于实际源码(每个表头标注源文件路径)。 + +### 7.1 `AgentOptimizer.optimize` 参数表 + +源码:`trpc_agent_sdk/evaluation/_agent_optimizer.py:AgentOptimizer.optimize`。**11 个 keyword-only 参数**——必须用 `key=value` 形式传,不接受位置参数。 + +| 参数 | 必填 | 类型 | 默认 | 说明 | +| --- | --- | --- | --- | --- | +| `config_path` | **必填** | `str` | — | optimizer.json 配置文件路径 | +| `call_agent` | **必填** | `async (str) -> str` | — | 业务 agent 适配函数;签名固定为"接 query 返回 str" | +| `target_prompt` | **必填** | `TargetPrompt` | — | 注册哪些 prompt 字段是优化目标(至少 1 个,否则报错) | +| `train_dataset_path` | **必填** | `str` | — | 训练 evalset 文件路径 | +| `validation_dataset_path` | **必填** | `str` | — | 验证 evalset 文件路径;**必须与 `train_dataset_path` 不同**(防数据泄漏,框架会规范化路径再比对) | +| `output_dir` | **必填** | `str` | — | 产物目录;不存在自动创建 | +| `callbacks` | 选填 | `Optional[Callbacks]` | `None` | 评测器生命周期回调(少用) | +| `update_source` | 选填 | `bool` | `False` | 优化成功后是否回写源 prompt 文件(决策表见 [§3.4](#34-agentoptimizer)) | +| `verbose` | 选填 | `int` | `1` | 终端输出详细度:`0` 静默 / `1` 默认 Rich 面板 / `2` 加 gepa 内部日志转发 | +| `extra_stop_callbacks` | 选填 | 
`Optional[Sequence]` | `None` | 运行时追加的 stopper(SLO 监控 / kill switch 等);普通 callable 显示为 `stop_reason="completed"`,需稳定标签时用 `_LabeledStopper` 包装或暴露 `.label` 属性 | +| `extra_gepa_callbacks` | 选填 | `Optional[Sequence]` | `None` | 运行时追加的 gepa 事件 callback(如转发到 dashboard);需实现 `gepa.core.callback.GEPACallback` 协议 | + +**返回值**:`OptimizeResult`(详见 [§7.4](#74-optimizeresult--roundrecord-字段表))。 + +**启动期 fail-fast 检查**(`_validate_inputs`): + +| 检查不通过的情况 | 抛出 | +| --- | --- | +| `output_dir` 是空字符串 | `ValueError` | +| `target_prompt` 没注册任何字段 | `ValueError` | +| `call_agent` 不是 async 函数(含 `__wrapped__` 检查,支持 `functools.partial` 包装的 async) | `TypeError` | +| `train_dataset_path` 与 `validation_dataset_path` 解析后是同一个文件(用 `os.path.normpath(os.path.abspath(...))` 规范化后比对) | `ValueError`(防数据泄漏) | +| `evaluate.metrics` 含 `tool_trajectory_avg_score` 或 `llm_rubric_knowledge_recall`——这俩需要 session traces / tool intermediate_data,`call_agent` 黑盒模式拿不到 | `ValueError` | +| 配置中 `algorithm.name` 不在 `OPTIMIZER_REGISTRY` 注册过 | `ValueError`(消息列出所有已注册算法名) | +| `use_merge=true` 且 `TargetPrompt` 字段数 < 2 | `UserWarning`(不致命,但 `mergeRoundsTotal` 会一直是 0) | + +### 7.2 `TargetPrompt` API 表 + +源码:`trpc_agent_sdk/evaluation/_target_prompt.py`。一个注册多字段 prompt 的容器,支持文件源和回调源两种形态。 + +| 方法 | 签名 | 行为 | +| --- | --- | --- | +| `add_path(name, path)` | `(str, str) -> Self` | 注册文件源字段;`name` 必须唯一;返回 self 供链式调用 | +| `add_callback(name, *, read, write)` | `(str, *, AsyncRead, AsyncWrite) -> Self` | 注册回调源字段;`read: async () -> str`、`write: async (str) -> None` 必须都是 async;`name` 必须唯一 | +| `names()` | `() -> list[str]` | 返回字段名(按注册顺序) | +| `describe_source(name)` | `(str) -> str` | 文件源返回路径;回调源返回字面量 `""`;未知 name 抛 `KeyError` | +| `read(name)` | `async (str) -> str` | 读取单个字段 | +| `read_all()` | `async () -> dict[str, str]` | 读取全部字段(按注册顺序) | +| `write_all(prompts)` | `async (dict[str, str]) -> None` | **原子写入全部字段**(详见下方契约) | + +**`write_all` 的原子性契约**(来自源码注释): + +1. **文件源原子写**:先写到 `.tmp`,再 `os.replace` 重命名(POSIX 保证 rename 原子) +2. 
**失败回滚**:任一文件写失败时,已写成功的文件回滚到 pre-call 内容、清理残留 `.tmp`,原异常正常上抛 +3. **回滚自身失败**:原异常通过 `__context__` 保留,并抛 `_RollbackError` 列出每个字段的回滚失败明细——回滚是 best-effort,一个字段失败不会跳过后续 +4. **回调源不回滚**:文件源写成功后再依次跑回调源;回调源失败时,文件源回滚 baseline,但**回调源自身不回滚**(幂等性由调用方负责) + +**`write_all` 的 keys 校验**:传入 `prompts` 的 key 集合必须**精确等于**注册的字段名集合,否则抛 `ValueError`。 + +### 7.3 `optimizer.json` 配置项表 + +源码:`trpc_agent_sdk/evaluation/_optimize_config.py`。pydantic schema,**支持 camelCase 和 snake_case 两种 key**。顶层结构: + +```jsonc +{ + "evaluate": { ... }, // 评测段(与 AgentEvaluator 同 schema) + "optimize": { // 优化器段 + "eval_case_parallelism": 4, + "stop": { ... }, // 框架级停机 + "algorithm": { ... } // 算法块(含 reflection_lm) + } +} +``` + +#### 7.3.1 `evaluate` 段 + +源码:`_eval_config.py:EvalConfig`。 + +| 字段 | 必填 | 类型 | 默认 | 说明 | +| --- | --- | --- | --- | --- | +| `metrics` | **条件必填**(见下) | `Optional[list[dict]]` | `None` | metric 数组,每条含 `metric_name` / `threshold` / `criterion`。**配了 `metrics` 就忽略 `criteria`** | +| `criteria` | **条件必填**(见下) | `dict[str, Any]` | `{}` | 旧式简写:`metric_name → threshold` 或 `{threshold, criterion}` | +| `num_runs` | 选填 | `int` | `1` | 每条 case 跑几次取均值(消除 LLM 输出方差);`≥ 2` 推荐 | +| `user_simulator_config` | 选填 | `Optional[Any]` | `None` | 用户模拟器配置(多轮场景;少用) | + +**条件**:`metrics` 与 `criteria` **至少配 1 个**——两者都为空时 `evaluate.get_eval_metrics()` 返回空列表,启动期会因没有 metric 报错。新接入推荐用 `metrics`(更结构化),`criteria` 主要为兼容旧配置保留。 + +#### 7.3.2 `optimize` 段 + +源码:`_optimize_config.py:OptimizeConfig`。 + +| 字段 | 必填 | 类型 | 默认 | 说明 | +| --- | --- | --- | --- | --- | +| `eval_case_parallelism` | 选填 | `int` | `4` | 同一轮内 case 并发数(不影响总调用量、影响瞬时 QPS) | +| `stop` | 选填 | `FrameworkStopConfig` | `{required_metrics: "all"}` | 框架级停机段(详见 [§7.3.5](#735-optimizestop-段)) | +| `algorithm` | **必填** | `GepaReflectiveAlgo` | — | 算法块(详见 [§7.3.3](#733-optimizealgorithm-段)) | + +#### 7.3.3 `optimize.algorithm` 段 + +源码:`_optimize_config.py:GepaReflectiveAlgo`。`gepa_reflective` 算法的所有可调参数。 + +> **硬约束**:表中**最后 6 项 stopper 字段中至少配 1 个**——全部留空(默认 `None`)会被 
`_require_at_least_one_stop_condition` 拒绝、抛 `ValueError` fail-fast。这就是把它们标为"条件必填"的原因。 + +**基础字段**: + +| 字段 | 必填 | 类型 | 默认 | 说明 | +| --- | --- | --- | --- | --- | +| `name` | **必填** | `Literal["gepa_reflective"]` | — | 算法选择器;目前唯一可选值 | +| `reflection_lm` | **必填** | `OptimizeModelOptions` | — | 反思 LM 配置(详见 [§7.3.4](#734-optimizealgorithmreflection_lm-段)) | +| `seed` | 选填 | `int` | `42` | 随机种子;A/B 实验时两套配置应保持一致 | + +**搜索行为字段**: + +| 字段 | 必填 | 类型 | 默认 | 取值与说明 | +| --- | --- | --- | --- | --- | +| `candidate_selection_strategy` | 选填 | Literal | `"pareto"` | `pareto` 从前沿挑(默认推荐)/ `current_best` 用当前最优 / `epsilon_greedy` 探索-利用 / `top_k_pareto` 前沿前 K 名中随机 | +| `module_selector` | 选填 | `str` | `"round_robin"` | 多字段时本轮改哪个:`round_robin` 按注册顺序轮换 / `all` 全选 / `random` 随机 | +| `frontier_type` | 选填 | Literal | `"instance"` | Pareto 前沿粒度:`instance` 每 case 一个 best / `objective` 每 metric 一个 / `hybrid` 双层 / `cartesian` 笛卡尔积 | +| `reflection_minibatch_size` | 选填 | `Optional[int]` | `None` | 每轮反思 minibatch 大小;`None` 让 gepa 决定 | +| `reflection_history_top_k` | 选填 | `int` (0~5) | `2` | 每条 case 给反思 LM 多少条历史最佳 response;0 禁用,上限 5 | +| `perfect_score` | 选填 | `float` | `1.0` | "完美分"阈值(搭配 `skip_perfect_score`) | +| `skip_perfect_score` | 选填 | `bool` | `True` | 反思时跳过已满分的 case | + +**多字段融合(merge)字段**: + +| 字段 | 必填 | 类型 | 默认 | 说明 | +| --- | --- | --- | --- | --- | +| `use_merge` | 选填 | `bool` | `False` | 启用 merge round;**仅多字段(≥2)时真触发**,单字段时不会触发也不会报错(仅 UserWarning) | +| `max_merge_invocations` | 选填 | `int` | `5` | merge 触发次数上限 | +| `merge_val_overlap_floor` | 选填 | `int` | `5` | 触发 merge 的最低 val 集 case 重叠数 | + +**性能字段**: + +| 字段 | 必填 | 类型 | 默认 | 说明 | +| --- | --- | --- | --- | --- | +| `cache_evaluation` | 选填 | `bool` | `False` | 缓存 (candidate, case) 评分;重复评估直接跳 | +| `track_best_outputs` | 选填 | `bool` | `False` | 追踪每 case 的最佳输出 | + +**停机条件 6 项**——**至少配 1 个**(OR 语义触发): + +| 字段 | 必填 | 类型 | 默认 | 抢闸条件 | +| --- | --- | --- | --- | --- | +| `max_metric_calls` | 条件必填 | `Optional[int]` | `None` | 累计 case-level 
评估次数 ≥ N → 停 | +| `max_iterations_without_improvement` | 条件必填 | `Optional[int]` | `None` | 连续 N 轮 best valset 无提升 → 停 | +| `timeout_seconds` | 条件必填 | `Optional[float]` | `None` | wall-clock 超过 N 秒 → 停 | +| `score_threshold` | 条件必填 | `Optional[float]` | `None` | best valset 分数 ≥ N → 停 | +| `max_candidate_proposals` | 条件必填 | `Optional[int]` | `None` | 候选提议次数 ≥ N → 停 | +| `max_tracked_candidates` | 条件必填 | `Optional[int]` | `None` | Pareto 候选池大小 ≥ N → 停 | + +**条件**:6 项里至少配 1 个非 `None`,否则启动期 fail-fast。详见 [§4.7 SLO 硬约束](#47)。 + +#### 7.3.4 `optimize.algorithm.reflection_lm` 段 + +源码:`_optimize_model_options.py:OptimizeModelOptions`。反思 LM 的连接配置。 + +> **日常只需配 4 个**:`model_name` / `base_url` / `api_key` / `generation_config`(其余留默认)。下表中标"高阶"的 6 项一般不需要碰。 + +| 字段 | 必填 | 类型 | 默认 | 说明 | +| --- | --- | --- | --- | --- | +| `model_name` | **必填** | `str` | `""` | 模型名(如 `"gpt-4o-mini"`);空字符串等于没配,会启动期报错 | +| `base_url` | 选填 | `Optional[str]` | `None` | 自定义 endpoint URL | +| `api_key` | 选填 | `str` | `""` | API key(多数 provider 必须给,否则调用阶段会报错) | +| `generation_config` | 选填 | `Optional[dict]` | `None` | 生成参数;典型:`{"max_tokens": 4096, "temperature": 0.6}` | +| `provider_name` | 高阶 | `str` | `""` | provider 名;空 / `"openai"` 走 `OpenAIModel`,其他值走 `ModelRegistry.create_model("{provider}/{model}")` | +| `variant` | 高阶 | `str` | `""` | OpenAI 兼容变体(仅 provider 是 openai 时) | +| `extra_fields` | 高阶 | `Optional[dict]` | `None` | 透传给底层 model 的额外字段 | +| `num_samples` | 高阶 | `Optional[int]` | `None` | 采样数 | +| `weight` | 高阶 | `float` | `1.0` | 权重(multi-judge 场景) | +| `think` | 高阶 | `Optional[bool]` | `None` | 是否启用 thinking 模式 | + +**字段值支持环境变量展开**——`"${TRPC_AGENT_API_KEY}"` 会被自动替换。 + +#### 7.3.5 `optimize.stop` 段 + +源码:`_optimize_config.py:FrameworkStopConfig`。 + +| 字段 | 必填 | 类型 | 默认 | 取值 | +| --- | --- | --- | --- | --- | +| `required_metrics` | 选填 | `Optional[Union[Literal["all"], list[str]]]` | `"all"` | `"all"`:所有 metric 都要达 threshold;`["m1", "m2"]`:列出的 metric 都要达 threshold(其他 metric 
仍参与评测但不影响早停);`null` 或 `[]`:禁用框架级早停(仅靠 algorithm 级 stopper) | + +**列表形式校验**:列表中的 metric 名必须能在 `evaluate.metrics[]` 中找到,否则启动期 `OptimizeConfigFile._validate_required_metrics_against_evaluate` 抛 `ValueError`,错误信息列出"未知 metric"和"可用 metric"清单。 + +### 7.4 `OptimizeResult` + `RoundRecord` 字段表 + +源码:`trpc_agent_sdk/evaluation/_optimize_result.py`。这是 `optimize()` 的返回值,也是 `runs//result.json` 的内容。 + +> **重要约定**:`OptimizeResult` 与 `RoundRecord` 都基于 `EvalBaseModel`(`alias_generator=to_camel`)。**Python 内存中是 snake_case,序列化到 JSON 时全部转 camelCase**——读 `result.json` 时按 camelCase 索引(`bestPassRate` 而非 `best_pass_rate`),常见踩坑点。下表中"字段"列用 Python 名(snake_case),读 JSON 时换成 camelCase。 + +#### 7.4.1 `OptimizeResult` 顶层字段 + +**核心结果字段**: + +| 字段(snake_case) | 类型 | 含义 | +| --- | --- | --- | +| `status` | `Literal["SUCCEEDED", "FAILED", "CANCELED"]` | 最终状态;`FAILED` 时 `best_prompts = baseline_prompts` | +| `finish_reason` | Literal | `completed` / `perfect_pass_rate` / `no_improvement` / `error` | +| `stop_reason` | `Optional[StopReason]` | 哪条 stopper 抢闸(详见 [§5.4](#54-停机时机完成当前轮再停));FAILED 早停时为 `None` | +| `error_message` | `str` | FAILED 时的错误信息(默认 `""`) | +| `algorithm` | `str` | 算法名(如 `"gepa_reflective"`) | + +**分数字段**: + +| 字段 | 类型 | 含义 | +| --- | --- | --- | +| `baseline_pass_rate` | `float` | baseline 在 valset 的 pass rate | +| `best_pass_rate` | `float` | 最优候选在 valset 的 pass rate | +| `pass_rate_improvement` | `float` | `best - baseline` | +| `baseline_metric_breakdown` | `dict[str, float]` | baseline 每条 metric 均分 | +| `best_metric_breakdown` | `dict[str, float]` | 最优候选每条 metric 均分 | +| `metric_thresholds` | `dict[str, float]` | 每条 metric 的 threshold(拷自 `evaluate.metrics[].threshold`) | +| `per_metric_best_candidates` | `dict[str, list[int]]` | 每条 metric 的 Pareto 前沿候选索引(0-based);空 = 算法不暴露此信息 | + +**prompt 字段**: + +| 字段 | 类型 | 含义 | +| --- | --- | --- | +| `baseline_prompts` | `dict[str, str]` | 起点 prompt 内容(按 TargetPrompt 字段名 keyed) | +| `best_prompts` | `dict[str, str]` | 最优候选 prompt;FAILED 时 = 
`baseline_prompts`(保证产物**永远不会比 baseline 差**) | + +**轮次字段**: + +| 字段 | 类型 | 含义 | +| --- | --- | --- | +| `total_rounds` | `int` | 跑了几轮 | +| `rounds` | `list[RoundRecord]` | 每轮记录(详见 §7.4.2) | + +**统计与时间字段**: + +| 字段 | 类型 | 含义 | +| --- | --- | --- | +| `total_reflection_lm_calls` | `int` | 反思 LM 累计调用次数(含重试) | +| `total_token_usage` | `dict[str, int]` | 反思 LM 累计 token:`{prompt, completion, total}` | +| `duration_seconds` | `float` | 总 wall-clock 耗时 | +| `started_at` / `finished_at` | `str` | ISO-8601 时间戳 | + +**其他**: + +| 字段 | 类型 | 含义 | +| --- | --- | --- | +| `schema_version` | `str` | 默认 `"v1"`;产物 schema 升级时 bump | +| `extras` | `dict[str, Any]` | 自定义业务字段;优化器不读不写 | + +#### 7.4.2 `RoundRecord` 字段(每轮一条) + +**轮次基本信息**: + +| 字段 | 类型 | 含义 | +| --- | --- | --- | +| `round` | `int` | 1-based 轮次号 | +| `kind` | `Literal["reflective", "merge"]` | 反思轮 / 融合轮 | +| `started_at` | `str` | ISO-8601 时间戳 | +| `duration_seconds` | `float` | 本轮 wall-clock 耗时 | + +**改写情况**: + +| 字段 | 类型 | 含义 | +| --- | --- | --- | +| `optimized_field_names` | `list[str]` | 本轮被反思 LM 改写的字段名 | +| `candidate_prompts` | `dict[str, str]` | 本轮候选的全字段内容 | +| `accepted` | `bool` | 是否被采纳为新 best | +| `acceptance_reason` | `str` | 采纳决策的可读说明 | +| `per_field_diagnosis` | `dict[str, str]` | 反思 LM 给每个字段的诊断文本 | + +**评分情况**: + +| 字段 | 类型 | 含义 | +| --- | --- | --- | +| `validation_pass_rate` | `float` | 本轮在 valset 的 pass rate | +| `metric_breakdown` | `dict[str, float]` | 本轮 valset 上每条 metric 均分;空 = 该轮没跑 valset | +| `failed_case_ids` | `list[str]` | 本轮 valset 失败的 case id | +| `failed_cases_truncated` | `int` | 因 token 预算被裁掉的失败 case 数 | +| `train_minibatch_size` | `int` | 本轮 minibatch 大小;0 = 跳过未抽样 | +| `train_subsample_parent_score` | `Optional[float]` | parent 候选在 minibatch 上的分;`None` = 未跑 | +| `train_subsample_candidate_score` | `Optional[float]` | 新候选在 minibatch 上的分;`None` = 未跑 | +| `skip_reason` | `Optional[str]` | 跳过原因(如 `"subsample perfect"`、`"no proposal"`) | +| `error_message` | `Optional[str]` | 本轮算法错误信息 | + 
+**统计字段**: + +| 字段 | 类型 | 含义 | +| --- | --- | --- | +| `reflection_lm_calls` | `int` | 本轮反思 LM 调用次数(含重试) | +| `round_token_usage` | `dict[str, int]` | 本轮反思 LM token:`{prompt, completion, total}` | +| `budget_used` | `Optional[int]` | 累计已用 metric_calls | +| `budget_total` | `Optional[int]` | 配置预算上限(如 `max_metric_calls`) | + +**`extras`**(`dict[str, Any]`):自定义业务字段;优化器不读不写。 + +#### 7.4.3 `OptimizeResult` 实用方法 + +| 方法 | 行为 | +| --- | --- | +| `dump_to(path)` | 序列化到 JSON 文件(`indent=2`, `by_alias=True`) | +| `OptimizeResult.from_file(path)` | classmethod,从 JSON 反序列化 | +| `format_summary(*, output_dir, update_source)` | 生成 `summary.txt` 的人类可读文本 | + + + + + +## 8 产物与目录约定 + +每跑一次 `optimize()`,框架在 `output_dir` 下落一组完整的审计产物。所有写入都是**原子的**——SIGINT / 进程崩溃都不会留下半写文件。 + +### 8.1 目录布局 + +```text +runs// +├── result.json 完整 OptimizeResult 序列化(程序读取入口) +├── summary.txt 人类可读摘要(一眼看 baseline → best) +├── config.snapshot.json 本次跑用的 optimizer.json 完整快照(可重现) +├── run.log 单行状态,CI 解析友好 +│ +├── baseline_prompts/ 运行前的 prompt 快照(每字段一个 .md) +│ ├── system_prompt.md +│ └── ... +│ +├── best_prompts/ 优化得到的最优候选(每字段一个 .md) +│ ├── system_prompt.md +│ └── ... +│ +└── rounds/ 每轮的完整 RoundRecord + ├── round_001.json + ├── round_002.json + └── ... +``` + +每个文件的角色: + +| 文件 / 目录 | 何时写 | 干什么用 | +| --- | --- | --- | +| `result.json` | 优化结束(含失败) | 程序读取最权威产物。完整 `OptimizeResult` 序列化(详见 [§7.4](#74-optimizeresult--roundrecord-字段表))。**字段名 camelCase** | +| `summary.txt` | 优化结束(仅成功) | 人类可读摘要:`baseline → best` 趋势、metric breakdown、所有 best 字段 + 字符数、artifacts 目录索引 | +| `config.snapshot.json` | 优化开始 | 本次跑用的 `optimizer.json` 完整快照——后续想"复跑这次结果"直接用它 | +| `run.log` | 优化结束 | 单行:` status=... algorithm=... 
baseline=0.4 best=0.85 delta=+0.45 rounds=10 duration_seconds=120.5`;CI 平台 grep 友好 | +| `baseline_prompts/.md` | 优化开始 | 运行前每个 TargetPrompt 字段的内容快照——**无论 `update_source` 设什么都会写**(最重要的兜底产物) | +| `best_prompts/.md` | 优化结束(仅有 result 时) | 最优候选 prompt——`update_source=False` 时这是最有价值的产物(待人工 review 后同步) | +| `rounds/round_.json` | 每轮结束 | 完整 `RoundRecord` 序列化(详见 [§7.4.2](#742-roundrecord-字段每轮一条));3 位零填充编号便于排序 | + +### 8.2 哨兵文件:让用户主动停优化 + +源码:`_optimize_gepa_reflective.py:_build_stop_callbacks` 末尾。 + +跑优化期间,用户在 `output_dir` 下**手动 `touch optimize.stop`**: + +```bash +touch runs//optimize.stop +``` + +下一轮开头框架检测到该文件即停(`gepa.utils.FileStopper` 实现),`stop_reason="user_requested_stop"`。**典型用途**:跑了一半发现已经够用 / 临时要释放 LLM 配额——比 Ctrl+C 更优雅,能保证当前轮完成后干净落盘。 + +### 8.3 原子落盘保证 + +**所有产物都用 tmp + `os.replace` 原子写**——POSIX 保证 rename 原子,进程被 kill / 断电时 `output_dir` 里要么是干净的旧文件、要么是干净的新文件,**永远不会出现半写状态**。 + +源码:`_agent_optimizer.py` 的两个工具函数: +- `_atomic_write_text(path, content)`:先写 `.tmp`,再 `os.replace(tmp, path)` +- `_mask_sigint`:上下文管理器,在 `_persist_artifacts` 期间屏蔽 SIGINT(避免"第二次 Ctrl+C 打断 finally 落盘") + +**`update_source=True` 的源 prompt 文件回写**:使用 `TargetPrompt.write_all`,对**多字段**也保证原子性——任一字段写失败,已写成功的字段全部回滚到 pre-call 内容(详见 [§7.2](#72-targetprompt-api-表) 的 `write_all` 契约)。 + +> **极端容错**:如果 `update_source=True` 写源文件时 `os.replace` 自身失败(如目标文件所在目录被并发删除),框架会**显式调 `write_all(baseline)` 把源文件恢复到运行前内容**,再上抛原异常——保证业务永远拿不到一个"半优化"的源文件。 + + +## 9 想自己扩展? 
+ +源码总入口:`_optimize_registrations.py`。框架通过**注册表机制**支持三类扩展,不需要 fork SDK。 + +### 9.1 注册新算法 + +源码:`_base_optimizer.py:BaseOptimizer` + `_optimize_registry.py:OPTIMIZER_REGISTRY`。 + +写一个 `BaseOptimizer` 子类,实现 `async def run(self, *, reporter=None) -> OptimizeResult`,注册到 `OPTIMIZER_REGISTRY`: + +```python +from trpc_agent_sdk.evaluation._base_optimizer import BaseOptimizer +from trpc_agent_sdk.evaluation._optimize_registry import OPTIMIZER_REGISTRY +from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult + + +class MyOwnOptimizer(BaseOptimizer): + async def run(self, *, reporter=None) -> OptimizeResult: + # 你的算法主循环。基类已注入: + # self.config - OptimizeConfigFile(含 evaluate / optimize 两段) + # self.call_agent - 业务 agent 适配函数 + # self.target_prompt - TargetPrompt 实例 + # self.train_dataset_path / self.validation_dataset_path + # self.callbacks / self.output_dir + # self.extra_stop_callbacks / self.extra_gepa_callbacks + ... + return OptimizeResult(...) + + +# 注册:第二个参数必须是 BaseOptimizer 子类,否则 register() 抛 TypeError +OPTIMIZER_REGISTRY.register("my_own_algo", MyOwnOptimizer) +``` + +业务侧用法:在 `optimizer.json` 中把 `optimize.algorithm.name` 改为 `"my_own_algo"`,框架启动期通过 `OPTIMIZER_REGISTRY.get(...)` 查到你的类、实例化、跑 `run()`。 + +**注意**:`GepaReflectiveAlgo.name` 当前是 `Literal["gepa_reflective"]`——**新算法需要新加一个 `pydantic.BaseModel` 配置类**(如 `MyOwnAlgo`),并修改 `OptimizeConfig.algorithm` 字段为 discriminated union(详见 `_optimize_config.py:OptimizeConfig` 的 docstring)。 + +### 9.2 注册自定义 stopper + +源码:`_agent_optimizer.py:AgentOptimizer.optimize` 的 `extra_stop_callbacks` 参数。 + +通过 `extra_stop_callbacks` 在运行时注入——**不需要改配置文件**: + +```python +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _LabeledStopper + + +class MySloMonitorStopper: + """自定义 stopper:检查外部 SLO 监控系统,超阈值就停。""" + + def __init__(self, slo_client): + self._slo = slo_client + self.last_triggered = False + + def __call__(self, gepa_state=None) -> bool: + if self._slo.is_p99_breached(): + self.last_triggered = True + return 
True + return False + + +# 用法: +stopper = MySloMonitorStopper(slo_client) +result = await AgentOptimizer.optimize( + ..., + extra_stop_callbacks=[ + # 普通 stopper:stop_reason 显示为 "completed" + stopper, + + # 想要稳定的 stop_reason 标签时,用 _LabeledStopper 包装: + # _LabeledStopper(stopper, "slo_breach"), # 但 "slo_breach" 不在 StopReason Literal 里,pydantic 会拒 + ], +) +``` + +**接口约定**(见 `_LabeledStopper`): +- 必须有 `__call__(self, gepa_state=None) -> bool` 方法 +- `True` 即表示停机 +- 应有 `last_triggered: bool` 属性供 `_classify_stop_reason` 读取 + +**`stop_reason` 的两种行为**: +- 普通 callable / 自定义类:触发时 `stop_reason` 显示为 `"completed"`(gepa 不知道你为什么停) +- 用 `_LabeledStopper(inner, label)` 包装:`label` 必须是 `StopReason` Literal 的合法值(见 `_optimize_result.py`);自定义新 label 需扩展 Literal 类型 + +### 9.3 注册自定义评测 callback + +源码:`_agent_optimizer.py:AgentOptimizer.optimize` 的 `extra_gepa_callbacks` 参数。 + +通过 `extra_gepa_callbacks` 接入 gepa 内部事件——典型用途:转发到 dashboard / 实时监控 metric。 + +```python +class MyDashboardCallback: + def on_proposal_end(self, *args, **kwargs) -> None: + # 上报到 Grafana / WandB / 内部监控 + ... 
+ + # gepa 静默忽略缺失的方法,按需实现部分协议方法即可 + +result = await AgentOptimizer.optimize( + ..., + extra_gepa_callbacks=[MyDashboardCallback()], +) +``` + +**协议约束**:每个 callback 应实现 `gepa.core.callback.GEPACallback` 协议中的若干方法(`on_iteration_start` / `on_proposal_start` / `on_proposal_end` / `on_valset_breakdown` / ...)。**gepa 静默忽略 callback 缺失的方法**,所以业务可以只实现关心的那几个。 + + +## 10 FAQ + +**Q:跑了一次 `result.json` 里 `bestPassRate` 跟 `baselinePassRate` 一样、`accepted` 全是 false——是 bug 吗?** + +不是。优化没找到比 baseline 更好的候选——`status="SUCCEEDED"` + `finish_reason="no_improvement"` 是这种情况的典型组合,`best_prompts` 等于 `baseline_prompts`。可能原因:baseline 已经很好、`max_metric_calls` 给得太小没跑到改进点、训练集和验证集分布差太多、metric 噪声太大(建议提高 `num_runs`)。 + +--- + +**Q:`update_source=True` 跑挂了,源 prompt 文件被改坏了吗?** + +不会。两道保险:(1) 优化失败(`status="FAILED"`)时框架根本不调 `write_all`;(2) 即便 `write_all` 自身失败,源文件也通过 tmp + `os.replace` 原子回滚(详见 [§8.3](#83-原子落盘保证))。 + +--- + +**Q:能跑到一半改 `optimizer.json` 吗?** + +不能。`optimizer.json` 在启动期一次性 load,后续修改不会被读取。哨兵文件 `optimize.stop` 是唯一支持的"运行时干预"(详见 [§8.2](#82-哨兵文件让用户主动停优化))。 + +--- + +**Q:训练集很小(< 5 case)能跑吗?** + +可以,但效果差:(1) 反思 LM 看的反馈样本太少,改写方向不稳;(2) 小训练集容易让 advanced 配置过拟合(参考 [§4.8](#48))。建议至少 5~10 条 case;< 5 时考虑先做手工调优。 + +--- + +**Q:`call_agent` 内部发 HTTP / RPC 时怎么处理重试?** + +由 `call_agent` 自己处理。框架不替业务做 LLM / 服务调用层的重试——设计上保持 `call_agent` 是黑盒。如果调用失败,那一条 case 评测分数计 0,反思 LM 会看到错误信息(参考 §5.2 反思 LM 反馈结构)。 + +--- + +**Q:能让多个 `optimize()` 同时跑、共享一个 `output_dir` 吗?** + +不可以。多个进程同写一个 `output_dir`,原子写约束保护单文件不被半写,但**多个进程相互覆盖文件**——`result.json` / `rounds/round_001.json` 等会互相踩。每次跑用独立 timestamp 子目录。 + +--- + +**Q:用 black-box `call_agent` 模式时,能用 `tool_trajectory_avg_score` 这类 metric 吗?** + +不能。`call_agent` 黑盒模式拿不到 session traces / tool intermediate_data,框架启动期会 fail-fast 拒绝(详见 [§7.1](#71-agentoptimizeroptimize-参数表) 启动期检查表)。改用响应级 metric:`final_response_avg_score` / `llm_rubric_response` / `llm_final_response`。 + +--- + +**Q:跑完 `update_source=False`,源 prompt 还在原地,但 `target_prompt.write_all` 在过程中是被反复调过的?** + +是的。优化器主循环每次产生新候选都会 
`write_all` 把候选写到 `add_path` 注册的源文件——这是为了让下一次 `call_agent` 调用能读到新 prompt。**`finally` 阶段会自动 `write_all(baseline_snapshot)` 把源文件回滚到 baseline 内容**(源码:`_agent_optimizer.py:optimize` 的 `cleanup_done` sentinel)。所以 `update_source=False` 跑完后,源文件**与运行前完全一致**——前提是 `TargetPrompt.write_all` 没在回滚阶段抛错(极端情况下抛错时框架会记 warning 但不影响 `result.json` / `best_prompts/` 的产出)。 + +--- + +**Q:怎么"复跑"上次的优化结果?** + +复跑 `runs//config.snapshot.json` 即可——它是上次的完整配置快照。但 LLM 输出存在随机性,即使配置一致也可能得到不同 best_prompts;`seed` 字段固定能减少(不能消除)这种随机性。A/B 实验时一定要锁 seed(参考 [§4.8](#48))。 + + diff --git a/examples/optimization/advanced_strategies/README.md b/examples/optimization/advanced_strategies/README.md new file mode 100644 index 00000000..dbb261fe --- /dev/null +++ b/examples/optimization/advanced_strategies/README.md @@ -0,0 +1,206 @@ +# Advanced Strategies — GEPA 高阶策略组合 A/B 对照 + +> **适用场景**:已熟悉 GEPA 基本流程,希望进一步理解 `candidate_selection_strategy` / `frontier_type` / `use_merge` / `skip_perfect_score` 等高阶配置在真实任务上的行为差异。本 example 跑 baseline 与 advanced 两套配置后用 `compare.py` 输出对比表。阅读前请先熟悉 `quickstart/README.md` §2。 + +## 1 · 适用问题与设计目标 + +GEPA 高阶配置开关多,业务方常见困惑: + +- "打开 `use_merge=true` 真的会触发 merge 吗?" +- "`frontier_type` 选 `instance` 还是 `objective` 对我的任务有什么影响?" +- "`skip_perfect_score=true` 能省多少 reflection LM 调用?" 
+ +单跑一次优化往往看不出差异,因为 GEPA 在多数任务上都能收敛到相近 `best_pass_rate`。本 example 用 A/B 对照方法暴露差异: + +- **方案 A(baseline)**:基础策略组合 +- **方案 B(advanced)**:高阶策略组合(`frontier_type=objective` + `skip_perfect_score=true` + `use_merge=true`) +- **任务设计**:地址解析任务,混合"完整地址"与"缺信息地址"两类 case,制造多目标局部最优空间 + +| 输入 | 输出 | +| --- | --- | +| 两套不同的 `optimizer_*.json` 配置 | 两次独立的优化运行结果 | +| `compare.py` 解析两次的 `result.json` | 多维度对比表 | + +### 本 example 演示的最小用例 + +| 维度 | 值 | +| --- | --- | +| 业务任务 | 自由文本地址解析为严格 JSON `{country, city, postal_code, street}`(缺信息字段输出 `null`) | +| 优化目标 | `agent/prompts/system.md` 单字段 | +| 训练集 | 6 条 case:3 条完整地址 + 3 条缺信息地址 | +| 验证集 | 6 条 case | + +## 2 · 术语对照 + +仅列出本 example 引入的新概念。基础术语见 `quickstart/README.md` §2。 + +| 术语 | 含义 | +| --- | --- | +| **candidate_selection_strategy** | 反思每轮选哪个候选作为亲本的策略。可选 `pareto` / `current_best` / `epsilon_greedy` / `top_k_pareto`。 | +| **frontier_type** | Pareto 前沿粒度。可选 `instance`(按 case) / `objective`(按 metric) / `hybrid`(双层) / `cartesian`(按 case×metric)。 | +| **skip_perfect_score** | 反思 minibatch 抽样时是否跳过已满分的 case。 | +| **predictor-level merge** | merge 操作在 prompt 字段层面进行。**需要至少 2 个字段才有意义**——单字段优化下 merge 永远不会触发。 | +| **merge_val_overlap_floor** | 触发 merge 的最低 val 集 case 重叠数(默认 5)。 | + +## 3 · 运行示例 + +### 3.1 安装依赖 + +```bash +pip install -e ".[optimize]" +``` + +### 3.2 配置环境变量 + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +### 3.3 顺序跑两次优化 + +```bash +cd examples/optimization/advanced_strategies +python3 run_baseline.py # 配置 A:basic 策略组合 +python3 run_advanced.py # 配置 B:高阶策略组合 +``` + +每次运行约 3 分钟。 + +### 3.4 输出对比表 + +```bash +python3 compare.py +``` + +`compare.py` 自动选取 `runs/` 下最新的 `baseline_*` 与 `advanced_*` 目录解析 `result.json`,输出多维度对比表(轮次数、接受率、merge 触发次数、reflection LM 调用数、baseline / best pass_rate 等)。 + +## 4 · 架构与数据流 + +``` +[run_baseline.py] [run_advanced.py] + │ │ + ├── optimizer_baseline.json ├── optimizer_advanced.json + │ instance frontier │ objective frontier + │ 
skip_perfect_score=false │ skip_perfect_score=true + │ use_merge=false │ use_merge=true(单字段实际不触发) + │ │ + └── runs/baseline_/result.json └── runs/advanced_/result.json + + ┌────────────┴────────────┐ + │ python3 compare.py │ + │ _latest("baseline") │ + │ _latest("advanced") │ + │ 解析 result.json │ + │ 输出对比表 │ + └─────────────────────────┘ +``` + +### 4.1 文件清单 + +| 文件 | 角色 | 接入自有业务时的修改方向 | +| --- | --- | --- | +| `run_baseline.py` | basic 配置入口 | 与 quickstart 同 | +| `run_advanced.py` | 高阶配置入口 | 调整 `optimizer_advanced.json` 中策略组合 | +| `compare.py` | 解析两次 `result.json` 输出对比表 | 添加 / 删除关注的对比维度 | +| `agent/agent.py` | 地址解析 LlmAgent + `_normalize_json` | 替换为业务 agent | +| `agent/prompts/system.md` | baseline prompt(故意极简) | 写入业务 baseline | +| `optimizer_baseline.json` | basic 策略 JSON | 调整阈值与 metric | +| `optimizer_advanced.json` | 高阶策略 JSON | 调整高阶开关 | +| `data/train.evalset.json` / `data/val.evalset.json` | 数据集 | 替换为业务用例 | + +## 5 · 高阶策略对照 + +### 5.1 配置差异速查 + +| 配置项 | baseline | advanced | +| --- | --- | --- | +| `candidate_selection_strategy` | `pareto` | `pareto` | +| `frontier_type` | `instance` | `objective` | +| `skip_perfect_score` | `false` | `true` | +| `use_merge` | `false` | `true` | +| `module_selector` | `round_robin` | `round_robin` | + +### 5.2 `frontier_type` instance vs objective + +| 取值 | 行为 | 在本任务上的表现 | +| --- | --- | --- | +| `instance` | 每条 case 维护一个 best 候选,反思看逐 case 反馈 | 接受门槛较高(需在某 case 上严格优于历史),rounds_accepted 较少 | +| `objective` | 每条 metric 维护一个 best,反思看聚合分数 | 接受门槛较低(聚合分有提升即接受),rounds_accepted 较多但 valset 易震荡 | + +`objective` 更激进。小训练集(< 10 case)下可能过拟合 train minibatch,造成 valset pass_rate 波动。 + +### 5.3 `skip_perfect_score` 的实际节省 + +理论上能减少不必要的 reflection LM 调用。实际节省幅度取决于: + +- baseline 起点高度(baseline=0 时早期满分 case 极少,节省有限) +- 训练集规模(小训练集下满分 case 在 minibatch 中比例不稳定) + +本 example 实测约节省 1 次 reflection 调用,差异不显著。该开关在**大规模训练集 + 高基线起点**场景下效果更明显。 + +## 6 · 关键配置(含两条踩坑警示) + +### 6.1 `use_merge` 在单字段优化下不会触发 + +merge 是 predictor-level 操作,**需要至少 2 个字段才有意义**。本 example 
是单字段优化,因此 `optimizer_advanced.json` 中 `use_merge=true` 设置无副作用,但也不会带来任何实际 merge 行为——`compare.py` 输出中 `merge_rounds_total=0` 是预期。 + +需要观察 merge 实际效果时,参见 `multi_agent_pipeline/` example,其 4 字段配置下 merge 会真实触发。 + +### 6.2 `result.json` 字段命名为 camelCase + +SDK 内部使用 snake_case 字段名(如 `stop_reason` / `total_rounds` / `best_pass_rate`),但序列化到 `result.json` 时会自动转换为 camelCase(`stopReason` / `totalRounds` / `bestPassRate`)。 + +这是因为 `EvalBaseModel` 的 `alias_generator=to_camel`,序列化时 `by_alias=True`。 + +**踩坑提醒**:用 Python 读 `result.json` 时按 camelCase 索引: + +```python +data = json.loads(Path("result.json").read_text()) +print(data["bestPassRate"]) # ✅ +print(data["best_pass_rate"]) # ❌ KeyError +``` + +`compare.py` 中已经按 camelCase 解析;自有脚本读 `result.json` 时同样按此约定。 + +### 6.3 `frontier_type` 取值约束 + +SDK 仅接受以下 4 个字面量值: + +``` +"instance" | "objective" | "hybrid" | "cartesian" +``` + +其他取值(如 `"aggregate"` / `"mixed"`)会在 pydantic 层面直接 `ValidationError`,无法启动优化。配置前请确认拼写。 + +## 7 · 常见问题 + +**Q:为什么两次跑的 `best_pass_rate` 经常相同?** +A:GEPA 是 Pareto 优化算法,在简单任务 + 充足预算下两套策略最终常收敛到同一最优。差异往往体现在**到达路径**(轮次数、接受率、merge 行为)而非最终分数。这正是本 example 设计 `compare.py` 关注多维度而非单一 `best_pass_rate` 的原因。 + +**Q:advanced 接受了 4 轮但 baseline 只接受了 2 轮,是不是 advanced 更好?** +A:不一定。`objective` frontier 接受门槛低,可能"接受了一个 train 上更好但 val 上更差"的候选。需结合每轮的 `valset pass_rate` 趋势观察是否过拟合。 + +**Q:`compare.py` 输出 `merge_rounds_total=0` 但我开了 `use_merge=true`?** +A:单字段优化下符合预期。参见 §6.1。 + +**Q:怎么知道是哪一轮被接受的、是反思还是 merge?** +A:`result.json` 中 `rounds[*]` 数组每条记录都有 `accepted: true/false` 和 `kind: "reflective" | "merge"` 字段,可直接遍历查看。 + +**Q:advanced 配置里 `seed` 应该和 baseline 保持一致吗?** +A:保持一致便于对比时排除随机性影响。本 example 两份 JSON 都用同一 `seed`。 + +## 8 · 接入自有业务的步骤 + +1. **复制本 example 作为对照模板**:保留 `run_baseline.py` / `run_advanced.py` / `compare.py` 三脚本结构 +2. **替换业务 agent**:`agent/agent.py` 改为业务 agent 实现 +3. **设计两套配置 JSON**: + - `optimizer_baseline.json`:当前线上配置或默认配置 + - `optimizer_advanced.json`:希望验证的高阶组合 + - 二者保持 `seed` / `max_metric_calls` 一致以便公平对比 +4. 
**替换数据集**:业务 train / val +5. **跑两次 + compare**:根据对比表多维度评估高阶配置在业务任务上的实际收益 +6. **决策**:把对比表中表现明显更优的配置作为生产配置 + +> 高阶配置不是"越复杂越好"。许多任务上 baseline 配置已能达到合理收敛,advanced 只在特定任务结构(多目标、多字段、大规模训练集等)下显示价值。**用数据决定,不用直觉**。 diff --git a/examples/optimization/advanced_strategies/agent/__init__.py b/examples/optimization/advanced_strategies/agent/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/optimization/advanced_strategies/agent/agent.py b/examples/optimization/advanced_strategies/agent/agent.py new file mode 100644 index 00000000..90d8b8cc --- /dev/null +++ b/examples/optimization/advanced_strategies/agent/agent.py @@ -0,0 +1,134 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""地址解析 agent —— Advanced Strategies example 专用。 + +任务设计动机 +------------ +本 example 用于验证 GEPA 高阶策略组合(use_merge / frontier_type / +skip_perfect_score 等)的真实效果。任务必须存在两个**互相牵制**的维度, +才能逼出策略差异: + +A. 完整地址(country/city/postal_code/street 都给到)→ 期望严格 JSON +B. 
缺信息地址(少 postal_code 或 street)→ 期望对应字段输出 null + +候选 prompt 容易陷入两个局部最优: +- 候选 P1 学会"严格 JSON"但所有字段都不给 null(缺信息时硬编一个) +- 候选 P2 学会"该 null 就 null"但 JSON 格式偶尔崩 + +→ 多字段场景下 use_merge=true 能融合 P1/P2 各自掌握的子能力。 +→ frontier_type 选 instance vs objective 在这类任务上行为差异显著。 + +接入业务时改哪里 +---------------- +- 替换为业务任务 agent 与 prompt +- 保留 _normalize_json 让 metric 走 text exact,CI 上更稳 +""" + +from __future__ import annotations + +import json +import re +import uuid +from pathlib import Path + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import LLMModel +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import GenerateContentConfig +from trpc_agent_sdk.types import Part + +from .config import get_model_config + + +SYSTEM_PROMPT_PATH = Path(__file__).parent / "prompts" / "system.md" +APP_NAME = "advanced_strategies_demo" + +_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL) + + +def _create_model() -> LLMModel: + """构建 OpenAI 兼容 chat 模型实例。""" + api_key, base_url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url) + + +def _read_instruction() -> str: + """从磁盘重读 system.md。""" + return SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip() + + +def create_agent() -> LlmAgent: + """构建一个使用当前磁盘 prompt 的新 LlmAgent 实例。""" + return LlmAgent( + name="address_parser", + description="Parses free-text postal addresses into a strict JSON.", + model=_create_model(), + instruction=_read_instruction(), + generate_content_config=GenerateContentConfig( + temperature=0.1, + top_p=0.9, + max_output_tokens=256, + ), + ) + + +def _normalize_json(raw: str) -> str: + """把 LLM 输出规范化成稳定 JSON 字符串。 + + 与 ci_integration / blackbox_cli 完全相同的规范化逻辑:让 + final_response_avg_score(text.match=exact) 直接走精确匹配。 + """ + text = (raw or "").strip() + if not text: + return "" + match = 
_JSON_OBJECT_RE.search(text) + if not match: + return text + try: + parsed = json.loads(match.group(0)) + except json.JSONDecodeError: + return text + return json.dumps(parsed, sort_keys=True, ensure_ascii=False, separators=(",", ":")) + + +async def call_agent(query: str) -> str: + """框架回调:跑一次推理,输出经 _normalize_json 规范化。""" + root = create_agent() + session_service = InMemorySessionService() + runner = Runner( + app_name=APP_NAME, + agent=root, + session_service=session_service, + ) + session_id = str(uuid.uuid4()) + user_id = "parser" + await session_service.create_session( + app_name=APP_NAME, + user_id=user_id, + session_id=session_id, + state={}, + ) + user_content = Content(role="user", parts=[Part.from_text(text=query)]) + + final_text = "" + async for event in runner.run_async( + user_id=user_id, + session_id=session_id, + new_message=user_content, + ): + if not event.is_final_response(): + continue + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + if part.thought: + continue + if part.text: + final_text += part.text + return _normalize_json(final_text) diff --git a/examples/optimization/advanced_strategies/agent/config.py b/examples/optimization/advanced_strategies/agent/config.py new file mode 100644 index 00000000..d0a64b15 --- /dev/null +++ b/examples/optimization/advanced_strategies/agent/config.py @@ -0,0 +1,33 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""模型凭据读取 —— 从环境变量加载 OpenAI 兼容 LLM 的连接信息。 + +需要的环境变量 +-------------- + TRPC_AGENT_API_KEY LLM 后端的 API key + TRPC_AGENT_BASE_URL LLM 后端的 endpoint + TRPC_AGENT_MODEL_NAME 模型名 + +缺任意一个就立即抛 ValueError,避免运行到一半才撞到 LLM 后端的 401 错误, +那时报错信息会很有迷惑性(看起来像 prompt 写错了,实际是凭据没配)。 +""" + +from __future__ import annotations + +import os + + +def get_model_config() -> tuple[str, str, str]: + """返回 (api_key, base_url, model_name);任一缺失立刻报错。""" + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + if not api_key or not base_url or not model_name: + raise ValueError( + "运行优化器前必须配置环境变量 TRPC_AGENT_API_KEY / " + "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME。" + ) + return api_key, base_url, model_name diff --git a/examples/optimization/advanced_strategies/agent/prompts/system.md b/examples/optimization/advanced_strategies/agent/prompts/system.md new file mode 100644 index 00000000..f9b299bb --- /dev/null +++ b/examples/optimization/advanced_strategies/agent/prompts/system.md @@ -0,0 +1 @@ +You parse free-text postal addresses and return a JSON object. diff --git a/examples/optimization/advanced_strategies/compare.py b/examples/optimization/advanced_strategies/compare.py new file mode 100644 index 00000000..164b27bc --- /dev/null +++ b/examples/optimization/advanced_strategies/compare.py @@ -0,0 +1,100 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""跑完 baseline + advanced 后用本脚本对比 result.json,输出对比表。 + +适用场景 +-------- +高阶策略 A/B 对照实验的分析端。先跑 run_baseline.py + run_advanced.py, +再跑本脚本:自动选取 runs/ 下最新的 baseline_* 与 advanced_* 目录解析 +result.json,按多维度对比表输出。 + +result.json 字段命名注意 +------------------------ +SDK 内部 snake_case,但序列化到 result.json 时通过 alias_generator 转换为 +camelCase。本脚本按 camelCase 索引(stopReason / totalRounds / bestPassRate +等)。自有脚本读 result.json 时同样按此约定。 +""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +RUNS_DIR = _HERE / "runs" + + +def _latest(prefix: str) -> Path | None: + """挑出 runs/<prefix>_* 中最新一次的目录。""" + candidates = sorted(RUNS_DIR.glob(f"{prefix}_*")) + return candidates[-1] if candidates else None + + +def _load(run_dir: Path) -> dict: + """读 result.json;缺失返回空 dict。""" + rj = run_dir / "result.json" + if not rj.exists(): + return {} + return json.loads(rj.read_text(encoding="utf-8")) + + +def _short(d: dict) -> dict: + """从完整 result.json 中抽取本次对比关注的维度。 + + 维度选择原则:能直接反映高阶策略行为差异的字段(轮次接受率、merge + 触发次数、reflection LM 调用数等),而非仅最终 best_pass_rate + (高阶策略往往与 baseline 收敛到相近最终分数,差异在到达路径上)。 + """ + rounds = d.get("rounds") or [] + accepted = sum(1 for r in rounds if r.get("accepted")) + merge_total = sum(1 for r in rounds if r.get("kind") == "merge") + merge_accepted = sum(1 for r in rounds if r.get("kind") == "merge" and r.get("accepted")) + return { + "stop_reason": d.get("stopReason"), + "finish_reason": d.get("finishReason"), + "duration_s": round(d.get("durationSeconds") or 0.0, 1), + "total_rounds": d.get("totalRounds"), + "rounds_accepted": accepted, + "merge_rounds_total": merge_total, + "merge_rounds_accepted": merge_accepted, + "reflection_lm_calls": d.get("totalReflectionLmCalls"), + "baseline_pass_rate": d.get("baselinePassRate"), + "best_pass_rate": d.get("bestPassRate"), + } + + +def main() -> int: + """读两次最新 run,输出对比表。""" + base = _latest("baseline") + adv = _latest("advanced") + if base is None or adv is None: + 
print( + "Need both baseline_* and advanced_* runs in runs/. " + "Run run_baseline.py and run_advanced.py first.", + file=sys.stderr, + ) + return 1 + + print(f"baseline run : {base.name}") + print(f"advanced run : {adv.name}\n") + + a = _short(_load(base)) + b = _short(_load(adv)) + keys = list(a.keys()) + width = max(len(k) for k in keys) + 2 + print(f"{'metric'.ljust(width)}{'baseline'.rjust(18)}{'advanced'.rjust(18)}") + print("-" * (width + 36)) + for k in keys: + va = a.get(k) + vb = b.get(k) + print(f"{k.ljust(width)}{str(va).rjust(18)}{str(vb).rjust(18)}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/optimization/advanced_strategies/data/train.evalset.json b/examples/optimization/advanced_strategies/data/train.evalset.json new file mode 100644 index 00000000..d9c85f90 --- /dev/null +++ b/examples/optimization/advanced_strategies/data/train.evalset.json @@ -0,0 +1,73 @@ +{ + "eval_set_id": "advanced_strategies_train", + "name": "Address parsing - train", + "description": "6 case 混合:3 个完整地址(country/city/postal_code/street 全有)+ 3 个缺信息地址(部分字段需要输出 null)。GEPA 要同时学好这两类;多字段优化下这是 use_merge 的适用场景,本 example 为单字段优化,merge 实际不会触发(见 README §6.1)。", + "eval_cases": [ + { + "eval_id": "train_full_us", + "conversation": [ + { + "invocation_id": "t1", + "user_content": {"parts": [{"text": "1600 Amphitheatre Parkway, Mountain View, CA 94043, USA"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Mountain View\",\"country\":\"USA\",\"postal_code\":\"94043\",\"street\":\"1600 Amphitheatre Parkway\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "train_full_uk", + "conversation": [ + { + "invocation_id": "t2", + "user_content": {"parts": [{"text": "10 Downing Street, London SW1A 2AA, UK"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"London\",\"country\":\"UK\",\"postal_code\":\"SW1A 2AA\",\"street\":\"10 Downing Street\"}"}], "role": 
"model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "train_full_jp", + "conversation": [ + { + "invocation_id": "t3", + "user_content": {"parts": [{"text": "1-1 Chiyoda, Chiyoda City, Tokyo 100-8111, Japan"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Chiyoda City\",\"country\":\"Japan\",\"postal_code\":\"100-8111\",\"street\":\"1-1 Chiyoda\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "train_missing_postal", + "conversation": [ + { + "invocation_id": "t4", + "user_content": {"parts": [{"text": "Avenue des Champs-Élysées, Paris, France"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Paris\",\"country\":\"France\",\"postal_code\":null,\"street\":\"Avenue des Champs-Élysées\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "train_missing_street", + "conversation": [ + { + "invocation_id": "t5", + "user_content": {"parts": [{"text": "Berlin 10115, Germany"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Berlin\",\"country\":\"Germany\",\"postal_code\":\"10115\",\"street\":null}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "train_missing_both", + "conversation": [ + { + "invocation_id": "t6", + "user_content": {"parts": [{"text": "Sydney, Australia"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Sydney\",\"country\":\"Australia\",\"postal_code\":null,\"street\":null}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + } + ] +} diff --git a/examples/optimization/advanced_strategies/data/val.evalset.json 
b/examples/optimization/advanced_strategies/data/val.evalset.json new file mode 100644 index 00000000..b6c09c71 --- /dev/null +++ b/examples/optimization/advanced_strategies/data/val.evalset.json @@ -0,0 +1,73 @@ +{ + "eval_set_id": "advanced_strategies_val", + "name": "Address parsing - val", + "description": "6 case 验证集(>=5 满足 merge_val_overlap_floor 默认):3 完整 + 3 缺信息。schema 与 train 完全一致。", + "eval_cases": [ + { + "eval_id": "val_full_ca", + "conversation": [ + { + "invocation_id": "v1", + "user_content": {"parts": [{"text": "100 Queen Street West, Toronto, ON M5H 2N2, Canada"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Toronto\",\"country\":\"Canada\",\"postal_code\":\"M5H 2N2\",\"street\":\"100 Queen Street West\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "val_full_de", + "conversation": [ + { + "invocation_id": "v2", + "user_content": {"parts": [{"text": "Marienplatz 1, Munich 80331, Germany"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Munich\",\"country\":\"Germany\",\"postal_code\":\"80331\",\"street\":\"Marienplatz 1\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "val_full_sg", + "conversation": [ + { + "invocation_id": "v3", + "user_content": {"parts": [{"text": "1 Marina Boulevard, Singapore 018989, Singapore"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Singapore\",\"country\":\"Singapore\",\"postal_code\":\"018989\",\"street\":\"1 Marina Boulevard\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "val_missing_postal", + "conversation": [ + { + "invocation_id": "v4", + "user_content": {"parts": [{"text": "Plaza Mayor, Madrid, Spain"}], "role": "user"}, + 
"final_response": {"parts": [{"text": "{\"city\":\"Madrid\",\"country\":\"Spain\",\"postal_code\":null,\"street\":\"Plaza Mayor\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "val_missing_street", + "conversation": [ + { + "invocation_id": "v5", + "user_content": {"parts": [{"text": "Rome 00184, Italy"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Rome\",\"country\":\"Italy\",\"postal_code\":\"00184\",\"street\":null}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "val_missing_both", + "conversation": [ + { + "invocation_id": "v6", + "user_content": {"parts": [{"text": "Cairo, Egypt"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Cairo\",\"country\":\"Egypt\",\"postal_code\":null,\"street\":null}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + } + ] +} diff --git a/examples/optimization/advanced_strategies/optimizer_advanced.json b/examples/optimization/advanced_strategies/optimizer_advanced.json new file mode 100644 index 00000000..4081e594 --- /dev/null +++ b/examples/optimization/advanced_strategies/optimizer_advanced.json @@ -0,0 +1,48 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "exact", + "case_insensitive": false + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 4096, "temperature": 0.6 } + }, 
+ "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "frontier_type": "objective", + "reflection_minibatch_size": 3, + "reflection_history_top_k": 2, + "skip_perfect_score": true, + "perfect_score": 1.0, + "use_merge": true, + "max_merge_invocations": 3, + "merge_val_overlap_floor": 5, + "max_metric_calls": 60, + "score_threshold": 1.0, + "max_iterations_without_improvement": 4 + } + } +} diff --git a/examples/optimization/advanced_strategies/optimizer_baseline.json b/examples/optimization/advanced_strategies/optimizer_baseline.json new file mode 100644 index 00000000..e7571862 --- /dev/null +++ b/examples/optimization/advanced_strategies/optimizer_baseline.json @@ -0,0 +1,45 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "exact", + "case_insensitive": false + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 4096, "temperature": 0.6 } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "frontier_type": "instance", + "reflection_minibatch_size": 3, + "reflection_history_top_k": 2, + "skip_perfect_score": false, + "use_merge": false, + "max_metric_calls": 60, + "score_threshold": 1.0, + "max_iterations_without_improvement": 4 + } + } +} diff --git a/examples/optimization/advanced_strategies/run_advanced.py b/examples/optimization/advanced_strategies/run_advanced.py new file mode 100644 index 00000000..da32d195 --- /dev/null +++ b/examples/optimization/advanced_strategies/run_advanced.py @@ -0,0 +1,70 @@ +# Tencent is pleased to support the open source community by 
making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""配置 B:高阶策略组合 —— frontier_type=objective + skip_perfect_score=true + +use_merge=true。 + +适用场景 +-------- +高阶策略 A/B 对照实验的"高阶"运行。与 run_baseline.py 共用同一份数据集 +和 agent,仅 optimizer JSON 不同,便于在公平条件下观察策略差异。 + +预期与 baseline 的差异 +---------------------- +- 反思 LM 调用更省(满分 case 不再喂回反思 minibatch) +- objective frontier 接受门槛更低,rounds_accepted 更多但 valset 易震荡 +- 单字段优化下 use_merge=true 不会真触发 merge(gepa 是 predictor-level + merge,需要至少 2 个字段才有意义;详见 README §6.1) + +输出落到 runs/advanced_<时间戳>/,compare.py 自动选取最新一次对比。 +""" + +from __future__ import annotations + +import asyncio +import sys +from datetime import datetime +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt # noqa: E402 + +from agent.agent import SYSTEM_PROMPT_PATH, call_agent # noqa: E402 + + +CONFIG_PATH = _HERE / "optimizer_advanced.json" +TRAIN_PATH = _HERE / "data" / "train.evalset.json" +VAL_PATH = _HERE / "data" / "val.evalset.json" +RUNS_DIR = _HERE / "runs" + + +async def main() -> None: + """组装 TargetPrompt + 调 AgentOptimizer.optimize(用 advanced 配置)。""" + target = TargetPrompt().add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / f"advanced_{timestamp}" + + await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + update_source=False, + verbose=1, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/optimization/advanced_strategies/run_baseline.py 
b/examples/optimization/advanced_strategies/run_baseline.py new file mode 100644 index 00000000..39765a6f --- /dev/null +++ b/examples/optimization/advanced_strategies/run_baseline.py @@ -0,0 +1,65 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""配置 A:basic 策略组合 —— 与 quickstart 几乎一致,作为对照基线。 + +适用场景 +-------- +高阶策略 A/B 对照实验的基线运行。配合 run_advanced.py + compare.py 使用: +- 本脚本:basic 策略组合(pareto + instance + use_merge=false + + skip_perfect_score=false) +- run_advanced.py:高阶策略组合 +- compare.py:解析两次 result.json 输出对比表 + +输出落到 runs/baseline_<时间戳>/,compare.py 自动选取最新一次对比。 +""" + +from __future__ import annotations + +import asyncio +import sys +from datetime import datetime +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt # noqa: E402 + +from agent.agent import SYSTEM_PROMPT_PATH, call_agent # noqa: E402 + + +CONFIG_PATH = _HERE / "optimizer_baseline.json" +TRAIN_PATH = _HERE / "data" / "train.evalset.json" +VAL_PATH = _HERE / "data" / "val.evalset.json" +RUNS_DIR = _HERE / "runs" + + +async def main() -> None: + """组装 TargetPrompt + 调 AgentOptimizer.optimize(用 baseline 配置)。""" + target = TargetPrompt().add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / f"baseline_{timestamp}" + + await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + update_source=False, + verbose=1, + ) + + +if __name__ == "__main__": + 
asyncio.run(main()) diff --git a/examples/optimization/blackbox_cli/README.md b/examples/optimization/blackbox_cli/README.md new file mode 100644 index 00000000..0b54214b --- /dev/null +++ b/examples/optimization/blackbox_cli/README.md @@ -0,0 +1,205 @@ +# Blackbox CLI — 优化外部黑盒 CLI 的 prompt 文件 + +> **适用场景**:业务 agent 不是本框架的 `LlmAgent`,而是一个外部命令行工具(如 `claude` / `codex` / 自研 CLI),其行为由若干 prompt 文件(如 `CLAUDE.md` / `SKILL.md`)控制。本 example 演示通过 `subprocess` 把 CLI 当作完全黑盒的 agent,让 GEPA 优化它读取的 prompt 文件,整个过程不修改 CLI 代码、不绑定其内部 LLM client。阅读前请先熟悉 `quickstart/README.md` §2。 + +## 1 · 适用问题与设计目标 + +外部 CLI 工具的 prompt 工程特点: + +- 工具实现细节(语言、运行时、内部 LLM client)对优化器完全黑盒 +- prompt 通过特定文件名 / 目录结构约定加载(典型如 `CLAUDE.md` + `.claude/skills/<skill-name>/SKILL.md`) +- CLI 启动时是独立进程,与优化器进程通过 stdin / stdout / 文件系统通信 + +`AgentOptimizer` 在此场景下扮演纯客户端角色:通过 `subprocess` 启动 CLI 进程、把测试 query 作为参数传入、收集 stdout、按 metric 评分。优化器与 CLI 进程间的唯一耦合点是 **CLI 读取的 prompt 文件**——优化器写入新候选,CLI 在下一次启动时自动读取新内容。 + +| 输入 | 输出 | +| --- | --- | +| 一个支持"启动时读 prompt 文件"的外部 CLI 工具 | 满足 metric 阈值的最优 prompt 候选 | +| CLI 接受 query 作为参数 / stdin 的协议 | CLI 二进制完全不变,仅磁盘上 prompt 文件被改写 | + +### 本 example 演示的最小用例 + +| 维度 | 值 | +| --- | --- | +| 业务任务 | 中国城市信息查询(输入城市名,输出严格 JSON `{city, country, is_capital}`) | +| 黑盒 CLI | `trpc-claudecode`(腾讯内部 Claude Code 适配层,OpenAI 兼容协议指向 GLM-5.1) | +| 优化目标 | `workspace/CLAUDE.md` + `workspace/.claude/skills/city-info/SKILL.md` 共两个文件 | +| 验证指标 | `final_response_avg_score`(exact 匹配 stdout 规范化后的 JSON) | +| 训练 / 验证规模 | 5 条 / 3 条 | + +## 2 · 术语对照 + +仅列出本 example 引入的新概念。基础术语见 `quickstart/README.md` §2。 + +| 术语 | 含义 | +| --- | --- | +| **subprocess 调用** | 用 `asyncio.create_subprocess_exec` 启动子进程,传 query 作 argv,读 stdout。子进程独立进程,与优化器进程无任何资源共享。 | +| **CLI 工作目录(workspace)** | CLI 启动时通过 `--add-dir <dir>` 指定的目录,CLI 自动从中加载 prompt 文件。本 example 中即 `workspace/`。 | +| **stdout 规范化** | 用 `json.loads + json.dumps(sort_keys=True, ensure_ascii=False, separators=(",", ":"))` 把 LLM 自由文本输出转换为唯一字符串形态,使 metric 直接走文本精确匹配,无需 LLM judge。 | +| **环境变量映射** | 把通用的 
`TRPC_AGENT_*` 三件套映射成 CLI 期望的 `TRPC_CLAUDECODE_*` 三件套,避免用户为 CLI 单独配置 OAuth 或 API key。 | + +## 3 · 运行示例 + +### 3.1 依赖检查 + +```bash +which trpc-claudecode # 应输出可执行路径 +trpc-claudecode --version # 验证可正常启动 +``` + +CLI 二进制为外部依赖,本 example 不通过 pip 安装。其他自有 CLI 替换 `CLI_BINARY` 常量即可。 + +### 3.2 安装 SDK 可选依赖 + +```bash +pip install -e ".[optimize]" +``` + +### 3.3 配置环境变量 + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +`call_agent` 内部会自动把这三个变量映射成 `TRPC_CLAUDECODE_BASE_URL` / `TRPC_CLAUDECODE_API_KEY` / `TRPC_CLAUDECODE_MODEL`,并附加 GLM-5.1 推荐的 `CLAUDE_CODE_AUTO_COMPACT_WINDOW=165000` / `CLAUDE_AUTOCOMPACT_PCT_OVERRIDE=85`。 + +**无需 `trpc-claudecode auth login`,无需 `ANTHROPIC_API_KEY`**。 + +### 3.4 启动 + +```bash +python examples/optimization/blackbox_cli/run_optimization.py +``` + +### 3.5 产物结构 + +``` +runs// +├── result.json +├── summary.txt +├── baseline_prompts/ 运行前的 CLAUDE.md / SKILL.md 快照 +├── best_prompts/ val 集得分最高的候选 +└── rounds/ + +workspace/ CLI 工作目录(update_source=False 时自动回滚到 baseline) +├── CLAUDE.md +└── .claude/skills/city-info/SKILL.md +``` + +## 4 · 架构与数据流 + +``` +[run_optimization.py] + │ + ├── TargetPrompt + │ .add_path("claude_md", workspace/CLAUDE.md) + │ .add_path("skill_md", workspace/.claude/skills/city-info/SKILL.md) + │ │ GEPA 每轮把候选写入对应文件 + │ ▼ + │ workspace/{CLAUDE.md, .claude/skills/city-info/SKILL.md} + │ │ CLI 启动时通过 --add-dir 自动加载 + │ ▼ + └── call_agent(query): + ├── _build_cli_env() 映射 env 三件套 + ├── asyncio.create_subprocess_exec( + │ "trpc-claudecode", "--print", + │ "--add-dir", workspace/, + │ "--dangerously-skip-permissions", + │ query, + │ ) + ├── proc.communicate(timeout=90s) + └── _normalize_response(stdout) 紧凑 JSON 字符串 +``` + +### 4.1 文件清单 + +| 文件 | 角色 | 接入自有业务时的修改方向 | +| --- | --- | --- | +| `run_optimization.py` | 优化器入口,注册 `TargetPrompt` 两个文件 | 调整 `CLAUDE_MD_PATH` / `SKILL_MD_PATH` 至自有 CLI 期望的文件路径 | +| `agent/call_agent.py` | subprocess 调用 + env 映射 + stdout 规范化 | **核心改造点**:替换 
`CLI_BINARY` / 命令行参数 / env 映射规则 | +| `workspace/CLAUDE.md` | CLI 启动时读取的主 prompt(GEPA 写入目标) | 替换为业务 baseline 起点 | +| `workspace/.claude/skills/city-info/SKILL.md` | CLI 启动时读取的 skill 描述(GEPA 写入目标) | 单文件优化时整体删除并去掉 `add_path("skill_md", ...)` | +| `optimizer.json` | 算法 + metric 配置 | 调整阈值 / 停止条件 | +| `train.evalset.json` / `val.evalset.json` | 数据集 | 替换为业务用例(reference 字段需经过 `_normalize_response` 同等处理) | + +## 5 · 关键配置 + +### 5.1 推荐参数取值 + +```jsonc +{ + "optimize": { + "eval_case_parallelism": 1, // 黑盒 CLI 串行最稳;并发可能踩 CLI 进程并发问题 + "algorithm": { + "module_selector": "round_robin", + "frontier_type": "instance", // CLI 慢/贵,instance 收敛快不浪费调用 + "use_merge": false, // 避免 metric_calls 浪费在 merge 上 + "reflection_minibatch_size": 3, + "max_metric_calls": 24, // CLI 一次约 10s,24 次约 4 分钟 + "score_threshold": 1.0 + } + } +} +``` + +| 字段 | 选择理由 | +| --- | --- | +| `eval_case_parallelism=1` | CLI 子进程并发存在不确定性(共享文件锁、stdout 缓冲、子进程数上限),串行最稳 | +| `frontier_type=instance` | CLI 调用慢且贵,instance 前沿在小规模评估下收敛更快 | +| `use_merge=false` | merge 需要额外 metric calls;黑盒 CLI 场景下应集中预算在反思上 | +| `score_threshold=1.0` | 黑盒结构化输出的目标是完美匹配 | + +### 5.2 CLI 子进程超时 + +`agent/call_agent.py` 中 `CLI_TIMEOUT_SEC=90.0`:单次 CLI 调用超过 90 秒被强制 kill 并抛 `RuntimeError`,避免某次 CLI 卡死拖垮整轮评估。业务 CLI 平均耗时不同需相应调整。 + +## 6 · 设计要点 + +### 6.1 为什么不用 `--system-prompt` 注入 prompt + +CLI 通常支持 `--system-prompt ""` 一次性注入字符串。但本 example 使用 `--add-dir ` 让 CLI 自己从目录加载 prompt 文件,原因: + +- **支持多文件优化**:`CLAUDE.md` + `SKILL.md` 是 CLI 约定结构,多文件作为独立 `TargetPrompt` 字段才能让 GEPA 选择性改写其中之一 +- **与 CLI 原生工作流对齐**:业务真实使用 CLI 时也是把 prompt 写到工作目录、CLI 自动发现,本 example 路径与之一致 + +### 6.2 为什么 stdout 要做 `_normalize_response` + +LLM 输出常带尾部空格、JSON 前后多吐字符等噪音。`_normalize_response`: + +1. 用正则定位首个 `{...}` 块 +2. 
`json.loads` + `json.dumps(sort_keys=True, ensure_ascii=False, separators=(",", ":"))` 消除空格 / key 顺序差异 + +→ baseline 与候选 prompt 的输出对齐到唯一字符串形态,可直接走 `final_response_avg_score(text.match=exact)`,**评测层完全不需要 LLM judge**,CI 上快、稳、可重复。 + +### 6.3 subprocess 与 async 资源 + +子进程是独立 OS 进程,不与优化器进程共享 async 资源(事件循环、连接池等),是黑盒 CLI 模式的隐性优点:业务 CLI 的内部并发模型对 SDK 完全不可见也无需对齐。 + +## 7 · 常见问题 + +**Q:CLI 启动慢(每次几秒),怎么办?** +A:尽量调小 `max_metric_calls`、调大 `reflection_minibatch_size`(一次反思看更多 case 但少跑几轮)。彻底改造需将 CLI 改造为常驻服务,参考 `http_service/` example。 + +**Q:CLI 输出不是 JSON 怎么办?** +A:根据业务 metric 类型选择不同规范化策略。若 metric 是 `final_response_avg_score(text.match=contains)`,可直接 strip stdout;若需要严格匹配,按业务输出形态改写 `_normalize_response`。 + +**Q:CLI 进程意外退出(returncode != 0)会怎样?** +A:`_run_cli` 会抛 `RuntimeError` 携带 stderr 前 400 字符,异常传播到优化器,导致当前 case 评测失败、当前候选可能被拒绝。 + +**Q:`workspace/` 在被优化期间会不会被多个 CLI 进程并发读写?** +A:`eval_case_parallelism=1` 时不会。若强行调高并发,多个 CLI 实例可能同时读取被写入的 prompt 文件,导致评测结果不一致——这是设置 `eval_case_parallelism=1` 的根本原因。 + +**Q:跑完后想自动把 best 写回 `workspace/`?** +A:在 `run_optimization.py` 中将 `update_source=False` 改为 `True`。 + +## 8 · 接入自有 CLI 的步骤 + +1. **替换 `CLI_BINARY`**:`agent/call_agent.py` 中改为业务 CLI 可执行路径 +2. **调整命令行参数**:`_run_cli` 中的 argv 数组按业务 CLI 协议改造(argv 传 query / stdin 传 query / `--query xxx` 形式等) +3. **替换 env 映射**:`_build_cli_env` 改为业务 CLI 期望的环境变量(或如业务 CLI 已有 OAuth 流程,删除该映射并提示用户先完成登录) +4. **修改 `TargetPrompt`**:`run_optimization.py` 中调整 `add_path` 至业务 CLI 期望的 prompt 文件路径 +5. **替换 prompt baseline**:业务 baseline 内容写入对应文件 +6. **替换数据集**:`train.evalset.json` / `val.evalset.json`,注意 reference 字段需匹配 `_normalize_response` 处理后的形态 +7. **运行并观察**:根据 `summary.txt` 决定是否调参 diff --git a/examples/optimization/blackbox_cli/agent/__init__.py b/examples/optimization/blackbox_cli/agent/__init__.py new file mode 100644 index 00000000..f0f8bd1f --- /dev/null +++ b/examples/optimization/blackbox_cli/agent/__init__.py @@ -0,0 +1,10 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. 
+# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Black-box CLI agent: subprocess 调 trpc-claudecode 真实 CLI。""" + +from .call_agent import call_agent + +__all__ = ["call_agent"] diff --git a/examples/optimization/blackbox_cli/agent/call_agent.py b/examples/optimization/blackbox_cli/agent/call_agent.py new file mode 100644 index 00000000..ae34e57d --- /dev/null +++ b/examples/optimization/blackbox_cli/agent/call_agent.py @@ -0,0 +1,141 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Blackbox CLI 的 call_agent 实现:subprocess 调外部 CLI 进程。 + +适用场景 +-------- +当业务 agent 是外部命令行工具时,本文件作为优化器与 CLI 之间的适配层。 +SDK 不持有 CLI 的 LLM client / Runner,仅通过 subprocess 调用,整个优化 +流程与 CLI 内部实现完全解耦。 + +核心设计 +-------- +1. asyncio.create_subprocess_exec 启动子进程:query 作 argv 传入,避免 + shell 转义问题。子进程独立进程不受 SDK 内部事件循环约束影响。 +2. _build_cli_env 把通用 TRPC_AGENT_* 三件套映射成 CLI 期望的 + TRPC_CLAUDECODE_* 三件套,并附加 GLM-5.1 推荐的 auto-compact 阈值。 + 业务方无需为 CLI 单独配置 OAuth 或 ANTHROPIC_API_KEY。 +3. _normalize_response 用 json.dumps(sort_keys, separators) 把 LLM 自由 + 文本转换为唯一字符串形态,使 final_response_avg_score(text.match=exact) + 可直接走精确匹配,CI 上无需 LLM judge。 +4. 
# Directory handed to the CLI both as ``--add-dir`` and as its working directory.
WORKSPACE_DIR = Path(__file__).resolve().parent.parent / "workspace"
# Executable name of the external CLI; point this at your own CLI when adapting the example.
CLI_BINARY = "trpc-claudecode"
# Per-invocation timeout in seconds, so one hung CLI call cannot stall a whole evaluation round.
CLI_TIMEOUT_SEC = 90.0

# Reused decoder for pulling a JSON object out of free-form CLI output.
_JSON_DECODER = json.JSONDecoder()


def _build_cli_env() -> dict[str, str]:
    """Return the subprocess environment with TRPC_AGENT_* mirrored to TRPC_CLAUDECODE_*.

    The current process environment is copied, the three generic
    ``TRPC_AGENT_BASE_URL`` / ``TRPC_AGENT_API_KEY`` / ``TRPC_AGENT_MODEL_NAME``
    values are forwarded under the ``TRPC_CLAUDECODE_*`` names the CLI reads,
    and the two auto-compact settings are filled in only when the caller has
    not already set them (the original author describes these defaults as the
    values recommended for GLM-5.1).

    Returns:
        A new dict; ``os.environ`` itself is not modified.

    Raises:
        RuntimeError: If any of the three ``TRPC_AGENT_*`` variables is
            missing or empty.
    """
    env = dict(os.environ)
    base_url = env.get("TRPC_AGENT_BASE_URL")
    api_key = env.get("TRPC_AGENT_API_KEY")
    model_name = env.get("TRPC_AGENT_MODEL_NAME")
    if not (base_url and api_key and model_name):
        raise RuntimeError(
            "TRPC_AGENT_BASE_URL / TRPC_AGENT_API_KEY / TRPC_AGENT_MODEL_NAME "
            "must be set so they can be forwarded to trpc-claudecode."
        )
    env["TRPC_CLAUDECODE_BASE_URL"] = base_url
    env["TRPC_CLAUDECODE_API_KEY"] = api_key
    env["TRPC_CLAUDECODE_MODEL"] = model_name
    # setdefault: an explicit user override always wins over the example's defaults.
    env.setdefault("CLAUDE_CODE_AUTO_COMPACT_WINDOW", "165000")
    env.setdefault("CLAUDE_AUTOCOMPACT_PCT_OVERRIDE", "85")
    return env


def _normalize_response(raw: str) -> str:
    """Normalize CLI stdout into one canonical JSON string.

    The first substring that decodes as a JSON object is re-serialized with
    sorted keys, no whitespace and ``ensure_ascii=False``, so that an
    exact-match metric can compare outputs character by character.

    Scanning starts at every ``{`` in turn and stops at the first position
    that decodes, so text before or after the object (including further
    objects or stray braces) does not defeat the extraction.

    Args:
        raw: Raw stdout text; ``None`` and empty input are tolerated.

    Returns:
        ``""`` for empty input, the canonical JSON string when an object is
        found, otherwise the stripped input unchanged (which an exact-match
        metric will score as a miss).
    """
    text = (raw or "").strip()
    if not text:
        return ""
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete value starting at `start` and
            # ignores whatever follows it.
            parsed, _end = _JSON_DECODER.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
            continue
        return json.dumps(parsed, sort_keys=True, ensure_ascii=False, separators=(",", ":"))
    return text


async def _run_cli(query: str) -> str:
    """Run the CLI once with ``query`` as its last argv item and return stdout.

    The query is passed as an argv element (no shell), the workspace is used
    as both ``--add-dir`` and ``cwd``, and the call is bounded by
    ``CLI_TIMEOUT_SEC``.

    Args:
        query: The user query forwarded verbatim to the CLI.

    Returns:
        The CLI's stdout decoded as UTF-8 (undecodable bytes replaced).

    Raises:
        RuntimeError: On timeout, on a non-zero exit code, or (from
            ``_build_cli_env``) when the model environment is not configured.
    """
    cmd = [
        CLI_BINARY,
        "--print",
        "--add-dir",
        str(WORKSPACE_DIR),
        "--dangerously-skip-permissions",
        query,
    ]
    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        env=_build_cli_env(),
        cwd=str(WORKSPACE_DIR),
    )
    try:
        stdout_b, stderr_b = await asyncio.wait_for(
            proc.communicate(), timeout=CLI_TIMEOUT_SEC
        )
    except asyncio.TimeoutError as exc:
        raise RuntimeError(
            f"trpc-claudecode timed out after {CLI_TIMEOUT_SEC}s on query={query!r}"
        ) from exc
    finally:
        # The child is still alive here only on timeout or when this task was
        # cancelled; in both cases reap it so no CLI process is left behind.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # exited between the check and the kill
            await proc.wait()

    if proc.returncode != 0:
        raise RuntimeError(
            f"trpc-claudecode exited with code {proc.returncode}; "
            f"stderr={stderr_b.decode('utf-8', 'replace')[:400]}"
        )
    return stdout_b.decode("utf-8", "replace")


async def call_agent(query: str) -> str:
    """Optimizer callback: forward ``query`` to the CLI and return normalized output."""
    raw = await _run_cli(query)
    return _normalize_response(raw)
false + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 1, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 4096, "temperature": 0.6 } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "frontier_type": "instance", + "reflection_minibatch_size": 3, + "reflection_history_top_k": 2, + "skip_perfect_score": false, + "use_merge": false, + "max_metric_calls": 24, + "score_threshold": 1.0, + "max_iterations_without_improvement": 4 + } + } +} diff --git a/examples/optimization/blackbox_cli/run_optimization.py b/examples/optimization/blackbox_cli/run_optimization.py new file mode 100644 index 00000000..ed18c30f --- /dev/null +++ b/examples/optimization/blackbox_cli/run_optimization.py @@ -0,0 +1,89 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Blackbox CLI example 的优化器入口。 + +适用场景 +-------- +业务 agent 是外部命令行工具(trpc-claudecode / claude / codex / 自研 CLI), +其行为由若干 prompt 文件控制。本脚本演示通过 subprocess 把 CLI 当作完全 +黑盒的 agent,让 GEPA 优化它读取的 prompt 文件。 + +这个文件做什么 +-------------- +1. 注册 workspace/CLAUDE.md + workspace/.claude/skills/city-info/SKILL.md + 两个文件作为 TargetPrompt +2. call_agent 由 agent/call_agent.py 提供(subprocess 调用 CLI + stdout 规范化) +3. 
# Inputs and outputs of the optimization run, all resolved relative to this example directory.
CONFIG_PATH = _HERE / "optimizer.json"
TRAIN_PATH = _HERE / "train.evalset.json"
VAL_PATH = _HERE / "val.evalset.json"
RUNS_DIR = _HERE / "runs"
WORKSPACE = _HERE / "workspace"
# The two prompt files registered as optimization targets.
CLAUDE_MD_PATH = WORKSPACE / "CLAUDE.md"
SKILL_MD_PATH = WORKSPACE / ".claude" / "skills" / "city-info" / "SKILL.md"


async def main() -> None:
    """Register the two prompt files as a TargetPrompt and run AgentOptimizer.optimize.

    Results go to a fresh ``runs/<timestamp>/`` directory. ``update_source`` is
    False, so the source prompt files are not overwritten with the best
    candidate by this script.
    """
    # Both files live inside the workspace that call_agent passes to the CLI
    # via --add-dir; candidates written to them are expected to be picked up
    # by the next CLI subprocess launch.
    target = TargetPrompt()
    target = target.add_path("claude_md", str(CLAUDE_MD_PATH))
    target = target.add_path("skill_md", str(SKILL_MD_PATH))

    run_stamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
    output_dir = RUNS_DIR / run_stamp

    await AgentOptimizer.optimize(
        config_path=str(CONFIG_PATH),
        call_agent=call_agent,
        target_prompt=target,
        train_dataset_path=str(TRAIN_PATH),
        validation_dataset_path=str(VAL_PATH),
        output_dir=str(output_dir),
        update_source=False,
        verbose=1,
    )


if __name__ == "__main__":
    asyncio.run(main())
b/examples/optimization/blackbox_cli/train.evalset.json new file mode 100644 index 00000000..316fe892 --- /dev/null +++ b/examples/optimization/blackbox_cli/train.evalset.json @@ -0,0 +1,62 @@ +{ + "eval_set_id": "blackbox_cli_train", + "name": "Black-box CLI demo - train", + "description": "5 个中国城市,要求 trpc-claudecode 黑盒输出严格 JSON {city,country,is_capital}。final_response 已规范化为 sort_keys+ensure_ascii=False+无空格的 JSON 字符串,与 call_agent 输出格式精确对齐,便于走 final_response_avg_score 的 exact match。", + "eval_cases": [ + { + "eval_id": "city_beijing", + "conversation": [ + { + "invocation_id": "t1", + "user_content": {"parts": [{"text": "北京"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"北京\",\"country\":\"中国\",\"is_capital\":true}"}], "role": "model"} + } + ], + "session_input": {"app_name": "blackbox_cli_demo", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "city_shanghai", + "conversation": [ + { + "invocation_id": "t2", + "user_content": {"parts": [{"text": "上海"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"上海\",\"country\":\"中国\",\"is_capital\":false}"}], "role": "model"} + } + ], + "session_input": {"app_name": "blackbox_cli_demo", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "city_guangzhou", + "conversation": [ + { + "invocation_id": "t3", + "user_content": {"parts": [{"text": "广州"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"广州\",\"country\":\"中国\",\"is_capital\":false}"}], "role": "model"} + } + ], + "session_input": {"app_name": "blackbox_cli_demo", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "city_chengdu", + "conversation": [ + { + "invocation_id": "t4", + "user_content": {"parts": [{"text": "成都"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"成都\",\"country\":\"中国\",\"is_capital\":false}"}], "role": "model"} + } + ], + "session_input": {"app_name": "blackbox_cli_demo", "user_id": "trainer", "state": {}} + }, + { + "eval_id": 
"city_shenzhen", + "conversation": [ + { + "invocation_id": "t5", + "user_content": {"parts": [{"text": "深圳"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"深圳\",\"country\":\"中国\",\"is_capital\":false}"}], "role": "model"} + } + ], + "session_input": {"app_name": "blackbox_cli_demo", "user_id": "trainer", "state": {}} + } + ] +} diff --git a/examples/optimization/blackbox_cli/val.evalset.json b/examples/optimization/blackbox_cli/val.evalset.json new file mode 100644 index 00000000..25b52018 --- /dev/null +++ b/examples/optimization/blackbox_cli/val.evalset.json @@ -0,0 +1,40 @@ +{ + "eval_set_id": "blackbox_cli_val", + "name": "Black-box CLI demo - val", + "description": "3 个中国城市的留出验证集。", + "eval_cases": [ + { + "eval_id": "city_hangzhou", + "conversation": [ + { + "invocation_id": "v1", + "user_content": {"parts": [{"text": "杭州"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"杭州\",\"country\":\"中国\",\"is_capital\":false}"}], "role": "model"} + } + ], + "session_input": {"app_name": "blackbox_cli_demo", "user_id": "validator", "state": {}} + }, + { + "eval_id": "city_wuhan", + "conversation": [ + { + "invocation_id": "v2", + "user_content": {"parts": [{"text": "武汉"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"武汉\",\"country\":\"中国\",\"is_capital\":false}"}], "role": "model"} + } + ], + "session_input": {"app_name": "blackbox_cli_demo", "user_id": "validator", "state": {}} + }, + { + "eval_id": "city_xian", + "conversation": [ + { + "invocation_id": "v3", + "user_content": {"parts": [{"text": "西安"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"西安\",\"country\":\"中国\",\"is_capital\":false}"}], "role": "model"} + } + ], + "session_input": {"app_name": "blackbox_cli_demo", "user_id": "validator", "state": {}} + } + ] +} diff --git a/examples/optimization/blackbox_cli/workspace/.claude/skills/city-info/SKILL.md 
b/examples/optimization/blackbox_cli/workspace/.claude/skills/city-info/SKILL.md new file mode 100644 index 00000000..635a5f93 --- /dev/null +++ b/examples/optimization/blackbox_cli/workspace/.claude/skills/city-info/SKILL.md @@ -0,0 +1,6 @@ +--- +name: city-info +description: 城市信息 +--- + +回答关于城市的问题。 diff --git a/examples/optimization/blackbox_cli/workspace/CLAUDE.md b/examples/optimization/blackbox_cli/workspace/CLAUDE.md new file mode 100644 index 00000000..938b107b --- /dev/null +++ b/examples/optimization/blackbox_cli/workspace/CLAUDE.md @@ -0,0 +1 @@ +你是助手。回答用户问题。 diff --git a/examples/optimization/ci_integration/README.md b/examples/optimization/ci_integration/README.md new file mode 100644 index 00000000..e9ecfc75 --- /dev/null +++ b/examples/optimization/ci_integration/README.md @@ -0,0 +1,243 @@ +# CI Integration — 评测与优化拼成 CI/CD 闭环 + +> **适用场景**:业务希望在持续集成流水线中同时运行 prompt 质量守门(每次 PR 触发)与 prompt 自动优化(夜间窗口运行),形成"PR 守门 → 夜间优化 → 写回 prompt → 下一次 PR 跑新 prompt"的演进闭环。本 example 演示 `AgentEvaluator.evaluate`(pytest)与 `AgentOptimizer.optimize` 共享同一份数据集、同一个 `call_agent`、同一对 prompt 文件的端到端集成方式。阅读前请先熟悉 `quickstart/README.md` §2。 + +## 1 · 适用问题与设计目标 + +prompt 工程在工程化场景下的两类需求: + +- **PR 守门**:每次 PR 触发自动跑评估,分数低于阈值即 CI 红灯,阻止劣化 prompt 进主干 +- **夜间优化**:在低峰期跑反思优化,把更优 prompt 写回源文件,下一次 PR 自动用上 + +单独使用任一链路都不足:纯守门不会让 prompt 自动变好,纯优化没有质量门禁。本 example 把两者集成到同一份资产之上: + +- **同一份 evalset**:物理上拆 train / val(SDK 强制约束,防泄漏),逻辑上是一套连续语料 +- **同一个 `call_agent`**:pytest 与 optimizer 都从 `agent/agent.py` 导入相同实现,prompt 改动一处生效 +- **同一对 prompt 文件**:optimizer 用 `update_source=True` 写回源文件,pytest 下次自动读取 + +| 输入 | 输出 | +| --- | --- | +| 一份 evalset(拆为 train / val 两文件)+ 一个 call_agent + 一对 prompt 文件 | PR 阶段:pytest 红 / 绿 + JUnit XML | +| 两个 shell 入口(PR 检查 + 夜间优化) | 夜间阶段:源 prompt 文件被最优候选覆盖 | + +### 本 example 演示的最小用例 + +| 维度 | 值 | +| --- | --- | +| 业务任务 | RESTful API 描述 → 严格 JSON 结构化摘要 | +| 优化目标 | `agent/prompts/system.md` + `agent/prompts/skill.md` | +| 验证指标 | `final_response_avg_score`(exact 匹配规范化 JSON,CI 上无需 LLM 
judge) | +| 训练 / 验证规模 | 见 `data/train.evalset.json` / `data/val.evalset.json` | + +## 2 · 术语对照 + +仅列出本 example 引入的新概念。基础术语见 `quickstart/README.md` §2。 + +| 术语 | 含义 | +| --- | --- | +| **PR 守门(pre-merge gate)** | 在 PR 触发的 CI 流程中跑 `AgentEvaluator.evaluate`,分数低于阈值时 pytest 抛 `AssertionError`、CI exit code 非 0、合并被阻止。 | +| **夜间优化(nightly optimize)** | 在 CI 低峰窗口跑 `AgentOptimizer.optimize`,`update_source=True` 时优化结束后最优候选自动覆盖源 prompt 文件。 | +| **`update_source=True`** | 优化成功(`OptimizeResult.status=SUCCEEDED`)后用最优候选覆盖 `TargetPrompt` 注册的源文件。CI 闭环的关键开关。 | +| **JUnit XML** | pytest `--junitxml=` 输出的标准化测试报告格式。GitHub Actions / 蓝盾流水线 / Tencent CI 等主流平台均原生解析。 | + +## 3 · 运行示例 + +### 3.1 安装依赖 + +```bash +pip install -e ".[optimize]" +pip install pytest pytest-asyncio +``` + +### 3.2 配置环境变量 + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +### 3.3 PR 阶段:pytest 守门 + +```bash +cd examples/optimization/ci_integration +PYTHONPATH=../../.. bash ci/run_pr_check.sh +``` + +行为: + +- pytest 加载 `tests/test_agent_quality.py` → 调 `AgentEvaluator.evaluate(call_agent, val.evalset.json, ...)` +- 失败时框架抛 `AssertionError` → pytest exit code != 0 → CI 红灯 +- JUnit XML 落到 `runs/pytest_report.xml`,CI 平台原生展示 + +### 3.4 夜间窗口:跑优化并写回 + +```bash +cd examples/optimization/ci_integration +PYTHONPATH=../../.. 
bash ci/run_nightly_optimize.sh +``` + +行为: + +- `AgentOptimizer.optimize(update_source=True)` 跑 GEPA 反思 +- 优化成功后最优候选覆盖 `agent/prompts/system.md` + `agent/prompts/skill.md` +- 真实流水线在末尾可加 `git diff agent/prompts/` + 自动开 PR + +下一次 PR 触发的 `run_pr_check.sh` 自动用上新 prompt → 闭环达成。 + +### 3.5 产物结构 + +``` +runs/ +├── pytest_eval/ # AgentEvaluator 输出(pytest 阶段) +├── pytest_report.xml # JUnit XML(CI 平台原生消费) +└── optimize_/ # AgentOptimizer 输出(夜间阶段) + ├── result.json + ├── summary.txt + ├── baseline_prompts/ + ├── best_prompts/ + └── rounds/ +``` + +## 4 · 架构与数据流 + +``` + ┌────────────────────────────────┐ + │ agent/ │ + │ ├── agent.py (call_agent) │ + │ └── prompts/ │ + │ ├── system.md │ + │ └── skill.md │ + └─────────────┬───────────────────┘ + │ 共享 + ┌─────────────────────────┴──────────────────────────┐ + │ │ + ┌────▼────────────┐ ┌────────────▼─────┐ + │ AgentEvaluator │ │ AgentOptimizer │ + │ .evaluate() │ │ .optimize() │ + │ │ │ │ + │ 触发: PR │ │ 触发: 夜间窗口 │ + │ 数据: val.json │ │ 数据: train + val │ + │ 产出: 红/绿 │ │ 产出: 写回 prompt │ + │ 退出码: 守门 │ │ update_source=True │ + └──────────────────┘ └────────────────────┘ + │ + └─→ 共享同一份 data/ + 同一份 metric 定义 +``` + +### 4.1 文件清单 + +| 文件 | 角色 | 接入自有业务时的修改方向 | +| --- | --- | --- | +| `run_optimization.py` | 夜间优化入口,`update_source=True` | 与 quickstart 同;保持 `update_source=True` | +| `agent/agent.py` | 共享 `call_agent`(被 pytest + optimizer 同时调用) | 替换为业务 agent;保持函数命名 `call_agent` | +| `agent/prompts/{system,skill}.md` | 优化目标 + 守门读取的 prompt | 写入业务 baseline | +| `tests/test_agent_quality.py` | pytest 守门入口 | 调整 `agent_name` / 数据路径;测试方法保持不变 | +| `optimizer.json` | 算法 + metric 配置 | 与单 example 写法一致 | +| `data/train.evalset.json` / `data/val.evalset.json` | 训练 / 验证集 | 替换为业务用例 | +| `ci/run_pr_check.sh` | PR 阶段 shell 入口 | 调整 `pytest` 参数 / `--junitxml` 路径 | +| `ci/run_nightly_optimize.sh` | 夜间阶段 shell 入口 | 调整调用频率 / 失败回滚策略 | + +### 4.2 train / val 拆分的强制约束 + +SDK `_validate_inputs` 会校验 `train_dataset_path != validation_dataset_path` 
防止数据泄漏。物理上必须是两个文件,但二者: + +- schema 完全一致(同一套 `EvalSet` 模型) +- `eval_set_id` 命名族可共享(如 `api_summarizer.train` / `api_summarizer.val`) +- metric 定义统一在 `optimizer.json` 中(pytest 走 `test_config.json` 同目录约定时也是一致 schema) + +逻辑上仍是同一套数据语言。 + +## 5 · 关键配置 + +### 5.1 `update_source=True` 的语义 + +```python +await AgentOptimizer.optimize( + ..., + update_source=True, # 优化成功后覆盖源 prompt 文件 +) +``` + +| 状态 | 行为 | +| --- | --- | +| `OptimizeResult.status=SUCCEEDED` | 最优候选写入 `TargetPrompt` 注册的源文件 | +| `status=FAILED` / `BUDGET_EXHAUSTED` 等其他 | 源文件保持不变;候选只在 `runs//best_prompts/` | + +CI 闭环依赖该开关:只有"优化真的找到了更好的 prompt"才会写回;否则保持现状不污染主干。 + +### 5.2 CI 上不依赖 LLM judge + +`agent/agent.py` 中 `_normalize_json` 把 LLM 输出规范化成稳定 JSON 字符串: + +```python +json.dumps(parsed, sort_keys=True, ensure_ascii=False, separators=(",", ":")) +``` + +→ `final_response_avg_score(text.match=exact)` 可直接逐字符比对,**CI 上完全不需要 LLM judge**: + +- **快**:单 case 一次评测仅一次 agent LLM 调用 +- **稳**:同一 prompt 同一 case 输出确定(temperature=0.1) +- **可重复**:CI 多次跑结果一致 + +LLM judge 在主观维度评估上不可替代,但在结构化输出场景下应优先选择 text exact + 规范化方案。 + +### 5.3 失败 case 的可观测性 + +pytest 阶段失败时框架抛 `AssertionError`,错误消息包含每条 case 的失败明细 JSON。CI 平台展示 stack trace 时可直接看到具体哪条 case 失败、agent 实际输出是什么、与 expected 的差异在哪。无需额外日志解析逻辑。 + +## 6 · CI/CD 闭环设计要点 + +### 6.1 为什么 evaluate 与 optimize 共享 call_agent + +prompt 工程的核心约束:**评测时使用的 agent 和优化时使用的 agent 必须等价**,否则优化方向与守门方向不一致,会出现"优化器找到了 evaluator 验证不了的好 prompt"或反向问题。 + +通过共享 `agent/agent.py` 中的 `call_agent` 实现,从代码层面保证等价性。任何 agent 行为改动(模型切换、temperature 调整、output schema 变化)只需改一处,pytest 与 optimizer 同时生效。 + +### 6.2 为什么夜间窗口跑而不实时优化 + +- LLM 调用预算有限,反思优化耗时数分钟到数十分钟,不适合 PR 触发 +- 优化结果具有方差性,需在低峰期反复多轮验证后再发版 +- 写回 prompt 文件应经过 git diff / 人工 review / 自动开 PR 等流程而非直接进主干 + +### 6.3 何时考虑灰度发布 + +`update_source=True` 直接覆盖源文件适合: + +- 团队规模小,PR review 流程已能拦截不合理改动 +- prompt 改动影响面可控(单 agent / 单业务) + +不适合: + +- 多业务线共享同一份 prompt 仓库 +- 改动后需灰度观察线上 metric 变化 + +后者建议改为 `update_source=False` + 把 `runs//best_prompts/` 接入业务自有的灰度发布工具。 + +## 7 · 常见问题 + +**Q:能否在同一个 CI job 
中先跑评估再跑优化?** +A:技术上可行,但不推荐。评估应快速给反馈(< 1 min),优化耗时长(> 5 min)。两者拆成独立 job / 独立 trigger 更符合工程实践。 + +**Q:CI 如何识别"优化没改善"该如何回退?** +A:`run_nightly_optimize.sh` 末尾建议加 `git diff --quiet agent/prompts/` 判断是否有改动;无改动直接退出。如果改动质量后续被发现退化,由 PR review 拒绝合并即可——`update_source=True` 的写回不直接进主干,仍走标准 PR 流程。 + +**Q:pytest 与 optimizer 用的 metric 配置不同会怎样?** +A:会出现"评测能过但优化器看到的分数低"或反向问题。本 example 通过让 pytest 走 `AgentEvaluator.evaluate(test_config_path=...)`、optimizer 走 `optimizer.json.evaluate.metrics`、二者使用相同 schema 来避免漂移。生产中建议把 metric 配置抽成一份共享 JSON,两边引用。 + +**Q:reflection LM 失败重试预算?** +A:`optimizer.json` 中 `algorithm.max_iterations_without_improvement` 控制无改善早停;reflection LM 单次调用失败由 SDK 内部重试 1–2 次。CI 场景建议把 `max_metric_calls` 调到合理上限避免单次跑爆预算。 + +**Q:JUnit XML 中能看到具体失败 case 吗?** +A:能。pytest 把 `AssertionError` 消息原样写入 XML,CI 平台展示时可直接看到失败明细 JSON。 + +## 8 · 接入自有 CI 的步骤 + +1. **整理 evalset**:拆 `train.evalset.json` / `val.evalset.json` 两文件 +2. **定义 metric**:在 `optimizer.json` 与 pytest 测试中使用同一 schema 的 metric 配置 +3. **实现共享 call_agent**:`agent/agent.py` 写一份 `call_agent`,pytest 与 optimizer 都从此处导入 +4. **设置 `update_source=True`**:夜间优化入口的关键开关 +5. **配置 CI 流水线**: + - PR 触发 `bash ci/run_pr_check.sh`,解析 `runs/pytest_report.xml` + - 夜间触发 `bash ci/run_nightly_optimize.sh`,末尾加 git diff + 自动开 PR +6. **观察首轮闭环**:从 baseline pytest 红 → 夜间优化 → PR 自动开 → review → 合并 → 下一次 PR 绿 diff --git a/examples/optimization/ci_integration/agent/__init__.py b/examples/optimization/ci_integration/agent/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/optimization/ci_integration/agent/agent.py b/examples/optimization/ci_integration/agent/agent.py new file mode 100644 index 00000000..048ced1a --- /dev/null +++ b/examples/optimization/ci_integration/agent/agent.py @@ -0,0 +1,156 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
# ---- examples/optimization/ci_integration/agent/agent.py ----

# Prompt files registered as optimization targets and read on every agent build.
SYSTEM_PROMPT_PATH = Path(__file__).parent / "prompts" / "system.md"
SKILL_PATH = Path(__file__).parent / "prompts" / "skill.md"

APP_NAME = "ci_integration_demo"

# Reused decoder for pulling a JSON object out of free-form model output.
_JSON_DECODER = json.JSONDecoder()


def _create_model() -> LLMModel:
    """Build the OpenAI-compatible chat model from the configured credentials."""
    api_key, base_url, model_name = get_model_config()
    return OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url)


def _read_instruction() -> str:
    """Assemble the full instruction from the two prompt files.

    The files are re-read from disk on every call, so a prompt written to
    disk (for example by the optimizer) is used by the next agent that is
    built.
    """
    system = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip()
    skill = SKILL_PATH.read_text(encoding="utf-8").strip()
    return f"{system}\n\n## How to write the summary\n{skill}"


def create_agent() -> LlmAgent:
    """Build a new LlmAgent that uses the prompts currently on disk."""
    return LlmAgent(
        name="api_summarizer",
        description="Summarizes a RESTful API description into a strict JSON.",
        model=_create_model(),
        instruction=_read_instruction(),
        generate_content_config=GenerateContentConfig(
            temperature=0.1,
            top_p=0.9,
            max_output_tokens=512,
        ),
    )


# Kept for the ``agent_module`` loading convention, which looks up ``root_agent``.
# NOTE(review): this runs at import time, so importing the module requires the
# three TRPC_AGENT_* variables (get_model_config raises ValueError otherwise)
# and this instance keeps the prompt text that was on disk at import.
# call_agent below does not use it; it builds a fresh agent per call.
root_agent = create_agent()


def _normalize_json(raw: str) -> str:
    """Normalize model output into one canonical JSON string.

    The first substring that decodes as a JSON object is re-serialized with
    sorted keys, no whitespace and ``ensure_ascii=False``, so that an
    exact-match metric can compare outputs character by character without an
    LLM judge.

    Scanning starts at every ``{`` in turn and stops at the first position
    that decodes, so text before or after the object (including further
    objects or stray braces) does not defeat the extraction.

    Args:
        raw: Raw model text; ``None`` and empty input are tolerated.

    Returns:
        ``""`` for empty input, the canonical JSON string when an object is
        found, otherwise the stripped input unchanged (which an exact-match
        metric will score as a miss).
    """
    text = (raw or "").strip()
    if not text:
        return ""
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete value starting at `start` and
            # ignores whatever follows it.
            parsed, _end = _JSON_DECODER.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
            continue
        return json.dumps(parsed, sort_keys=True, ensure_ascii=False, separators=(",", ":"))
    return text


async def call_agent(query: str) -> str:
    """Black-box agent entry point shared by evaluation and optimization.

    A new agent, Runner and InMemorySessionService are created for every
    call, so each case runs in its own session.

    Args:
        query: The user message for this case.

    Returns:
        The non-thought text of the final response event(s), concatenated
        and passed through ``_normalize_json``.
    """
    root = create_agent()
    session_service = InMemorySessionService()
    runner = Runner(
        app_name=APP_NAME,
        agent=root,
        session_service=session_service,
    )
    session_id = str(uuid.uuid4())
    user_id = "ci"
    await session_service.create_session(
        app_name=APP_NAME,
        user_id=user_id,
        session_id=session_id,
        state={},
    )
    user_content = Content(role="user", parts=[Part.from_text(text=query)])

    chunks: list[str] = []
    async for event in runner.run_async(
        user_id=user_id,
        session_id=session_id,
        new_message=user_content,
    ):
        if not event.is_final_response():
            continue
        if not event.content or not event.content.parts:
            continue
        # Skip thought parts and empty parts; keep only visible answer text.
        chunks.extend(
            part.text
            for part in event.content.parts
            if not part.thought and part.text
        )
    return _normalize_json("".join(chunks))


# ---- examples/optimization/ci_integration/agent/config.py ----
# (separate module in the patch; agent.py imports get_model_config from it)


def get_model_config() -> tuple[str, str, str]:
    """Return ``(api_key, base_url, model_name)`` read from the environment.

    Reads ``TRPC_AGENT_API_KEY``, ``TRPC_AGENT_BASE_URL`` and
    ``TRPC_AGENT_MODEL_NAME``.

    Raises:
        ValueError: If any of the three variables is unset or empty, so the
            misconfiguration surfaces before any model call is attempted.
    """
    api_key = os.getenv("TRPC_AGENT_API_KEY", "")
    base_url = os.getenv("TRPC_AGENT_BASE_URL", "")
    model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "")
    if not (api_key and base_url and model_name):
        raise ValueError(
            "运行优化器前必须配置环境变量 TRPC_AGENT_API_KEY / "
            "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME。"
        )
    return api_key, base_url, model_name
diff --git a/examples/optimization/ci_integration/agent/prompts/system.md b/examples/optimization/ci_integration/agent/prompts/system.md new file mode 100644 index 00000000..ffc8e8f9 --- /dev/null +++ b/examples/optimization/ci_integration/agent/prompts/system.md @@ -0,0 +1,10 @@ +You are an API summarizer. Output a single JSON object describing the API. + +The JSON must have exactly these four keys and no others: "auth", "method", "path", "summary". + +- "auth": use the string "none" if no authentication is needed, or "required" if authentication is needed. Do not use any other value. +- "method": the HTTP method as a string (e.g., "GET", "POST", "PUT", "DELETE"). +- "path": the endpoint path string (e.g., "/users/{id}"). +- "summary": a short imperative verb phrase (e.g., "Get user profile", "Cancel order", "Login with credentials"). Do not include articles or extra words. + +Output compact JSON with no whitespace after colons or commas, and no trailing newline. Output nothing except the JSON object. \ No newline at end of file diff --git a/examples/optimization/ci_integration/ci/run_nightly_optimize.sh b/examples/optimization/ci_integration/ci/run_nightly_optimize.sh new file mode 100755 index 00000000..d9aeac4a --- /dev/null +++ b/examples/optimization/ci_integration/ci/run_nightly_optimize.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# 夜间窗口:跑 GEPA 优化,把最优 prompt 直接写回源文件。 +# 真实 CI 里通常会在末尾追加 `git diff` 看是否有改动,再开 PR。 +set -euo pipefail +cd "$(dirname "$0")/.." 
+ +PY="${PYTHON:-python3}" +"$PY" run_optimization.py + +echo "" +echo "=== Optimization done ===" +echo "Best prompts have been written back to:" +echo " agent/prompts/system.md" +echo " agent/prompts/skill.md" +echo "" +echo "Next steps for a real CI pipeline:" +echo " git diff agent/prompts/ # see what GEPA changed" +echo " git checkout -b auto/optimize-\$(date +%Y%m%d)" +echo " git add agent/prompts/ && git commit -m 'auto: optimize prompts'" +echo " # then open a PR; PR check (run_pr_check.sh) re-validates the new prompts." diff --git a/examples/optimization/ci_integration/ci/run_pr_check.sh b/examples/optimization/ci_integration/ci/run_pr_check.sh new file mode 100755 index 00000000..5087449d --- /dev/null +++ b/examples/optimization/ci_integration/ci/run_pr_check.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# PR 阶段:跑 pytest 守护质量底线。 +# - 失败时 pytest exit code != 0 → CI 红灯 +# - --junitxml 产出 GitHub Actions / Tencent CI / 蓝盾流水线均原生支持的报告 +set -euo pipefail +cd "$(dirname "$0")/.." + +mkdir -p runs + +PY="${PYTHON:-python3}" +exec "$PY" -m pytest tests/ -v --tb=short \ + --junitxml=runs/pytest_report.xml diff --git a/examples/optimization/ci_integration/data/test_config.json b/examples/optimization/ci_integration/data/test_config.json new file mode 100644 index 00000000..0ac840fd --- /dev/null +++ b/examples/optimization/ci_integration/data/test_config.json @@ -0,0 +1,14 @@ +{ + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": {"match": "exact", "case_insensitive": false} + } + } + } + ], + "num_runs": 1 +} diff --git a/examples/optimization/ci_integration/data/train.evalset.json b/examples/optimization/ci_integration/data/train.evalset.json new file mode 100644 index 00000000..0257d835 --- /dev/null +++ b/examples/optimization/ci_integration/data/train.evalset.json @@ -0,0 +1,40 @@ +{ + "eval_set_id": "ci_integration_train", + "name": "CI Integration - train", + "description": "Optimize 阶段反思 
minibatch 来源。schema 与 val.evalset.json 完全一致;与 evaluate 阶段共用同一个 test_config.json(同目录约定)。", + "eval_cases": [ + { + "eval_id": "train_get_user", + "conversation": [ + { + "invocation_id": "t1", + "user_content": {"parts": [{"text": "GET /users/{id} returns user profile, requires Bearer token."}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"auth\":\"required\",\"method\":\"GET\",\"path\":\"/users/{id}\",\"summary\":\"Get user profile\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "ci_integration_demo", "user_id": "ci", "state": {}} + }, + { + "eval_id": "train_post_login", + "conversation": [ + { + "invocation_id": "t2", + "user_content": {"parts": [{"text": "POST /auth/login accepts username and password, no auth header needed."}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"auth\":\"none\",\"method\":\"POST\",\"path\":\"/auth/login\",\"summary\":\"Login with credentials\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "ci_integration_demo", "user_id": "ci", "state": {}} + }, + { + "eval_id": "train_delete_order", + "conversation": [ + { + "invocation_id": "t3", + "user_content": {"parts": [{"text": "DELETE /orders/{id} cancels an order, Bearer token required."}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"auth\":\"required\",\"method\":\"DELETE\",\"path\":\"/orders/{id}\",\"summary\":\"Cancel order\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "ci_integration_demo", "user_id": "ci", "state": {}} + } + ] +} diff --git a/examples/optimization/ci_integration/data/val.evalset.json b/examples/optimization/ci_integration/data/val.evalset.json new file mode 100644 index 00000000..8a65b190 --- /dev/null +++ b/examples/optimization/ci_integration/data/val.evalset.json @@ -0,0 +1,40 @@ +{ + "eval_set_id": "ci_integration_val", + "name": "CI Integration - val", + "description": "PR 阶段 pytest 守门用例 + Optimize 阶段每轮验证用例(双用)。物理上是独立文件(SDK 校验 train != val 防泄漏),逻辑上属于同一套数据资产 → CI 
流水线的「质量底线」就是 optimize 的「优化目标」。", + "eval_cases": [ + { + "eval_id": "val_put_profile", + "conversation": [ + { + "invocation_id": "v1", + "user_content": {"parts": [{"text": "PUT /users/{id}/profile updates the profile, requires Bearer token."}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"auth\":\"required\",\"method\":\"PUT\",\"path\":\"/users/{id}/profile\",\"summary\":\"Update user profile\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "ci_integration_demo", "user_id": "ci", "state": {}} + }, + { + "eval_id": "val_get_health", + "conversation": [ + { + "invocation_id": "v2", + "user_content": {"parts": [{"text": "GET /health returns service health, public endpoint."}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"auth\":\"none\",\"method\":\"GET\",\"path\":\"/health\",\"summary\":\"Service health check\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "ci_integration_demo", "user_id": "ci", "state": {}} + }, + { + "eval_id": "val_post_register", + "conversation": [ + { + "invocation_id": "v3", + "user_content": {"parts": [{"text": "POST /auth/register accepts new user info, no auth required."}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"auth\":\"none\",\"method\":\"POST\",\"path\":\"/auth/register\",\"summary\":\"Register new user\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "ci_integration_demo", "user_id": "ci", "state": {}} + } + ] +} diff --git a/examples/optimization/ci_integration/optimizer.json b/examples/optimization/ci_integration/optimizer.json new file mode 100644 index 00000000..93471d47 --- /dev/null +++ b/examples/optimization/ci_integration/optimizer.json @@ -0,0 +1,45 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "exact", + "case_insensitive": false + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + 
"eval_case_parallelism": 1, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 4096, "temperature": 0.6 } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "frontier_type": "instance", + "reflection_minibatch_size": 3, + "reflection_history_top_k": 2, + "skip_perfect_score": false, + "use_merge": false, + "max_metric_calls": 24, + "score_threshold": 1.0, + "max_iterations_without_improvement": 3 + } + } +} diff --git a/examples/optimization/ci_integration/run_optimization.py b/examples/optimization/ci_integration/run_optimization.py new file mode 100644 index 00000000..eac9510b --- /dev/null +++ b/examples/optimization/ci_integration/run_optimization.py @@ -0,0 +1,97 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""CI Integration example 的夜间优化入口。 + +适用场景 +-------- +CI/CD 流水线中的夜间窗口任务:跑 GEPA 反思优化,最优候选自动写回源 +prompt 文件,下一次 PR 触发的 pytest 守门自动用上新 prompt → 形成 +"评测 → 优化 → 再评测"的演进闭环。 + +这个文件做什么 +-------------- +1. 注册 system.md + skill.md 双字段 TargetPrompt +2. 引用 agent/agent.py 中**与 pytest 共享**的 call_agent +3. 以 update_source=True 跑优化,最优候选自动覆盖源 prompt 文件 + +怎么跑 +------ +通过 shell 入口(CI 流水线建议方式): + PYTHONPATH=../../.. 
async def main() -> None:
    """Run the nightly prompt optimization with source write-back enabled.

    Registers the ``system_prompt`` and ``skill`` prompt files on a single
    ``TargetPrompt`` and hands them, together with the train / validation
    eval sets, to ``AgentOptimizer.optimize``. Artifacts of the run are
    written to a fresh ``runs/optimize_<timestamp>`` directory.
    """
    # Local wall-clock timestamp gives every run its own output directory.
    run_stamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
    run_dir = RUNS_DIR / f"optimize_{run_stamp}"

    # Two independently optimizable prompt fields, registered in this order.
    prompt_fields = TargetPrompt()
    prompt_fields = prompt_fields.add_path("system_prompt", str(SYSTEM_PROMPT_PATH))
    prompt_fields = prompt_fields.add_path("skill", str(SKILL_PATH))

    optimize_kwargs = {
        "config_path": str(CONFIG_PATH),
        "call_agent": call_agent,
        "target_prompt": prompt_fields,
        "train_dataset_path": str(TRAIN_PATH),
        "validation_dataset_path": str(VAL_PATH),
        "output_dir": str(run_dir),
        # update_source=True is the switch that closes the CI loop: the best
        # candidate overwrites the files under agent/prompts/, so the next
        # PR-triggered pytest run picks up the new prompts.
        # NOTE(review): the original author states write-back only happens
        # when OptimizeResult.status is SUCCEEDED and that the source files
        # stay untouched on failure / exhausted budget - confirm in the SDK.
        "update_source": True,
        "verbose": 1,
    }
    await AgentOptimizer.optimize(**optimize_kwargs)
asyncio.run(main()) diff --git a/examples/optimization/ci_integration/tests/__init__.py b/examples/optimization/ci_integration/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/optimization/ci_integration/tests/test_agent_quality.py b/examples/optimization/ci_integration/tests/test_agent_quality.py new file mode 100644 index 00000000..509a01e7 --- /dev/null +++ b/examples/optimization/ci_integration/tests/test_agent_quality.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""PR 阶段的质量守门测试:CI 闭环的"评测"端。 + +适用场景 +-------- +PR 触发的 CI 流水线运行此测试。任何 case 不通过都让 pytest exit code != 0 +→ CI 红灯 → 阻止 PR 合并。 + +为什么不依赖 LLM judge +---------------------- +CI 上要求快、稳、可重复。call_agent 的输出已经在 agent/agent.py 中被 +_normalize_json 规范化为稳定 JSON 字符串,与 evalset 中 expected 字段 +逐字符比对即可,无需再调一次 LLM 当裁判(速度更慢、判定不稳定、依赖 +多一个外部服务)。 + +case 失败时框架抛 AssertionError,错误消息包含每条 case 的失败明细 JSON。 +配合 pytest --junitxml=... 
@pytest.mark.asyncio
async def test_agent_meets_quality_bar() -> None:
    """Gate the PR on the validation eval set.

    Runs ``AgentEvaluator.evaluate`` over ``VAL_EVALSET`` using the example's
    shared ``call_agent`` and stores the evaluation output under
    ``RESULT_DIR``. Any exception raised by the evaluator fails this test and
    therefore the CI job.
    """
    # Imported lazily: both modules are only resolvable after the sys.path
    # tweaks performed at module import time.
    from trpc_agent_sdk.evaluation import AgentEvaluator
    from agent.agent import call_agent  # type: ignore

    # The evaluator writes its result files here; create the tree up front.
    RESULT_DIR.mkdir(parents=True, exist_ok=True)

    evaluate_kwargs = {
        "eval_dataset_file_path_or_dir": str(VAL_EVALSET),
        "call_agent": call_agent,
        "agent_name": "api_summarizer",
        "eval_result_output_dir": str(RESULT_DIR),
        "print_detailed_results": True,
    }
    await AgentEvaluator.evaluate(**evaluate_kwargs)
候选 | +| HTTP 服务对 prompt 文件的读写权限 | 服务代码与服务进程**完全不变**,仅磁盘上 prompt 文件被改写 | + +### 本 example 演示的最小用例 + +| 维度 | 值 | +| --- | --- | +| 业务任务 | 算术应用题求解(与 quickstart 同一类任务,便于横向对比 HTTP 接入与本地接入的差异) | +| HTTP 服务 | `service/server.py` 中的 FastAPI app,监听 `127.0.0.1:8767` | +| 优化目标 | `service/prompts/system.md` 单文件 | +| 验证指标 | `final_response_avg_score`(contains 匹配,阈值 1.0) | +| 训练 / 验证规模 | 5 条 / 3 条 | + +## 2 · 术语对照 + +仅列出本 example 引入的新概念。基础术语见 `quickstart/README.md` §2。 + +| 术语 | 含义 | +| --- | --- | +| **prompt 热加载** | 服务进程在每次请求处理前重新读取 prompt 文件,使外部对该文件的写入立即生效。本 example 的 `_build_agent()` 在每次 `/chat` 都重读 `system.md` 实现该语义。 | +| **call_agent 内 client 即用即关** | `call_agent` 用 `async with httpx.AsyncClient()` 创建并退出时自动关闭。`httpx.AsyncClient` 的连接池绑定到首次使用所在的事件循环(参考 [httpx Discussion #2959](https://github.com/encode/httpx/discussions/2959)),不支持跨循环复用。 | +| **健康检查(pre-flight)** | 优化开始前同步探测 `GET /health`,服务不通时 fail-fast 而非浪费 LLM 配额跑到一半才报错。 | + +## 3 · 运行示例 + +### 3.1 安装依赖 + +```bash +pip install -e ".[optimize]" +pip install fastapi uvicorn httpx +``` + +`fastapi` / `uvicorn` 用于 mock 线上服务;`httpx` 用于优化器作为客户端访问该服务。 + +### 3.2 配置环境变量 + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +服务进程与优化器进程共用同一组凭据。 + +### 3.3 启动(双终端) + +**终端 A** —— 启动 mock 服务并保持运行: + +```bash +python examples/optimization/http_service/service/server.py +``` + +预期日志:`Uvicorn running on http://127.0.0.1:8767`。 + +**终端 B** —— 启动优化器: + +```bash +python examples/optimization/http_service/run_optimization.py +``` + +启动时优化器会先做一次同步健康检查,服务不通直接报错并提示先启动 server。 + +### 3.4 产物结构 + +``` +runs// +├── result.json 完整运行记录 +├── summary.txt 人类可读摘要 +├── baseline_prompts/ 运行前 prompt 快照 +├── best_prompts/ val 集得分最高的候选 +└── rounds/ 每轮反思与评估明细 +``` + +## 4 · 架构与数据流 + +``` +[终端 A: HTTP 服务] + │ + └── FastAPI :8767 + ├── GET /health → {"status":"ok"} + └── POST /chat → 每次都重读 service/prompts/system.md, + 构造 LlmAgent,跑 Runner.run_async, + 返回 {"final_text": "..."} + +[终端 B: 优化器] + │ + ├── 
pre-flight: GET /health + │ + ├── TargetPrompt.add_path("system_prompt", service/prompts/system.md) + │ │ GEPA 每轮把候选 prompt 写入磁盘 + │ ▼ + │ service/prompts/system.md + │ │ HTTP 服务下一次请求时重读该文件 + │ ▼ + ├── call_agent(query): + │ └── async with httpx.AsyncClient() as client: + │ POST /chat → final_text + │ + └── AgentOptimizer.optimize → runs// +``` + +### 4.1 文件清单 + +| 文件 | 角色 | 接入自有业务时的修改方向 | +| --- | --- | --- | +| `run_optimization.py` | 优化器入口(客户端) | 修改 `SERVICE_BASE_URL` / `CHAT_URL`;调整 `call_agent` 中的请求 / 响应 schema | +| `service/server.py` | mock 线上 HTTP 服务 | 真实业务下删除该文件,由实际服务承担相同角色 | +| `service/prompts/system.md` | 服务读取的 prompt(GEPA 写入目标) | 替换为业务 baseline;路径需与服务进程的读取路径一致 | +| `optimizer.json` | 算法 + metric 配置 | 调整 metric 与停止条件 | +| `train.evalset.json` | 反思 minibatch 来源 | 替换为业务训练用例 | +| `val.evalset.json` | 候选评分依据 | 替换为业务验证用例 | + +### 4.2 prompt 热加载是核心约束 + +服务必须在**每次请求时重读 prompt 文件**,否则优化器写入的新候选不会被服务感知,整个反思循环失效。 + +`service/server.py` 通过在每次 `/chat` 中调用 `_build_agent()`(其内部 `_read_system_prompt()` 重读磁盘)实现该语义。LlmAgent 构建本身不涉及 LLM 调用,单次开销可忽略。 + +## 5 · 关键配置 + +`optimizer.json` 中本 example 与 quickstart 的差异点: + +```jsonc +{ + "optimize": { + "algorithm": { + "seed": 42, + "score_threshold": 1.0, // 主停止条件:val pass_rate ≥ 1.0 立即停止 + "max_metric_calls": 40, + "max_iterations_without_improvement": 5 + } + } +} +``` + +| 字段 | 影响 | +| --- | --- | +| `score_threshold` | 算法层早停阈值。本 example 设为 1.0(要求 val 全 case 通过),追求快速收敛 | +| `seed` | 控制 GEPA 内部抽样的随机性。固定 seed 配合相同输入应得相同结果 | +| `REQUEST_TIMEOUT=120.0`(在 `run_optimization.py`) | 单次 HTTP 请求超时。首次请求需经历 FastAPI 冷启动 + LLM 推理,需要充足时间 | + +## 6 · 运行控制 + +### 6.1 优雅停止 + +```bash +touch runs//optimize.stop +``` + +下一次 stopper 检查时框架立即收尾,`OptimizeResult.stop_reason=user_requested_stop`。 + +### 6.2 调试 GEPA 内部行为 + +`run_optimization.py` 中 `verbose=1` 改为 `verbose=2`,会附加 `trpc_agent_sdk.optimizer.gepa` logger 的诊断输出。 + +## 7 · 常见问题 + +**Q:服务与优化器必须在同一台机器吗?** +A:不必。`SERVICE_BASE_URL` 改成远端地址即可。但 `TargetPrompt.add_path` 
操作的是优化器进程本地的文件系统——若服务在远端,要么挂载相同存储卷使两端看到同一份 `system.md`,要么改用 `add_callback` 直连配置中心(参见 `remote_prompt_store/` example)。 + +**Q:服务首次请求很慢?** +A:FastAPI 进程冷启动 + 首次 LLM 调用确实较慢。`REQUEST_TIMEOUT=120s` 已留出充分缓冲。 + +**Q:端口 `8767` 被占用?** +A:同时修改 `service/server.py` 的 `PORT` 与 `run_optimization.py` 的 `SERVICE_BASE_URL`。 + +**Q:`call_agent` 抛 HTTP 错误会怎样?** +A:异常会传播到优化器,导致当前 case 评测失败、当前候选可能被拒绝。建议在 `call_agent` 内部加上重试逻辑(如 `httpx.HTTPStatusError` 触发 1–2 次重试)以应对临时性故障。 + +## 8 · 接入自有 HTTP 服务的步骤 + +1. **确认服务支持 prompt 热加载**:服务在每次请求处理前重读 prompt 文件(或重新拉配置) +2. **修改优化器入口**: + - `SERVICE_BASE_URL` 改为实际服务地址 + - `call_agent` 内部的请求 payload / 响应字段名按服务实际 schema 调整 + - `SYSTEM_PROMPT_PATH` 指向服务进程实际读取的 prompt 文件 +3. **替换数据集**:`train.evalset.json` / `val.evalset.json` 写入业务用例 +4. **调整 metric**:`optimizer.json` 中 `evaluate.metrics` 选择合适的 metric 类型 +5. **运行**:先启动服务,再启动优化器;根据 `summary.txt` 决定后续调参 + +若服务的 prompt 不在本地文件而在配置中心,参见 `remote_prompt_store/` example,仅需将 `add_path` 替换为 `add_callback`。 diff --git a/examples/optimization/http_service/optimizer.json b/examples/optimization/http_service/optimizer.json new file mode 100644 index 00000000..03f74a59 --- /dev/null +++ b/examples/optimization/http_service/optimizer.json @@ -0,0 +1,45 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "contains", + "case_insensitive": true + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { + "max_tokens": 4096, + "temperature": 0.6 + } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "reflection_minibatch_size": 3, + "skip_perfect_score": false, + 
"max_metric_calls": 40, + "score_threshold": 1.0, + "max_iterations_without_improvement": 5 + } + } +} diff --git a/examples/optimization/http_service/run_optimization.py b/examples/optimization/http_service/run_optimization.py new file mode 100644 index 00000000..07f84e24 --- /dev/null +++ b/examples/optimization/http_service/run_optimization.py @@ -0,0 +1,123 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""HTTP Service example 的优化器入口(客户端进程)。 + +适用场景 +-------- +业务 agent 已作为独立 HTTP 服务在线运行,希望对其 prompt 做自动优化但 +不想停服、不想改服务代码。本脚本作为优化器以纯客户端身份接入服务, +通过磁盘 prompt 文件实现优化器与服务的解耦。 + +这个文件做什么 +-------------- +1. 启动前同步健康检查,服务不通即 fail-fast +2. 注册 service/prompts/system.md 为 TargetPrompt +3. 在 call_agent 中用 async with httpx.AsyncClient 即用即关 +4. 调 AgentOptimizer.optimize 跑 GEPA 反思循环 + +怎么跑 +------ +终端 A: python examples/optimization/http_service/service/server.py +终端 B(本脚本): + 1) 配 TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME + 2) python examples/optimization/http_service/run_optimization.py + 3) 看 runs/<时间戳>/summary.txt + +接入自有 HTTP 服务时改哪里 +-------------------------- +- SERVICE_BASE_URL / CHAT_URL / HEALTH_URL : 改为业务服务地址 +- call_agent 内 payload / 响应字段 : 按业务 schema 调整 +- SYSTEM_PROMPT_PATH : 指向服务进程实际读取的 prompt 文件 +- REQUEST_TIMEOUT : 按业务首次推理耗时上调 +""" + +from __future__ import annotations + +import asyncio +import sys +from datetime import datetime +from pathlib import Path + +import httpx + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt + + +CONFIG_PATH = _HERE / "optimizer.json" +TRAIN_PATH = _HERE / "train.evalset.json" +VAL_PATH = _HERE / "val.evalset.json" +RUNS_DIR = _HERE / "runs" +SYSTEM_PROMPT_PATH = _HERE / "service" / 
SERVICE_BASE_URL = "http://127.0.0.1:8767"
HEALTH_URL = f"{SERVICE_BASE_URL}/health"
CHAT_URL = f"{SERVICE_BASE_URL}/chat"

# Per-request HTTP timeout in seconds. Every /chat call runs one full LLM
# inference inside the service, so the budget is deliberately generous.
REQUEST_TIMEOUT = 120.0


def _ensure_service_alive_sync() -> None:
    """Probe ``GET /health`` once and fail fast when the service is down.

    Raises:
        RuntimeError: if the request cannot be completed or the service
            answers with a non-2xx status. The original httpx error is
            chained as the cause.
    """
    try:
        resp = httpx.get(HEALTH_URL, timeout=5.0)
        resp.raise_for_status()
    # httpx.HTTPError covers transport failures (connect / timeout) as well
    # as the HTTPStatusError raised by raise_for_status(). Anything else is
    # a programming error and must not be reported as "service unreachable".
    except httpx.HTTPError as ex:
        raise RuntimeError(
            f"HTTP service at {SERVICE_BASE_URL} is not reachable: {ex}\n"
            "Please start the service first:\n"
            "  python examples/optimization/http_service/service/server.py"
        ) from ex


async def call_agent(query: str) -> str:
    """Framework callback: send ``query`` to the service, return its answer.

    A new ``httpx.AsyncClient`` is created for every call and closed by the
    ``async with`` block on exit, so no client is shared between calls.
    NOTE(review): the original author justifies this with httpx Discussion
    #2959 (a client's connection pool is tied to the event loop it was first
    used on) - confirm before switching to a shared client.

    Args:
        query: user text forwarded as the ``query`` field of the JSON body.

    Returns:
        The ``final_text`` field of the service's JSON response.

    Raises:
        httpx.HTTPStatusError: if the service answers with a non-2xx status.
    """
    async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
        resp = await client.post(CHAT_URL, json={"query": query})
        resp.raise_for_status()
        return resp.json()["final_text"]


async def main() -> None:
    """Check the service, register the prompt file, and run the optimizer."""
    # Pre-flight check before any LLM budget is spent.
    _ensure_service_alive_sync()

    target = TargetPrompt().add_path("system_prompt", str(SYSTEM_PROMPT_PATH))

    timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
    output_dir = RUNS_DIR / timestamp

    await AgentOptimizer.optimize(
        config_path=str(CONFIG_PATH),
        call_agent=call_agent,
        target_prompt=target,
        train_dataset_path=str(TRAIN_PATH),
        validation_dataset_path=str(VAL_PATH),
        output_dir=str(output_dir),
        # update_source=False: the source prompt file is left as-is and the
        # best candidate is only written to output_dir/best_prompts/ for a
        # human to review (see the ci_integration/ example for write-back).
        update_source=False,
        # verbose: 0 silent / 1 progress panel / 2 adds gepa diagnostics.
        verbose=1,
    )


if __name__ == "__main__":
    asyncio.run(main())
b/examples/optimization/http_service/service/__init__.py new file mode 100644 index 00000000..bc6e483f --- /dev/null +++ b/examples/optimization/http_service/service/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. diff --git a/examples/optimization/http_service/service/prompts/system.md b/examples/optimization/http_service/service/prompts/system.md new file mode 100644 index 00000000..70314dcb --- /dev/null +++ b/examples/optimization/http_service/service/prompts/system.md @@ -0,0 +1 @@ +你是一个友好的聊天助手,喜欢和用户分享想法。回答用户问题时,请尽量用生动、富有人情味的语言,让用户感觉像是在和朋友聊天。 diff --git a/examples/optimization/http_service/service/server.py b/examples/optimization/http_service/service/server.py new file mode 100644 index 00000000..09b9a580 --- /dev/null +++ b/examples/optimization/http_service/service/server.py @@ -0,0 +1,157 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
class ChatRequest(BaseModel):
    # Request body of POST /chat: the user query forwarded to the agent.
    query: str


class ChatResponse(BaseModel):
    # Response body of POST /chat: the agent's final answer text.
    final_text: str


def _read_system_prompt() -> str:
    """Read the system prompt from disk and strip surrounding whitespace.

    Called for every agent construction, so a prompt file rewritten by the
    optimizer is picked up by the next request.
    """
    return SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip()


def _load_model_credentials() -> tuple[str, str, str]:
    """Return ``(api_key, base_url, model_name)`` from the environment.

    Raises:
        RuntimeError: if any of the three variables is unset or empty.
    """
    api_key = os.getenv("TRPC_AGENT_API_KEY", "")
    base_url = os.getenv("TRPC_AGENT_BASE_URL", "")
    model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "")
    if not api_key or not base_url or not model_name:
        raise RuntimeError(
            "TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME "
            "must be set before starting the HTTP service."
        )
    return api_key, base_url, model_name


def _build_agent() -> LlmAgent:
    """Build a fresh ``LlmAgent`` from the system prompt currently on disk.

    Raises:
        RuntimeError: if the model credentials are missing (see
            ``_load_model_credentials``).
    """
    api_key, base_url, model_name = _load_model_credentials()
    return LlmAgent(
        name="math_word_problem_agent",
        description="Math word-problem solver served over HTTP.",
        model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url),
        instruction=_read_system_prompt(),
        generate_content_config=GenerateContentConfig(
            temperature=0.2,
            top_p=0.9,
            max_output_tokens=2048,
        ),
    )


app = FastAPI(title="http_service demo")


@app.get("/health")
async def health() -> dict[str, str]:
    """Health-check endpoint the optimizer pings before it starts."""
    return {"status": "ok"}


@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest) -> ChatResponse:
    """Run one inference for ``request.query`` and return the final text.

    Stateless by construction: every request builds its own agent, session
    service, runner and session id, so concurrently evaluated cases do not
    share conversation context. Rebuilding the agent also re-reads the
    system prompt file on every request.
    """
    agent = _build_agent()
    session_service = InMemorySessionService()
    runner = Runner(app_name=APP_NAME, agent=agent, session_service=session_service)
    session_id = str(uuid.uuid4())
    user_id = "http_client"
    await session_service.create_session(
        app_name=APP_NAME, user_id=user_id, session_id=session_id, state={},
    )
    user_content = Content(role="user", parts=[Part.from_text(text=request.query)])

    chunks: list[str] = []
    async for event in runner.run_async(
        user_id=user_id, session_id=session_id, new_message=user_content,
    ):
        if not event.is_final_response():
            continue
        if not event.content or not event.content.parts:
            continue
        # Keep only visible answer text; parts flagged as thoughts are dropped.
        chunks.extend(
            part.text
            for part in event.content.parts
            if not part.thought and part.text
        )
    return ChatResponse(final_text="".join(chunks).strip())


if __name__ == "__main__":
    # Validate credentials before binding the port. Otherwise the server
    # starts, /health reports ok, and every /chat call fails with a 500.
    _load_model_credentials()
    uvicorn.run(app, host=HOST, port=PORT, log_level="warning")
"final_response": { + "parts": [{"text": "答案:10 人"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "http_service_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_defect_items_percent", + "conversation": [ + { + "invocation_id": "t5", + "user_content": { + "parts": [{"text": "一批商品共 50 件,其中 30% 是次品,次品有多少件?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:15 件"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "http_service_demo_agent", + "user_id": "trainer", + "state": {} + } + } + ] +} diff --git a/examples/optimization/http_service/val.evalset.json b/examples/optimization/http_service/val.evalset.json new file mode 100644 index 00000000..abbad27c --- /dev/null +++ b/examples/optimization/http_service/val.evalset.json @@ -0,0 +1,70 @@ +{ + "eval_set_id": "http_service_val", + "name": "HTTP service demo - validation", + "description": "3 道小学算术应用题;用于每轮全量评估、决定候选是否被接受。", + "eval_cases": [ + { + "eval_id": "wp_seats_multiply", + "conversation": [ + { + "invocation_id": "v1", + "user_content": { + "parts": [{"text": "教室里有 5 排座位,每排 8 个,一共多少个座位?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:40 个"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "http_service_demo_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_water_weight", + "conversation": [ + { + "invocation_id": "v2", + "user_content": { + "parts": [{"text": "已知 1 升水重 1 千克,3.5 升水重多少千克?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:3.5 千克"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "http_service_demo_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_class_girls_percent", + "conversation": [ + { + "invocation_id": "v3", + "user_content": { + "parts": [{"text": "班里一共有 30 人,其中 60% 是女生,请问有多少名女生?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:18 
人"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "http_service_demo_agent", + "user_id": "validator", + "state": {} + } + } + ] +} diff --git a/examples/optimization/multi_agent_pipeline/README.md b/examples/optimization/multi_agent_pipeline/README.md new file mode 100644 index 00000000..86e1feac --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/README.md @@ -0,0 +1,191 @@ +# Multi-Agent Pipeline — 多 sub-agent prompt 联合优化 + +> **适用场景**:业务侧已编排好多 sub-agent 协作链路(router / 分支 worker / summarizer 等),希望在不修改链路代码的前提下,对每个 sub-agent 的 prompt 进行联合优化。本 example 在 `quickstart/` 单字段优化的基础上,演示多字段 `TargetPrompt` 与 GEPA 多模块协同的关键配置。阅读前请先熟悉 `quickstart/README.md` §2 中的基础术语。 + +## 1 · 适用问题与设计目标 + +多 agent 链路的 prompt 工程比单 agent 复杂:每个 sub-agent 有独立 prompt,字段间存在隐式契约(router 的输出标签必须匹配下游 worker 期望、summarizer 的格式必须兼容上游中间结果)。手工迭代时常见症状是"改 A 见效,但拖累 B"。 + +本 example 的设计原则: + +- **链路代码零修改**:优化器通过文件写入候选 prompt,sub-agent 在每次调用时现读现用 +- **字段间归因清晰**:`module_selector="round_robin"` 让每轮反思只改一个字段 +- **多字段成果融合**:`use_merge=true` 在累积若干单字段改动后主动尝试合并 +- **跨字段记忆延展**:`reflection_history_top_k=3` 让反思 LM 在轮换中保留更长历史 + +| 输入 | 输出 | +| --- | --- | +| 已编排好的多 sub-agent 链路(本 example 中为 `invoke_pipeline()`) | 每个 sub-agent 的最优 prompt 候选(`best_prompts/` 下多个 `.md` 文件) | +| 同一个 `TargetPrompt` 上注册的多个字段(每字段一个 `add_path`) | 单一 `final_response_avg_score` metric 的端到端分数提升 | + +### 本 example 演示的最小用例 + +| 维度 | 值 | +| --- | --- | +| 业务任务 | 混合事实题与算术题分流问答 | +| 链路结构 | `router → fact_agent / math_agent → summarizer`(共 4 个 sub-agent) | +| 优化目标 | `pipeline/prompts/{router,fact_agent,math_agent,summarizer}.md` 共 4 个字段 | +| 验证指标 | `final_response_avg_score`(contains 匹配,要求最终答复包含 `答案:xxx`) | +| 训练 / 验证规模 | 5 条混合 case(3 事实 + 2 数学)/ 3 条混合 case | + +## 2 · 术语对照 + +仅列出本 example 引入的新概念。基础术语见 `quickstart/README.md` §2。 + +| 术语 | 含义 | +| --- | --- | +| **多字段 TargetPrompt** | 同一 `TargetPrompt` 实例上多次调用 `add_path()`,每次注册一个独立 prompt 文件。GEPA 视每个 key 为一个独立的可优化模块(component / predictor)。 | +| **module_selector** | 
每轮反思选择哪个字段被改写的策略。`"round_robin"` 按注册顺序逐轮单选;`"all"` 每轮全选;`"random"` 随机单选。多字段优化推荐 `round_robin`。 | +| **use_merge** | 是否启用 GEPA 的 merge 操作。每隔若干轮,算法挑选两个在不同字段上各有所长的历史候选,融合成一个"全字段都好"的新候选。**仅多 predictor(多字段)时有意义**——单字段优化不会触发 merge。 | +| **max_merge_invocations** | 整个 run 中允许触发 merge 的次数上限。 | +| **reflection_history_top_k** | 反思 prompt 中每条 case 携带多少条历史最佳响应。多字段轮换时调大可缓解"上次改某字段时学到的方向被遗忘"。 | +| **Other Active Components** | SDK 自动注入到反思 prompt 的段落。当反思 LM 在改字段 X 时,该段落列出所有其他字段(Y / Z / ...)当前的内容,使 LM 在改 X 时能感知链路其他环节的现状。无需配置。 | + +## 3 · 运行示例 + +### 3.1 安装依赖 + +```bash +pip install -e ".[optimize]" +``` + +### 3.2 配置环境变量 + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +### 3.3 启动 + +```bash +python examples/optimization/multi_agent_pipeline/run_optimization.py +``` + +单次运行约 10–20 分钟。每条 case 触发 3 次 LLM 推理(router → 分支 worker → summarizer),整体 LLM 调用量约为 quickstart 的 3 倍。 + +### 3.4 产物结构 + +``` +runs// +├── result.json 完整运行记录(含每轮 optimized_field_names / kind) +├── summary.txt 人类可读摘要 +├── baseline_prompts/ 运行前 4 个 prompt 文件的快照 +├── best_prompts/ val 集得分最高的候选(4 个 .md) +└── rounds/round_*.json 每轮反思 prompt、候选文本、字段轮换记录 +``` + +## 4 · 架构与数据流 + +``` +[run_optimization.py] + │ + ├── TargetPrompt + │ .add_path("router", pipeline/prompts/router.md) + │ .add_path("fact_agent", pipeline/prompts/fact_agent.md) + │ .add_path("math_agent", pipeline/prompts/math_agent.md) + │ .add_path("summarizer", pipeline/prompts/summarizer.md) + │ + ├── call_agent(query) = await invoke_pipeline(query) + │ ├─ router (读 router.md) → 输出 "fact" 或 "math" + │ ├─ fact / math (读对应 .md) → 中间答复 + │ └─ summarizer (读 summarizer.md) → 最终答复(含 "答案:xxx") + │ + └── AgentOptimizer.optimize 主循环 + ├── module_selector="round_robin" 每轮选一个字段改写 + ├── 把候选 prompt 写入对应文件 → 下一次 invoke_pipeline 自动读到 + ├── use_merge=true:每隔若干轮主动融合不同字段的历史最佳 + └── 反思 prompt 自动包含 Other Active Components 段 +``` + +### 4.1 文件清单 + +| 文件 | 角色 | 接入自有业务时的修改方向 | +| --- | --- | --- | +| `run_optimization.py` 
| 优化器入口,注册 4 字段 `TargetPrompt`,定义 `call_agent` | 将 `invoke_pipeline` 替换为业务自有链路调用入口 | +| `pipeline/orchestrator.py` | 链路编排实现,每个 sub-agent 在每次调用时重读 prompt 文件 | 真实业务下整体替换为业务链路代码 | +| `pipeline/prompts/router.md` 等 4 文件 | 各 sub-agent 的 system prompt(GEPA 写入目标) | 替换为业务 baseline;每字段对应 `TargetPrompt` 中一个 key | +| `optimizer.json` | 算法 + metric 配置 | 重点关注 `module_selector` / `use_merge` / `reflection_history_top_k` | +| `train.evalset.json` / `val.evalset.json` | 数据集 | 替换为业务用例 | + +### 4.2 prompt 热加载约束 + +每个 sub-agent 在每次被调用时必须重读自己的 prompt 文件,否则优化器写入的新候选不会生效。`pipeline/orchestrator.py` 的 `_create_sub_agent()` 在每次 `invoke_pipeline()` 中重新构造 sub-agent 实例并重读对应 `.md` 文件实现该语义。 + +## 5 · 关键配置 + +`optimizer.json` 中本 example 与 quickstart 的核心差异: + +```jsonc +{ + "optimize": { + "algorithm": { + "module_selector": "round_robin", // 多字段轮换的关键 + "use_merge": true, // 多字段成果融合 + "max_merge_invocations": 3, // merge 次数上限 + "reflection_history_top_k": 3, // 多字段轮换时调大 + "reflection_minibatch_size": 3, + "max_metric_calls": 60 + } + } +} +``` + +### 5.1 `module_selector` 选择对照 + +| 取值 | 行为 | 适用 | +| --- | --- | --- | +| `"round_robin"` | 每轮按注册顺序单选 1 个字段 | 字段间存在依赖;需要清晰归因(推荐) | +| `"all"` | 每轮所有字段一起改 | 字段独立、希望快速搜索;存在"一个改坏拖累整体"风险 | +| `"random"` | 每轮随机单选 1 个字段 | 字段无明显依赖、希望均匀探索 | + +### 5.2 `use_merge` 在多字段场景的价值 + +`round_robin` 让每轮只改 1 个字段,几轮后会出现"router 改好了但 summarizer 还差 / fact_agent 改好了但 math_agent 还差"的局面。`use_merge=true` 让 GEPA 隔几轮主动尝试合并——例如把"router 优化版"和"summarizer 优化版"融合成"全字段都好"的候选。 + +> **重要约束**:merge 是 predictor-level 操作,**仅多 predictor(多字段)场景生效**。单字段优化下 `use_merge=true` 永远不会触发 merge round,配置无副作用但也无收益。 + +`max_merge_invocations` 限制合并尝试次数,避免无限拼接。 + +### 5.3 `reflection_history_top_k` 在多字段场景的价值 + +多字段轮换时反思 LM 在第 N 轮改 `summarizer`,但 `router` 是几轮前才改过的——LM 容易遗忘"上次改 router 时学到的方向"。`reflection_history_top_k=3`(默认 2)让反思 prompt 中每条 case 携带历史最佳响应 3 条,相当于给 LM 提供"过去几轮哪些方向奏效"的记忆。 + +## 6 · 调试技巧 + +### 6.1 验证 round-robin 真的轮流改字段 + +跑完后检查 `runs//result.json` 中各 round 的 `optimized_field_names`,应按 `router →
fact_agent → math_agent → summarizer → router → ...` 顺序循环。 + +### 6.2 验证 merge 是否触发 + +各 round 的 `kind` 字段:`"reflective"` 是普通反思轮,`"merge"` 是融合轮。 + +### 6.3 查看反思 prompt 的 Other Active Components + +在 `result.json` 的 round detail 中,反思 prompt 文本可见 Other Active Components 段落,列出当前轮次以外的所有字段当前内容。 + +## 7 · 常见问题 + +**Q:链路必须由本框架的 LlmAgent 编排吗?** +A:不必。`call_agent` 只要求 `async (query: str) -> str` 签名。可以让它把 query 透传给 HTTP 请求 / gRPC 调用 / 内部 SDK / 其他编排框架。本 example 用 `invoke_pipeline` 仅作演示,业务可以替换为任何形态。 + +**Q:每个 sub-agent 必须在同一进程吗?** +A:不必。每个 sub-agent 可以是独立服务,prompt 通过配置中心而非本地文件下发——把 `add_path` 替换为 `add_callback`,参见 `remote_prompt_store/` example。 + +**Q:单 case 经过多次 LLM 推理,评测开销很大如何控制?** +A:调小 `eval_case_parallelism` 防止 LLM rate limit;调小 `reflection_minibatch_size` 减少每轮 case 数;调小 `max_metric_calls` 限制总预算。 + +**Q:`use_merge=true` 但 `merge_rounds=0`?** +A:单字段优化下 merge 不会触发;多字段场景下也需累积若干轮反思后才会有候选满足 merge 条件。耐心运行至少 `max_merge_invocations` 轮以上观察。 + +## 8 · 接入自有链路的步骤 + +1. **替换 `pipeline/orchestrator.py`**:实现自己的 `invoke_pipeline(query) -> str`,可以是 HTTP / gRPC / 内部编排 +2. **修改 prompt 文件路径**:把每个 sub-agent 的 prompt 文件路径作为 `TargetPrompt.add_path` 的参数注册 +3. **保证 prompt 热加载**:每个 sub-agent 在每次被调用时重读 prompt(或重新拉配置中心) +4. **替换数据集与 metric**:`train.evalset.json` / `val.evalset.json` / `optimizer.json` +5. 
**运行**:根据 `result.json` 中的 `optimized_field_names` / `kind` 序列分析字段轮换与 merge 行为 + +若 prompt 不在本地而在配置中心,把 `add_path` 替换为 `add_callback`,其余结构保持不变。 diff --git a/examples/optimization/multi_agent_pipeline/optimizer.json b/examples/optimization/multi_agent_pipeline/optimizer.json new file mode 100644 index 00000000..f1f5267c --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/optimizer.json @@ -0,0 +1,46 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "contains", + "case_insensitive": true + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 1, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 4096, "temperature": 0.6 } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "frontier_type": "instance", + "reflection_minibatch_size": 3, + "reflection_history_top_k": 3, + "skip_perfect_score": false, + "use_merge": true, + "max_merge_invocations": 3, + "max_metric_calls": 60, + "score_threshold": 1.0, + "max_iterations_without_improvement": 6 + } + } +} diff --git a/examples/optimization/multi_agent_pipeline/pipeline/__init__.py b/examples/optimization/multi_agent_pipeline/pipeline/__init__.py new file mode 100644 index 00000000..bc6e483f --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/pipeline/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
diff --git a/examples/optimization/multi_agent_pipeline/pipeline/config.py b/examples/optimization/multi_agent_pipeline/pipeline/config.py new file mode 100644 index 00000000..d0a64b15 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/pipeline/config.py @@ -0,0 +1,33 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""模型凭据读取 —— 从环境变量加载 OpenAI 兼容 LLM 的连接信息。 + +需要的环境变量 +-------------- + TRPC_AGENT_API_KEY LLM 后端的 API key + TRPC_AGENT_BASE_URL LLM 后端的 endpoint + TRPC_AGENT_MODEL_NAME 模型名 + +缺任意一个就立即抛 ValueError,避免运行到一半才撞到 LLM 后端的 401 错误, +那时报错信息会很有迷惑性(看起来像 prompt 写错了,实际是凭据没配)。 +""" + +from __future__ import annotations + +import os + + +def get_model_config() -> tuple[str, str, str]: + """返回 (api_key, base_url, model_name);任一缺失立刻报错。""" + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + if not api_key or not base_url or not model_name: + raise ValueError( + "运行优化器前必须配置环境变量 TRPC_AGENT_API_KEY / " + "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME。" + ) + return api_key, base_url, model_name diff --git a/examples/optimization/multi_agent_pipeline/pipeline/orchestrator.py b/examples/optimization/multi_agent_pipeline/pipeline/orchestrator.py new file mode 100644 index 00000000..a585bb79 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/pipeline/orchestrator.py @@ -0,0 +1,131 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""模拟"业务方已编排好的多 agent 链路"。 + +链路形态:: + + 用户问题 → router → (fact_agent 或 math_agent) → summarizer → 最终答复 + +prompt 热加载约束 +----------------- +每个 sub-agent 在每次被调用时必须重读自己的 prompt 文件——优化器通过 +TargetPrompt.add_path 把候选 prompt 写入对应文件后,下一次 invoke_pipeline +调用各 sub-agent 自动用最新 prompt,无需重启。 + +接入自有链路时改哪里 +-------------------- +真实业务下整体替换本文件为业务链路代码: +- 每个 sub-agent 可以是不同进程 / 服务 / 框架 +- prompt 通常通过配置中心(不是本地文件)下发;本文件 Path.read_text 换成 + 配置中心 SDK 调用即可,链路骨架不变 +- 主入口 invoke_pipeline(query) -> str 的签名保持不变,被 call_agent 调用 +""" + +from __future__ import annotations + +import uuid +from pathlib import Path + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import GenerateContentConfig +from trpc_agent_sdk.types import Part + +from .config import get_model_config + + +_PROMPTS_DIR = Path(__file__).parent / "prompts" +ROUTER_PROMPT_PATH = _PROMPTS_DIR / "router.md" +FACT_AGENT_PROMPT_PATH = _PROMPTS_DIR / "fact_agent.md" +MATH_AGENT_PROMPT_PATH = _PROMPTS_DIR / "math_agent.md" +SUMMARIZER_PROMPT_PATH = _PROMPTS_DIR / "summarizer.md" + +APP_NAME = "multi_agent_pipeline_demo" + + +def _create_sub_agent(name: str, prompt_path: Path) -> LlmAgent: + """构造一个 sub-agent,instruction 从对应文件现读现用。 + + 每次调用都重读磁盘——这是优化器写入新候选后能立即生效的关键。 + """ + api_key, base_url, model_name = get_model_config() + return LlmAgent( + name=name, + description=f"Pipeline sub-agent {name}", + model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url), + instruction=prompt_path.read_text(encoding="utf-8").strip(), + generate_content_config=GenerateContentConfig( + temperature=0.2, + top_p=0.9, + max_output_tokens=1024, + ), + ) + + +async def _run_one(agent: LlmAgent, user_text: str) -> str: + """跑一个 sub-agent 拿最终回答。每次新建 Runner / Session 给本 case 独立 state。""" + session_service = 
InMemorySessionService() + runner = Runner(app_name=APP_NAME, agent=agent, session_service=session_service) + session_id = str(uuid.uuid4()) + user_id = "pipeline" + await session_service.create_session( + app_name=APP_NAME, user_id=user_id, session_id=session_id, state={}, + ) + user_content = Content(role="user", parts=[Part.from_text(text=user_text)]) + + final_text = "" + async for event in runner.run_async( + user_id=user_id, session_id=session_id, new_message=user_content, + ): + if not event.is_final_response(): + continue + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + if part.thought: + continue + if part.text: + final_text += part.text + return final_text.strip() + + +async def invoke_pipeline(query: str) -> str: + """把 query 跑过整条链路,返回最终答复文本。 + + 流程: + 1. router 决定走 fact 还是 math 分支 + 2. 对应分支 sub-agent 给出中间答复 + 3. summarizer 把中间答复整理成最终答复 + + 每个 sub-agent 都重新构建(在 _create_sub_agent 内重读 prompt 文件), + 保证优化器写入候选后下一次调用即生效。 + """ + # 1. router:根据问题类型输出 fact / math 分类标签 + router = _create_sub_agent("router", ROUTER_PROMPT_PATH) + router_out = await _run_one( + router, + f"用户问题:{query}\n\n请只输出 fact 或 math 这两个词中的一个。", + ) + branch = "math" if "math" in router_out.lower() else "fact" + + # 2. 分支 sub-agent:根据 router 决策选 fact_agent 或 math_agent + if branch == "math": + branch_agent = _create_sub_agent("math_agent", MATH_AGENT_PROMPT_PATH) + else: + branch_agent = _create_sub_agent("fact_agent", FACT_AGENT_PROMPT_PATH) + intermediate = await _run_one(branch_agent, query) + + # 3. 
summarizer:把中间结果整理为最终答复 + summarizer = _create_sub_agent("summarizer", SUMMARIZER_PROMPT_PATH) + final_text = await _run_one( + summarizer, + f"用户问题:{query}\n\n上游 agent 给出的中间结果:{intermediate}\n\n" + "请整理后呈现最终答复。", + ) + return final_text diff --git a/examples/optimization/multi_agent_pipeline/pipeline/prompts/fact_agent.md b/examples/optimization/multi_agent_pipeline/pipeline/prompts/fact_agent.md new file mode 100644 index 00000000..6a621f33 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/pipeline/prompts/fact_agent.md @@ -0,0 +1 @@ +你是一个百科助手,回答用户的事实性问题,给出准确简洁的答案。 diff --git a/examples/optimization/multi_agent_pipeline/pipeline/prompts/math_agent.md b/examples/optimization/multi_agent_pipeline/pipeline/prompts/math_agent.md new file mode 100644 index 00000000..69415066 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/pipeline/prompts/math_agent.md @@ -0,0 +1 @@ +你是一个数学助手,请解答用户的算术题。 diff --git a/examples/optimization/multi_agent_pipeline/pipeline/prompts/router.md b/examples/optimization/multi_agent_pipeline/pipeline/prompts/router.md new file mode 100644 index 00000000..ead8bfce --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/pipeline/prompts/router.md @@ -0,0 +1 @@ +你是一个分发助手,请看用户问题然后回答 fact 或 math。 diff --git a/examples/optimization/multi_agent_pipeline/pipeline/prompts/summarizer.md b/examples/optimization/multi_agent_pipeline/pipeline/prompts/summarizer.md new file mode 100644 index 00000000..21f28d6f --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/pipeline/prompts/summarizer.md @@ -0,0 +1 @@ +你是一个回答整理助手。请把上游 agent 给出的中间结果整理后呈现给用户,让回答更友好易读。 diff --git a/examples/optimization/multi_agent_pipeline/run_optimization.py b/examples/optimization/multi_agent_pipeline/run_optimization.py new file mode 100644 index 00000000..7b43518c --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/run_optimization.py @@ -0,0 +1,105 @@ +# Tencent is pleased to support the open source community by making 
tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Multi-Agent Pipeline example 的优化器入口。 + +适用场景 +-------- +业务侧已编排好多 sub-agent 协作链路(router / 分支 worker / summarizer 等), +希望在不修改链路代码的前提下,对每个 sub-agent 的 prompt 进行联合优化。 + +这个文件做什么 +-------------- +1. 注册 4 个 prompt 文件作为 TargetPrompt 的 4 个独立字段 +2. 定义 call_agent 把 query 透传给整条 pipeline 链路 +3. 调 AgentOptimizer.optimize 跑 GEPA 多模块协同优化 + +怎么跑 +------ +1) 配 TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME +2) python examples/optimization/multi_agent_pipeline/run_optimization.py +3) 看 runs/<时间戳>/best_prompts/ 下 4 个 .md 文件 + +关键配置(详见 README §5) +-------------------------- +- module_selector="round_robin" : 每轮反思只改 1 个字段,便于归因 +- use_merge=true : 累积单字段改进后主动融合(多字段才有意义) +- reflection_history_top_k=3 : 多字段轮换时给反思 LM 更长历史 + +接入自有链路时改哪里 +-------------------- +- pipeline/orchestrator.py 中的 invoke_pipeline 替换为业务真实链路调用 + (HTTP / gRPC / 内部编排框架等任意形态) +- TargetPrompt.add_path 调整为业务各 sub-agent 实际读取的 prompt 文件路径 +- 若 prompt 在配置中心而非本地,把 add_path 替换为 add_callback + (参考 remote_prompt_store/ example) +""" + +from __future__ import annotations + +import asyncio +import sys +from datetime import datetime +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt + +from pipeline.orchestrator import ( + FACT_AGENT_PROMPT_PATH, + MATH_AGENT_PROMPT_PATH, + ROUTER_PROMPT_PATH, + SUMMARIZER_PROMPT_PATH, + invoke_pipeline, +) + + +CONFIG_PATH = _HERE / "optimizer.json" +TRAIN_PATH = _HERE / "train.evalset.json" +VAL_PATH = _HERE / "val.evalset.json" +RUNS_DIR = _HERE / "runs" + + +async def call_agent(query: str) -> str: + """框架回调:把 query 透传给整条 pipeline 链路,返回最终答复。""" + return await invoke_pipeline(query) + 
+ +async def main() -> None: + """组装 4 字段 TargetPrompt + 调 AgentOptimizer.optimize。""" + # 4 个 add_path 注册多字段优化目标。GEPA 把每个 key 视为独立 component, + # module_selector="round_robin" 让每轮只改其中 1 个,便于归因。 + target = ( + TargetPrompt() + .add_path("router", str(ROUTER_PROMPT_PATH)) + .add_path("fact_agent", str(FACT_AGENT_PROMPT_PATH)) + .add_path("math_agent", str(MATH_AGENT_PROMPT_PATH)) + .add_path("summarizer", str(SUMMARIZER_PROMPT_PATH)) + ) + + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / timestamp + + await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + update_source=False, + verbose=1, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/optimization/multi_agent_pipeline/train.evalset.json b/examples/optimization/multi_agent_pipeline/train.evalset.json new file mode 100644 index 00000000..ee203428 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/train.evalset.json @@ -0,0 +1,92 @@ +{ + "eval_set_id": "multi_agent_pipeline_train", + "name": "Multi-agent pipeline demo - train", + "description": "5 条混合 case:3 道事实题 + 2 道算术题;考核整条 pipeline 的端到端效果(路由 + 分支应答 + 总结格式)。", + "eval_cases": [ + { + "eval_id": "fact_capital_china", + "conversation": [ + { + "invocation_id": "t1", + "user_content": { + "parts": [{"text": "中国的首都是哪里?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:北京"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "fact_water_boil", + "conversation": [ + { + "invocation_id": "t2", + "user_content": { + "parts": [{"text": "在标准大气压下,水的沸点是多少摄氏度?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:100 摄氏度"}], + "role": "model" + } + } + ], + "session_input": {"app_name": 
"multi_agent_pipeline_demo", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "fact_solar_planets", + "conversation": [ + { + "invocation_id": "t3", + "user_content": { + "parts": [{"text": "太阳系有几大行星?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:8 颗"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "math_simple_add", + "conversation": [ + { + "invocation_id": "t4", + "user_content": { + "parts": [{"text": "3 加 5 等于多少?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:8"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "math_multiply", + "conversation": [ + { + "invocation_id": "t5", + "user_content": { + "parts": [{"text": "12 乘以 7 等于多少?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:84"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "trainer", "state": {}} + } + ] +} diff --git a/examples/optimization/multi_agent_pipeline/val.evalset.json b/examples/optimization/multi_agent_pipeline/val.evalset.json new file mode 100644 index 00000000..009ceab3 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/val.evalset.json @@ -0,0 +1,58 @@ +{ + "eval_set_id": "multi_agent_pipeline_val", + "name": "Multi-agent pipeline demo - validation", + "description": "3 条混合 case:2 道事实题 + 1 道算术题。", + "eval_cases": [ + { + "eval_id": "fact_great_wall", + "conversation": [ + { + "invocation_id": "v1", + "user_content": { + "parts": [{"text": "中国的长城最早是哪个朝代修建的?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:春秋"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "validator", "state": {}} + }, + { + "eval_id": "fact_largest_ocean", + "conversation": [ + { + 
"invocation_id": "v2", + "user_content": { + "parts": [{"text": "世界上最大的洋是哪个?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:太平洋"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "validator", "state": {}} + }, + { + "eval_id": "math_subtract", + "conversation": [ + { + "invocation_id": "v3", + "user_content": { + "parts": [{"text": "100 减去 37 等于多少?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:63"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "validator", "state": {}} + } + ] +} diff --git a/examples/optimization/multi_metric_with_judges/README.md b/examples/optimization/multi_metric_with_judges/README.md new file mode 100644 index 00000000..f029e7c5 --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/README.md @@ -0,0 +1,241 @@ +# Multi-Metric with Judges — 多 metric 与 multi-judge 集成 + +> **适用场景**:业务 agent 同时受多类约束(答案正确性硬约束 + 风格 / 安全 / 合规软约束),需要多条 metric 共同参与优化与早停判定,并希望通过多 judge 投票降低单 LLM 裁判的偏差。本 example 演示 `llm_final_response`(多 judge 投票)+ `llm_rubric_response`(单 judge 多 rubric)双 metric 共存、`frontier_type="hybrid"` 双层 Pareto 前沿、`stop.required_metrics` 显式列表的完整配置。阅读前请先熟悉 `quickstart/README.md` §2。 + +## 1 · 适用问题与设计目标 + +单 metric 优化在工程实践中往往不够: + +- "答案对就行"忽视格式 / 风格 / 合规约束,容易拿到"对但不能用"的回答 +- 单 LLM 裁判存在偏差(temperature 抖动、prompt 暗示、模型偏好),尤其在主观维度上 +- 不同 metric 反映不同业务诉求,应能在前沿上协同存在而非互相覆盖 + +本 example 的设计原则: + +- **硬约束 / 软约束分离**:`llm_final_response` 用 multi-judge `all_pass` 投票把关答案正确性;`llm_rubric_response` 用单 judge 多 rubric 评估格式 / 风格 +- **多 judge 投票降低偏差**:3 个 judge 在不同 temperature 下独立判断,全体通过才算 PASS +- **双层 Pareto 前沿**:`frontier_type="hybrid"` 同时维护 per-case 与 per-metric 前沿,避免"为了改 metric A 牺牲 metric B"的退化 +- **稳定评估**:`num_runs=2` 平滑 LLM 输出方差;`eval_case_parallelism=1` 控制 multi-judge 并发避免 rate limit + +| 输入 | 输出 | +| --- | --- | +| 多条 metric(每条独立 threshold + 独立判分逻辑) | 同时满足所有指定 metric 阈值的最优候选 | +| 
`stop.required_metrics` 中列出的"必须达标"的 metric 子集 | 严格的早停判定:列表中所有 metric 在 val 集上达标才提前终止 | + +### 本 example 演示的最小用例 + +| 维度 | 值 | +| --- | --- | +| 业务任务 | 数学辅导 agent,要求答案正确 + 风格规范(无 emoji、推理清晰、答案带单位) | +| 优化目标 | `agent/prompts/system.md` 单文件 | +| 验证指标 | `llm_final_response`(3 judge `all_pass`,threshold 1.0) + `llm_rubric_response`(4 rubric 均值,threshold 0.75) | +| 训练 / 验证规模 | 5 条 / 3 条 | + +## 2 · 术语对照 + +仅列出本 example 引入的新概念。基础术语见 `quickstart/README.md` §2。 + +| 术语 | 含义 | +| --- | --- | +| **`llm_final_response` metric** | 由 LLM 裁判判断 agent 输出是否与参考答案实质一致,输出 PASS / FAIL。可配置多个 judge 共同打分。 | +| **`llm_rubric_response` metric** | 单 LLM 裁判按多条 rubric(评分标准)独立打分后取均值;适合多维度软约束。 | +| **multi-judge** | `criterion.llm_judge.judge_models` 数组形式配置多个独立 judge,每个 judge 独立调用 LLM 给出判断。 | +| **`models_aggregator`** | 多 judge 结果的聚合策略,6 种取值(见 §5.2)。本 example 用 `all_pass`。 | +| **frontier_type** | Pareto 前沿的粒度。4 种取值:`instance`(按 case) / `objective`(按 metric) / `hybrid`(双层) / `cartesian`(按 case×metric)。多 metric 推荐 `hybrid`。 | +| **stop.required_metrics** | 框架层早停的 metric 子集声明。`"all"` / 列表 / `null` 三种形式。 | + +## 3 · 运行示例 + +### 3.1 安装依赖 + +```bash +pip install -e ".[optimize]" +``` + +### 3.2 配置环境变量 + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +agent、reflection LM、所有 judge 默认共用同一组凭据。需要让 judge 用独立模型时单独配置 `judge_model` 字段。 + +### 3.3 启动 + +```bash +python examples/optimization/multi_metric_with_judges/run_optimization.py +``` + +单次运行约 5–10 分钟。每条 case 一次评测约触发 (3 + 1) × 2 = 8 次 LLM 调用(3 个 judge × `num_runs=2` 加 1 个 rubric judge × `num_runs=2`)。 + +### 3.4 产物结构 + +与 quickstart 一致。`result.json` 中 `metric_breakdown` 字段会同时包含 `llm_final_response` 与 `llm_rubric_response` 两条独立分数。 + +## 4 · 架构与数据流 + +``` +每个 case 一次评测: +├── agent 输出 final_text +│ +├── llm_final_response (硬约束) +│ ├─ judge_1 (temperature=0.0) → valid / invalid +│ ├─ judge_2 (temperature=0.3) → valid / invalid +│ ├─ judge_3 (temperature=0.6) → valid / invalid +│ └─ aggregator: 
all_pass → 三个全 valid 才算 PASS(threshold=1.0) +│ +└── llm_rubric_response (软约束,单 judge 多 rubric) + ├─ rubric: no_emoji_or_slang → 0/1 + ├─ rubric: numeric_correct → 0/1 + ├─ rubric: reasoning_clear → 0/1 + └─ rubric: units_present → 0/1 + 平均分 = quality score(threshold=0.75 ≈ 4 条至少 3 条过) + +stop.required_metrics = ["llm_final_response", "llm_rubric_response"] + 两个 metric 都在 val 集上达 threshold 才提前停止 +``` + +### 4.1 文件清单 + +| 文件 | 角色 | 接入自有业务时的修改方向 | +| --- | --- | --- | +| `run_optimization.py` | 优化器入口 | 与 quickstart 同;多 metric 场景下基本不变 | +| `agent/agent.py` | LlmAgent 工厂 | 替换为业务 agent 构建逻辑 | +| `agent/prompts/system.md` | baseline prompt | 写入业务 baseline | +| `optimizer.json` | **核心改造点**:多 metric / multi-judge / hybrid frontier 配置 | 按业务 metric 数量与维度调整 | +| `train.evalset.json` / `val.evalset.json` | 数据集 | 替换为业务用例(reference 字段需配合 metric 类型) | + +## 5 · 关键配置 + +### 5.1 多 metric 与 multi-judge 配置示例 + +```jsonc +{ + "evaluate": { + "num_runs": 2, // 平滑 LLM 输出方差 + "metrics": [ + { + "metric_name": "llm_final_response", + "threshold": 1.0, + "criterion": { + "llm_final_response": { + "llm_judge": { + "judge_models": [ // 多 judge 数组:3 个独立 judge + { "...": "..." , "generation_config": { "temperature": 0.0 } }, + { "...": "..." , "generation_config": { "temperature": 0.3 } }, + { "...": "..." , "generation_config": { "temperature": 0.6 } } + ], + "models_aggregator": "all_pass" // 全 PASS 才算通过 + } + } + } + }, + { + "metric_name": "llm_rubric_response", + "threshold": 0.75, + "criterion": { + "llm_rubric_response": { + "llm_judge": { "judge_model": { "...": "..." } }, + "rubrics": [ + { "name": "no_emoji_or_slang", "description": "..." }, + { "name": "numeric_correct", "description": "..." }, + { "name": "reasoning_clear", "description": "..." }, + { "name": "units_present", "description": "..." 
} + ] + } + } + } + ] + }, + "optimize": { + "eval_case_parallelism": 1, + "stop": { + "required_metrics": ["llm_final_response", "llm_rubric_response"] + }, + "algorithm": { + "frontier_type": "hybrid", + "max_metric_calls": 30 + } + } +} +``` + +### 5.2 `models_aggregator` 6 种取值 + +按业务严格度从严到松排序: + +| aggregator | 通过条件 | 适用场景 | +| --- | --- | --- | +| `all_pass` | 全部 judge 判 PASS | 合规 / 安全场景,任何一票否决都拦截(最严格) | +| `weighted_majority` | 加权 PASS 票 > FAIL 票 | 不同 judge 信任度不同(如主 judge 权重 2、副 judge 权重 1) | +| `majority_pass` | 超过半数 judge 判 PASS | 多数表决 | +| `weighted_avg` | 加权均分 ≥ threshold | 多 judge 给的是连续分而非二元判断时 | +| `avg` | 简单均分 ≥ threshold | 多 judge 连续分简单平均 | +| `any_pass` | 至少一个 judge 判 PASS | 鼓励探索 / 高召回场景(最宽松) | + +### 5.3 `frontier_type` 4 种取值 + +| 取值 | 含义 | 适用 | +| --- | --- | --- | +| `instance` | 每个 case 维护一个 best 候选 | 单 metric 或简单业务 | +| `objective` | 每个 metric 维护一个 best 候选 | 多 metric 但 case 量少 | +| `hybrid` | 同时维护 case + metric 双层前沿 | **多 metric 真冲突场景**(本 example 推荐) | +| `cartesian` | 每个 (case, metric) 组合一个 best | 极复杂 / 调试用,常导致候选池爆炸 | + +`hybrid` 让 GEPA 在改进一个 metric 时不丢失另一个 metric 上的最佳候选,是多 metric 业务的默认推荐。 + +### 5.4 `stop.required_metrics` 3 种取值 + +| 取值 | 语义 | +| --- | --- | +| `"all"`(默认) | val 集上**所有** metric 都达 threshold 才早停 | +| `["m1", "m2"]` | 列出的 metric 全部达 threshold 才早停(其他 metric 仍参与评测但不影响早停) | +| `null` 或 `[]` | 不参与早停,仅靠算法层 budget / no-improvement / score_threshold 控制 | + +本 example 显式用列表形式列出两条 metric。当业务 metric 较多但只有部分作为早停门禁时,列表形式比 `"all"` 更精准。 + +### 5.5 `eval_case_parallelism` 与 multi-judge 的相互作用 + +multi-judge × `num_runs=2` 使每条 case 一次评测约 8 次 LLM 调用。若 `eval_case_parallelism=4`(默认)+ 训练集 5 case,单轮可能产生 ~40 个 judge 请求并发,容易撞 LLM 后端的 rate limit。本 example 设为 `1` 串行执行;业务可根据 LLM 后端 QPS 上调。 + +## 6 · 调试技巧 + +### 6.1 查看反思 LM 看到的多 metric 反馈 + +`run_optimization.py` 中 `verbose=1` 改为 `verbose=2`,gepa 内部日志会附带反思 prompt,可看到反思 LM 实际接收的 metric 反馈结构。 + +### 6.2 优雅停止 + +```bash +touch runs/<timestamp>/optimize.stop +``` + +## 7 · 常见问题 + +**Q:`all_pass` 是不是太严格了?** +A:取决于业务诉求。合规
/ 安全场景下 "any judge raises a red flag" 应该立即拦截,`all_pass` 是合理选择。质量评估场景可换 `majority_pass` 或 `weighted_avg`。 + +**Q:3 个 judge 用同一个模型只是 temperature 不同,有意义吗?** +A:有部分意义——不同 temperature 触发不同采样路径,可发现一些边界情况。但更稳健的做法是混用不同模型族(如 GPT + Claude + GLM),可显著降低同源偏差。 + +**Q:reflection LM 与 judge 用同一个模型会"自评"吗?** +A:`llm_rubric_response` 的 judge 看的是预定义 rubric 文本,受偏差影响较小。`llm_final_response` 的 judge 看 reference 答案做实质等价判断,相对客观。生产环境建议至少 judge 与 agent 模型不同源,参见 quickstart §5.1。 + +**Q:`num_runs` 调高会不会降低优化效率?** +A:会。`num_runs=2` 让每条 case 评测耗时翻倍,但能消除一定 LLM 输出方差(同一 prompt 同一 case 两次跑分不一致),通常对收敛稳定性有正向作用。`num_runs=1` 适合追求速度的早期实验,`num_runs=2–3` 适合接近收尾的精打磨。 + +**Q:rubric 数量太多怎么办?** +A:单 judge 一次打多 rubric 时若 rubric > 6–8 条,judge 输出质量下降明显。建议拆成多条 `llm_rubric_response` metric,每条聚焦 2–4 条相关 rubric。 + +## 8 · 接入自有业务的步骤 + +1. **梳理业务约束**:哪些是硬约束(必须通过)、哪些是软约束(按比例打分) +2. **选择 metric 类型**:硬约束用 `llm_final_response` + `all_pass`;软约束用 `llm_rubric_response` 多 rubric +3. **配置 multi-judge**:`judge_models` 数组形式;选择合适的 `models_aggregator` +4. **设置 `stop.required_metrics`**:列出哪些 metric 决定何时早停 +5. **启用 `frontier_type="hybrid"`**:多 metric 场景的默认推荐 +6. **调整数据集**:`evalset` 中的 `final_response` / `reference` 字段需配合 metric 类型 +7. **控制并发**:`eval_case_parallelism` 设为 LLM 后端能承受的 QPS / 单 case judge 调用数 +8. **运行并观察**:`result.json` 中 `metric_breakdown` 显示每条 metric 独立分数,便于诊断瓶颈 diff --git a/examples/optimization/multi_metric_with_judges/agent/__init__.py b/examples/optimization/multi_metric_with_judges/agent/__init__.py new file mode 100644 index 00000000..bc6e483f --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/agent/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
diff --git a/examples/optimization/multi_metric_with_judges/agent/agent.py b/examples/optimization/multi_metric_with_judges/agent/agent.py new file mode 100644 index 00000000..95373034 --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/agent/agent.py @@ -0,0 +1,48 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""数学辅导 agent —— Multi-Metric with Judges example 专用。 + +每次 create_agent() 重读 prompts/system.md,使优化器写入的新候选立即生效。 +单文件优化目标。 +""" + +from pathlib import Path + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import LLMModel +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.types import GenerateContentConfig + +from .config import get_model_config + + +SYSTEM_PROMPT_PATH = Path(__file__).parent / "prompts" / "system.md" + + +def _create_model() -> LLMModel: + """构建 OpenAI 兼容 chat 模型实例。凭据从环境变量读取。""" + api_key, base_url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url) + + +def _read_instruction() -> str: + """从磁盘重读 system.md。每次调用都重读,让优化器写入的候选立即生效。""" + return SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip() + + +def create_agent() -> LlmAgent: + """构建一个使用当前磁盘 prompt 的新 LlmAgent 实例。""" + return LlmAgent( + name="math_word_problem_agent", + description="Math word-problem solver under formality / safety constraints.", + model=_create_model(), + instruction=_read_instruction(), + generate_content_config=GenerateContentConfig( + temperature=0.2, + top_p=0.9, + max_output_tokens=2048, + ), + ) diff --git a/examples/optimization/multi_metric_with_judges/agent/config.py b/examples/optimization/multi_metric_with_judges/agent/config.py new file mode 100644 index 00000000..d0a64b15 --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/agent/config.py @@ -0,0 +1,33 @@ +# 
Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""模型凭据读取 —— 从环境变量加载 OpenAI 兼容 LLM 的连接信息。 + +需要的环境变量 +-------------- + TRPC_AGENT_API_KEY LLM 后端的 API key + TRPC_AGENT_BASE_URL LLM 后端的 endpoint + TRPC_AGENT_MODEL_NAME 模型名 + +缺任意一个就立即抛 ValueError,避免运行到一半才撞到 LLM 后端的 401 错误, +那时报错信息会很有迷惑性(看起来像 prompt 写错了,实际是凭据没配)。 +""" + +from __future__ import annotations + +import os + + +def get_model_config() -> tuple[str, str, str]: + """返回 (api_key, base_url, model_name);任一缺失立刻报错。""" + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + if not api_key or not base_url or not model_name: + raise ValueError( + "运行优化器前必须配置环境变量 TRPC_AGENT_API_KEY / " + "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME。" + ) + return api_key, base_url, model_name diff --git a/examples/optimization/multi_metric_with_judges/agent/prompts/system.md b/examples/optimization/multi_metric_with_judges/agent/prompts/system.md new file mode 100644 index 00000000..f03fc2d2 --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/agent/prompts/system.md @@ -0,0 +1 @@ +你是一个超有趣的数学小助手!🎉 学习数学应该是快乐的事情,所以请用轻松活泼的语气和孩子们交流。多用一些 emoji(比如 🚗、😊、💯、🎈)让回答看起来更生动;遇到结果不错的题目可以适当夸夸学生("哇塞"、"棒棒哒"、"超厉害"),让他们更有成就感! 
diff --git a/examples/optimization/multi_metric_with_judges/optimizer.json b/examples/optimization/multi_metric_with_judges/optimizer.json new file mode 100644 index 00000000..5978afd1 --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/optimizer.json @@ -0,0 +1,100 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "llm_final_response", + "threshold": 1.0, + "criterion": { + "llm_judge": { + "judge_models": [ + { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 512, "temperature": 0.0 } + }, + { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 512, "temperature": 0.3 } + }, + { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 512, "temperature": 0.6 } + } + ], + "models_aggregator": "all_pass", + "parallel": true + } + } + }, + { + "metric_name": "llm_rubric_response", + "threshold": 0.75, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 1024, "temperature": 0.2 } + }, + "rubrics": [ + { + "id": "no_emoji_or_slang", + "content": { "text": "回答中**不包含**任何 emoji(如 🎉、😊、💯、🚗 等图形字符),且**不使用**夸张俚语和娱乐化口吻(如「哇塞」「棒棒哒」「超厉害」「咋」「呀」「啦」等)。文字应当简洁、克制、专业。" }, + "description": "回答严肃、不含 emoji 和俚语", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "numeric_correct", + "content": { "text": "最终给出的数字答案是否与参考答案一致(数值相同即视为通过)。" }, + "description": "数字答案与参考答案一致", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "reasoning_clear", + "content": { "text": "回答中是否给出清晰、可追溯的推理或计算步骤。" }, + "description": "推理步骤清晰", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "units_present", + 
"content": { "text": "最终数字答案是否带有正确的单位(例如:个、元、公里、千克、人 等)。" }, + "description": "答案带正确单位", + "type": "FINAL_RESPONSE_QUALITY" + } + ] + } + } + } + ], + "num_runs": 2 + }, + "optimize": { + "eval_case_parallelism": 1, + "stop": { + "required_metrics": ["llm_final_response", "llm_rubric_response"] + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 4096, "temperature": 0.6 } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "frontier_type": "hybrid", + "reflection_minibatch_size": 3, + "skip_perfect_score": false, + "max_metric_calls": 30, + "max_iterations_without_improvement": 3 + } + } +} diff --git a/examples/optimization/multi_metric_with_judges/run_optimization.py b/examples/optimization/multi_metric_with_judges/run_optimization.py new file mode 100644 index 00000000..b43e6d1d --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/run_optimization.py @@ -0,0 +1,123 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Multi-Metric with Judges example 的优化器入口。 + +适用场景 +-------- +业务 agent 同时受多类约束(答案正确性硬约束 + 风格 / 安全 / 合规软约束), +需要多条 metric 共同参与优化与早停判定,并希望通过多 judge 投票降低单 +LLM 裁判的偏差。 + +这个文件做什么 +-------------- +1. 注册单字段 TargetPrompt(agent/prompts/system.md) +2. 定义 call_agent 用当前 prompt 跑一次推理 +3. 
async def call_agent(query: str) -> str:
    """Run one inference for the optimizer and return the final answer text.

    Every call builds a new agent through ``create_agent()`` together with its
    own ``Runner`` and ``InMemorySessionService``, so each evaluated case gets
    an isolated session and concurrent evaluations do not share session state.
    (Per the original author's note, ``create_agent()`` re-reads the current
    ``system.md`` on every call; that helper is defined in another module.)

    Args:
        query: User input text for the case being evaluated.

    Returns:
        The concatenated text of all non-thought parts found in
        final-response events, with surrounding whitespace stripped.
    """
    llm_agent = create_agent()

    sessions = InMemorySessionService()
    engine = Runner(app_name=APP_NAME, agent=llm_agent, session_service=sessions)
    conversation_id = str(uuid.uuid4())
    caller_id = "optimizer"
    await sessions.create_session(
        app_name=APP_NAME,
        user_id=caller_id,
        session_id=conversation_id,
        state={},
    )
    message = Content(role="user", parts=[Part.from_text(text=query)])

    pieces = []
    async for event in engine.run_async(
        user_id=caller_id,
        session_id=conversation_id,
        new_message=message,
    ):
        # Only final-response events that actually carry parts contribute.
        if event.is_final_response() and event.content and event.content.parts:
            # Skip "thought" parts and parts without text.
            pieces.extend(
                part.text
                for part in event.content.parts
                if not part.thought and part.text
            )
    return "".join(pieces).strip()


async def main() -> None:
    """Register the system prompt as the target and run ``AgentOptimizer.optimize``."""
    prompt_target = TargetPrompt().add_path("system_prompt", str(SYSTEM_PROMPT_PATH))

    # Each run writes into its own timestamp-named directory under RUNS_DIR.
    run_dir = RUNS_DIR / datetime.now().strftime("%Y-%m-%dT%H-%M-%S")

    await AgentOptimizer.optimize(
        config_path=str(CONFIG_PATH),
        call_agent=call_agent,
        target_prompt=prompt_target,
        train_dataset_path=str(TRAIN_PATH),
        validation_dataset_path=str(VAL_PATH),
        output_dir=str(run_dir),
        update_source=False,
        verbose=1,
    )
[{"text": "一辆汽车以每小时 60 公里的速度行驶 2.5 小时,一共行驶了多少公里?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:150 公里"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_discount_price", + "conversation": [ + { + "invocation_id": "t3", + "user_content": { + "parts": [{"text": "一件衣服原价 200 元,现在打 8 折出售,折后价是多少元?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:160 元"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_glasses_percent", + "conversation": [ + { + "invocation_id": "t4", + "user_content": { + "parts": [{"text": "班里一共有 40 名学生,其中 25% 戴眼镜,戴眼镜的有多少人?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:10 人"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_defect_items_percent", + "conversation": [ + { + "invocation_id": "t5", + "user_content": { + "parts": [{"text": "一批商品共 50 件,其中 30% 是次品,次品有多少件?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:15 件"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "trainer", + "state": {} + } + } + ] +} diff --git a/examples/optimization/multi_metric_with_judges/val.evalset.json b/examples/optimization/multi_metric_with_judges/val.evalset.json new file mode 100644 index 00000000..4a2a9262 --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/val.evalset.json @@ -0,0 +1,70 @@ +{ + "eval_set_id": "multi_metric_val", + "name": "Multi-metric demo - validation", + "description": "3 道小学算术应用题;用于每轮全量评估、决定候选是否被接受。", + "eval_cases": [ + { + "eval_id": "wp_seats_multiply", + "conversation": [ + { + "invocation_id": "v1", + "user_content": { + "parts": 
[{"text": "教室里有 5 排座位,每排 8 个,一共多少个座位?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:40 个"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_water_weight", + "conversation": [ + { + "invocation_id": "v2", + "user_content": { + "parts": [{"text": "已知 1 升水重 1 千克,3.5 升水重多少千克?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:3.5 千克"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_class_girls_percent", + "conversation": [ + { + "invocation_id": "v3", + "user_content": { + "parts": [{"text": "班里一共有 30 人,其中 60% 是女生,请问有多少名女生?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:18 人"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "validator", + "state": {} + } + } + ] +} diff --git a/examples/optimization/quickstart/README.md b/examples/optimization/quickstart/README.md new file mode 100644 index 00000000..d08fbbfa --- /dev/null +++ b/examples/optimization/quickstart/README.md @@ -0,0 +1,213 @@ +# Quickstart — `AgentOptimizer` 入门示例 + +> **适用场景**:首次使用 `AgentOptimizer`,需要在最小完整流程下理解 prompt 自动优化的输入、输出与基本工作机制。本文档是后续 9 个 example 的前置阅读材料,所有进阶 example(HTTP 服务接入、远端 prompt 源、多 agent 链路等)默认假设读者已熟悉本文涉及的概念。 + +## 1 · 适用问题与设计目标 + +迭代 prompt 是 LLM agent 工程中重复成本最高的环节之一:手动改 prompt → 重跑评估 → 根据失败用例再改,循环数十次。`AgentOptimizer` 将该循环自动化: + +| 输入 | 输出 | +| --- | --- | +| 一个支持热加载 prompt 的 agent | 满足 metric 阈值的最优 prompt 候选集 | +| 训练集(反思样本来源) + 验证集(候选评分依据) | `result.json`(机器可读)+ `summary.txt`(人类可读)+ 每轮过程产物 | +| 一组 metric(精确匹配 / 正则 / LLM 裁判 / 多 metric 组合) | baseline → best 的端到端分数对比 | + +底层算法采用 **GEPA**(reflective prompt mutation),由一个独立的 reflection LLM 检视 agent 在训练集上的失败用例,生成候选 prompt;候选先在验证集上全量评估,再与历史 Pareto 前沿比较,决定是否接受。 + +### 本 example 演示的最小用例 
+ +| 维度 | 值 | +| --- | --- | +| 业务任务 | 小学算术应用题求解 | +| 优化目标 | `agent/prompts/system.md`(角色定义) + `agent/prompts/skill.md`(解题方法论) | +| 验证指标 | `final_response_avg_score`(精确匹配,阈值 1.0) + `llm_rubric_response`(三条评分标准均值,阈值 0.66) | +| 训练 / 验证规模 | 5 条 / 3 条 | + +`system.md` 与 `skill.md` 的 baseline 内容刻意制造冲突(前者要求"只输出答案",后者要求"展开推理"),以确保 GEPA 必须改写至少其中一个文件才能让两条 metric 同时达标——这一设计让反思机制的作用对读者直接可见。 + +## 2 · 术语对照 + +下列术语在后续章节首次出现时不再展开解释,请先建立认知。 + +| 术语 | 含义 | +| --- | --- | +| **GEPA** | Genetic-Pareto reflective prompt optimization。本 SDK 默认且当前唯一收录的优化算法。 | +| **TargetPrompt** | 声明哪些 prompt 字段会被优化器读写的注册表。每个字段对应一个本地文件(`add_path`)或一对异步 `read/write` 回调(`add_callback`)。 | +| **call_agent** | 用户提供的回调,签名固定为 `async def(query: str) -> str`。框架通过它驱动 agent 完成单次推理。 | +| **eval set** | 评估用例集合。`train` 用于反思 minibatch 抽样,`val` 用于决定候选是否接受、是否触发早停。 | +| **metric** | 通过 / 失败标准,可组合使用。本 SDK 内置 `final_response_avg_score`、`llm_final_response`、`llm_rubric_response`、`trajectory_avg_score` 等。 | +| **reflection LM** | 负责检视失败用例并生成新候选 prompt 的 LLM。可与 agent 共用模型,亦可独立配置。 | +| **judge model** | LLM 裁判,按 rubric 对 agent 输出打分。 | +| **minibatch** | 每轮反思从 train 集随机抽取的若干 case,用于驱动 reflection LM。 | +| **Pareto 前沿** | 在任一 metric 上是历史最优的候选都被保留,作为下一轮反思的潜在亲本。 | +| **stop condition** | 终止优化的判定条件。SDK 同时支持算法层(budget / no-improvement / score threshold 等)与框架层(`stop.required_metrics`)两类。 | + +## 3 · 运行示例 + +### 3.1 安装可选依赖 + +```bash +pip install -e ".[optimize]" +``` + +`optimize` extra 包含 `gepa`(反思算法实现)与 `rich`(终端进度面板)。`rich` 缺失时进度面板自动降级为纯文本。 + +### 3.2 配置环境变量 + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +默认情况下 agent、reflection LM、judge model 共用同一组凭据。如需为 judge 配置独立模型,参见 §5.1。 + +### 3.3 启动 + +```bash +python examples/optimization/quickstart/run_optimization.py +``` + +终端将依序输出:baseline 评估分数 → 每轮 GEPA 反思的接受 / 拒绝记录 → 收尾摘要(含 `stop_reason`)。 + +### 3.4 产物结构 + +``` +runs// +├── result.json 完整运行记录,机器可读 +├── summary.txt 人类可读的总览 +├── baseline_prompts/ 运行前的 prompt 快照(用于回滚与对照) 
+├── best_prompts/ val 集上得分最高的候选 +└── rounds/ 每轮的反思 prompt、候选文本、评估明细 +``` + +推荐阅读顺序:先看 `summary.txt` 了解总体走向,再用 `diff -r baseline_prompts/ best_prompts/` 查看 prompt 实际变更内容。 + +## 4 · 架构与数据流 + +``` + ┌────────────────────────────────┐ + │ AgentOptimizer.optimize(...) │ + └───────────────┬────────────────┘ + │ + ┌───────────────────────┼─────────────────────────────┐ + ▼ ▼ ▼ + baseline 评估 GEPA 主循环 收尾产物 + ───────────── ─────────── ──────── + 当前 prompt 每轮: best_prompts/ + 在 val 集上的 ① module_selector 选定字段 result.json + 起始分数 ② 抽 train minibatch summary.txt + ③ reflection LM 生成新候选 rounds/*.json + ④ 候选在 val 集上全量评估 + ⑤ 与 Pareto 前沿比较 + ⑥ 接受 / 拒绝候选 + ⑦ 触发停止条件检查 +``` + +### 4.1 文件清单 + +| 文件 | 角色 | 接入自有业务时的修改方向 | +| --- | --- | --- | +| `run_optimization.py` | 入口脚本,注册 `TargetPrompt`、定义 `call_agent` | 替换 `call_agent` 实现以驱动自有 agent | +| `agent/agent.py` | `LlmAgent` 工厂,每次调用重读 prompt | 替换为自有 agent 构建逻辑(模型、tools、output schema 等) | +| `agent/prompts/system.md` | 角色定义 prompt(GEPA 写入目标) | 写入业务 baseline;可作为初始版本起点 | +| `agent/prompts/skill.md` | 方法论 prompt(GEPA 写入目标) | 单字段优化时可整体删除 | +| `optimizer.json` | 算法 + metric 配置 | 调整 metric 类型、阈值、停止条件 | +| `train.evalset.json` | 反思 minibatch 来源 | 替换为业务训练用例 | +| `val.evalset.json` | 候选评分依据 | 替换为业务验证用例 | + +### 4.2 prompt 拆分的设计动机 + +将 prompt 拆为 `system.md` 与 `skill.md` 两个独立文件,对应 `TargetPrompt` 的两个 key(`"system_prompt"` 与 `"skill"`)。`module_selector="round_robin"` 配置下,GEPA 每轮仅改写其中一个文件,便于: + +- **归因**:可直接定位是哪个文件的改动带来分数提升 +- **稳定性**:单字段改动比多字段同改更易被验证集接受 +- **演示价值**:刻意冲突的 baseline 强制 GEPA 至少改写一个文件,否则 metric 无法同时达标 + +若业务只需优化单文件 prompt,移除第二个 `add_path` 调用即可。 + +## 5 · 关键配置 + +`optimizer.json` 中以下字段直接影响优化效率与产物质量: + +| 字段 | 默认 / 本 example 值 | 影响 | +| --- | --- | --- | +| `evaluate.num_runs` | 1 | 每条 case 的推理次数。提高至 2–3 可平滑 LLM 输出方差,代价为评估耗时线性增长 | +| `optimize.eval_case_parallelism` | 2 | 单批 case 的最大并发推理数。LLM 后端有 QPS 限制时需调小 | +| `optimize.stop.required_metrics` | `"all"` | 框架层早停:`"all"` = 所有 metric 达标;列表 = 仅指定 metric 达标即可;`null`/`[]` = 完全交由算法层停止条件决定 | +| 
`optimize.algorithm.reflection_minibatch_size` | 3 | 每轮反思的 case 数。过小会导致反思素材单调;过大单轮耗时增加 | +| `optimize.algorithm.skip_perfect_score` | `false` | 是否跳过已满分的 case。小训练集建议保持 `false`,否则 minibatch 容易反复抽到同一条 case | +| `optimize.algorithm.max_metric_calls` | 60 | 累计 case 评估次数上限,控制总开销的主要手段 | +| `optimize.algorithm.max_iterations_without_improvement` | 8 | 连续 N 轮 val 分无提升即提前停止 | + +> JSON 标准不支持 `//` 注释,配置文件中使用本表说明替代行内注释。 + +### 5.1 分离 judge 模型 + +`optimizer.json` 中 `evaluate.metrics[*].criterion.llm_judge.judge_model` 可独立配置,与 agent 凭据互不影响: + +```json +"judge_model": { + "provider_name": "openai", + "model_name": "", + "api_key": "", + "base_url": "" +} +``` + +### 5.2 启用 reflection / judge 的思考模式 + +`OptimizeModelOptions` 与 `JudgeModelOptions` 均支持三态 `think` 字段: + +| 取值 | 行为 | +| --- | --- | +| `null`(默认) | 沿用模型默认配置,不做修改 | +| `true` | 注入 `BuiltInPlanner(ThinkingConfig(include_thoughts=True, thinking_budget=-1))`,并在 `http_options.extra_body` 写入 `chat_template_kwargs.enable_thinking=true`(兼容 GLM 等 OpenAI 兼容后端) | +| `false` | 显式关闭思考模式 | + +## 6 · 运行控制 + +### 6.1 优雅停止 + +`Ctrl+C` 可能截断当前轮的产物文件。需要中途收尾时建议改用 stop 文件: + +```bash +touch runs//optimize.stop +``` + +下一次 stopper 检查时框架立即收尾,所有已完成轮次的 artifact 完整落盘,`OptimizeResult.stop_reason` 标记为 `user_requested_stop`。 + +### 6.2 update_source 的语义 + +`AgentOptimizer.optimize(update_source=False)`(默认)下源 prompt 文件保持不变,最优候选仅写入 `runs//best_prompts/`。若需在优化成功后直接覆盖源文件(典型于 CI/CD 闭环场景,参见 `ci_integration/` example),将该参数置为 `True`。 + +## 7 · 常见问题 + +**Q:多轮对话 case 在优化时是否保留上下文?** +A:默认不保留。`call_agent` 每次调用使用独立的 `Runner + InMemorySessionService`。需要真实多轮上下文时,需在 `call_agent` 内部自行维护 session 状态——典型做法是用 `contextvars.ContextVar` 存放当前 case 的 history(`asyncio.Task` 启动时自动 `copy_context`,并发评估下天然按 task 隔离)。 + +**Q:reflection LM 与 agent 共用模型是否会引入"自评"偏差?** +A:`llm_rubric_response` 让 judge 依据预定义 rubric 文本打分,而非自由评价,可缓解大部分偏差。生产环境建议 judge 配置独立模型,参见 §5.1。 + +**Q:`best_prompts/` 中的文件就是最终产物吗?** +A:是 val 集上得分最高的候选。`update_source=False` 时源文件不变,需手动复制;`update_source=True` 时源文件被自动覆盖(仅在 
`OptimizeResult.status=SUCCEEDED` 时触发)。 + +**Q:运行结束后 baseline 与 best 分数无变化(无收敛)该如何排查?** +A:按以下顺序检查: +1. baseline prompt 是否过于简单,导致 reflection LM 无明确改进方向 +2. `reflection_minibatch_size` 是否过小,导致反思素材单调 +3. metric 阈值是否设置过高(如 1.0 要求全 case 完美匹配) +4. 直接阅读 `runs//rounds/round_*.json` 中的 reflection LM 原始输出,常可定位具体原因 + +**Q:单次运行的开销估算?** +A:本 example 默认配置下约 5 分钟、约 60 次 LLM 调用。`max_metric_calls=60` 是硬性上限,超出立即停止。 + +## 8 · 接入自有 agent 的步骤 + +1. 替换 `agent/prompts/*.md` 为业务 baseline prompt +2. 修改 `agent/agent.py` 中 `create_agent()` 实现,对接业务模型 / tools / output schema +3. 替换 `train.evalset.json` 与 `val.evalset.json` 为业务用例 +4. 调整 `optimizer.json` 中 metric 类型与阈值 +5. 运行 `run_optimization.py`,根据 `summary.txt` 与 `result.json` 决定是否继续调参 + +若业务 agent 的形态不同于本 example(HTTP 服务、远端 prompt 源、多 agent 编排、CLI 黑盒等),请参考 `examples/optimization/` 下对应专题示例。 diff --git a/examples/optimization/quickstart/agent/__init__.py b/examples/optimization/quickstart/agent/__init__.py new file mode 100644 index 00000000..bc6e483f --- /dev/null +++ b/examples/optimization/quickstart/agent/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. diff --git a/examples/optimization/quickstart/agent/agent.py b/examples/optimization/quickstart/agent/agent.py new file mode 100644 index 00000000..f30bd8a7 --- /dev/null +++ b/examples/optimization/quickstart/agent/agent.py @@ -0,0 +1,103 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
def _create_model() -> LLMModel:
    """Build the OpenAI-compatible chat model used by the agent.

    Credentials come from ``get_model_config()``; any exception it raises
    propagates to the caller unchanged.
    """
    key, endpoint, model_id = get_model_config()
    return OpenAIModel(model_name=model_id, api_key=key, base_url=endpoint)


def _read_instruction() -> str:
    """Assemble the full instruction from the two prompt files on disk.

    Both files are read on every call (UTF-8, surrounding whitespace
    stripped) and joined with a ``## 解题方法`` heading between them, so a
    prompt file rewritten between calls is picked up by the next call.
    """
    system_text, skill_text = (
        prompt_file.read_text(encoding="utf-8").strip()
        for prompt_file in (SYSTEM_PROMPT_PATH, SKILL_PATH)
    )
    return f"{system_text}\n\n## 解题方法\n{skill_text}"


def _create_agent_with_prompts(instruction: str) -> LlmAgent:
    """Construct an ``LlmAgent`` around an already-assembled instruction.

    Keeping prompt loading out of this function lets callers pass an
    instruction string directly instead of going through the prompt files.

    Args:
        instruction: Complete instruction text for the agent.

    Returns:
        A newly constructed ``LlmAgent``.
    """
    chat_model = _create_model()
    sampling = GenerateContentConfig(
        temperature=0.2,
        top_p=0.9,
        max_output_tokens=2048,
    )
    return LlmAgent(
        name="math_word_problem_agent",
        description=(
            "小学算术应用题求解 agent。system prompt 与 skill prompt 由 GEPA "
            "反思机制联合优化。"
        ),
        model=chat_model,
        instruction=instruction,
        generate_content_config=sampling,
    )


def create_agent() -> LlmAgent:
    """Return a new ``LlmAgent`` built from the prompts currently on disk."""
    current_instruction = _read_instruction()
    return _create_agent_with_prompts(current_instruction)
def get_model_config() -> tuple[str, str, str]:
    """Return ``(api_key, base_url, model_name)`` read from the environment.

    Reads ``TRPC_AGENT_API_KEY``, ``TRPC_AGENT_BASE_URL`` and
    ``TRPC_AGENT_MODEL_NAME``. Surrounding whitespace is stripped, so a
    whitespace-only value counts as missing instead of slipping past the
    fail-fast check and failing later at the LLM backend.

    Returns:
        The three stripped values, in the order api_key, base_url, model_name.

    Raises:
        ValueError: If any variable is unset, empty or whitespace-only. The
            message keeps the original text and appends the names of the
            variables that are actually missing.
    """
    names = ("TRPC_AGENT_API_KEY", "TRPC_AGENT_BASE_URL", "TRPC_AGENT_MODEL_NAME")
    values = [os.getenv(name, "").strip() for name in names]
    missing = [name for name, value in zip(names, values) if not value]
    if missing:
        raise ValueError(
            "运行优化器前必须配置环境变量 TRPC_AGENT_API_KEY / "
            "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME。"
            f"当前缺失:{', '.join(missing)}"
        )
    api_key, base_url, model_name = values
    return api_key, base_url, model_name
"criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "num_samples": 1, + "generation_config": { + "max_tokens": 1024, + "temperature": 0.2 + } + }, + "rubrics": [ + { + "id": "numeric_correct", + "content": { + "text": "最终给出的数字答案是否与参考答案一致(数值相同即视为通过)。" + }, + "description": "数字答案与参考答案一致", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "reasoning_clear", + "content": { + "text": "回答中是否给出清晰、可追溯的推理或计算步骤,让读者能复核结果。" + }, + "description": "推理步骤清晰", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "units_present", + "content": { + "text": "最终数字答案是否带有正确的单位(例如:个、元、公里、千克、人 等)。" + }, + "description": "答案带正确单位", + "type": "FINAL_RESPONSE_QUALITY" + } + ] + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { + "max_tokens": 4096, + "temperature": 0.6 + } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "reflection_minibatch_size": 3, + "skip_perfect_score": false, + "max_metric_calls": 60, + "max_iterations_without_improvement": 8 + } + } +} diff --git a/examples/optimization/quickstart/run_optimization.py b/examples/optimization/quickstart/run_optimization.py new file mode 100644 index 00000000..1111c692 --- /dev/null +++ b/examples/optimization/quickstart/run_optimization.py @@ -0,0 +1,167 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
async def call_agent(query: str) -> str:
    """Drive the agent once with the current prompts and return its answer.

    This is the callback handed to ``AgentOptimizer.optimize``.

    Implementation notes:
        1. ``create_agent()`` is called on every invocation, so the agent is
           always built from the prompt files as they are at that moment.
        2. A new ``Runner`` and ``InMemorySessionService`` are created per
           call, so every case starts from an empty session state and
           concurrent evaluations do not share sessions.
        3. Only text from final-response events is collected, and parts
           flagged as ``thought`` are skipped.

    Args:
        query: User input text for the case being evaluated.

    Returns:
        The agent's final answer as plain text, stripped of surrounding
        whitespace.
    """
    # Prompt files are re-read inside create_agent().
    fresh_agent = create_agent()

    # One session service per call keeps session state isolated per case.
    sessions = InMemorySessionService()
    engine = Runner(
        app_name=APP_NAME,
        agent=fresh_agent,
        session_service=sessions,
    )

    conversation_id = str(uuid.uuid4())
    caller_id = "optimizer"
    await sessions.create_session(
        app_name=APP_NAME,
        user_id=caller_id,
        session_id=conversation_id,
        state={},
    )
    message = Content(role="user", parts=[Part.from_text(text=query)])

    # Gather answer fragments, then join once at the end.
    fragments = []
    async for event in engine.run_async(
        user_id=caller_id,
        session_id=conversation_id,
        new_message=message,
    ):
        if event.is_final_response() and event.content and event.content.parts:
            fragments.extend(
                part.text
                for part in event.content.parts
                if not part.thought and part.text  # drop thinking output
            )
    return "".join(fragments).strip()


async def main() -> None:
    """Register both prompt files as targets and run ``AgentOptimizer.optimize``."""
    # Two optimization targets: the system prompt and the skill prompt.
    prompt_target = (
        TargetPrompt()
        .add_path("system_prompt", str(SYSTEM_PROMPT_PATH))
        .add_path("skill", str(SKILL_PATH))
    )

    # Each run writes into its own timestamp-named directory under RUNS_DIR.
    run_dir = RUNS_DIR / datetime.now().strftime("%Y-%m-%dT%H-%M-%S")

    await AgentOptimizer.optimize(
        config_path=str(CONFIG_PATH),
        call_agent=call_agent,
        target_prompt=prompt_target,
        train_dataset_path=str(TRAIN_PATH),
        validation_dataset_path=str(VAL_PATH),
        output_dir=str(run_dir),
        # Per the original author's notes: False leaves the source prompt
        # files untouched and writes the best candidate only under
        # output_dir/best_prompts/; True overwrites the sources after a
        # successful run.
        update_source=False,
        # Per the original author's notes: 0 = silent, 1 = progress panel,
        # 2 = additionally emit gepa diagnostic logs.
        verbose=1,
    )
"user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_glasses_percent", + "conversation": [ + { + "invocation_id": "t4", + "user_content": { + "parts": [{"text": "班里一共有 40 名学生,其中 25% 戴眼镜,戴眼镜的有多少人?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:10 人"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "math_word_problem_optimizer", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_defect_items_percent", + "conversation": [ + { + "invocation_id": "t5", + "user_content": { + "parts": [{"text": "一批商品共 50 件,其中 30% 是次品,次品有多少件?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:15 件"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "math_word_problem_optimizer", + "user_id": "trainer", + "state": {} + } + } + ] +} diff --git a/examples/optimization/quickstart/val.evalset.json b/examples/optimization/quickstart/val.evalset.json new file mode 100644 index 00000000..9408bb11 --- /dev/null +++ b/examples/optimization/quickstart/val.evalset.json @@ -0,0 +1,70 @@ +{ + "eval_set_id": "math_word_problems_val", + "name": "小学算术应用题 - 验证集", + "description": "3 道小学水平算术应用题,覆盖乘法、单位换算和百分比;final_response 中带「答案:xxx」的标准答句供 contains 匹配,同时作为 LLM 裁判的参考答案。", + "eval_cases": [ + { + "eval_id": "wp_seats_multiply", + "conversation": [ + { + "invocation_id": "v1", + "user_content": { + "parts": [{"text": "教室里有 5 排座位,每排 8 个,一共多少个座位?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:40 个"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "math_word_problem_optimizer", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_water_weight", + "conversation": [ + { + "invocation_id": "v2", + "user_content": { + "parts": [{"text": "已知 1 升水重 1 千克,3.5 升水重多少千克?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:3.5 千克"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "math_word_problem_optimizer", 
+ "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_class_girls_percent", + "conversation": [ + { + "invocation_id": "v3", + "user_content": { + "parts": [{"text": "班里一共有 30 人,其中 60% 是女生,请问有多少名女生?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:18 人"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "math_word_problem_optimizer", + "user_id": "validator", + "state": {} + } + } + ] +} diff --git a/examples/optimization/remote_prompt_store/README.md b/examples/optimization/remote_prompt_store/README.md new file mode 100644 index 00000000..5ae37fba --- /dev/null +++ b/examples/optimization/remote_prompt_store/README.md @@ -0,0 +1,208 @@ +# Remote Prompt Store — 接入远端配置中心做 prompt 优化 + +> **适用场景**:业务 prompt 不在本地文件,而由 ops 配在远端配置中心(七彩石 / Apollo / Nacos / 自研 KV / 数据库),业务服务从中心拉取使用。本 example 演示通过 `TargetPrompt.add_callback` 将优化器对接远端读写接口,并通过 production / sandbox 双 namespace 隔离生产数据。阅读前请先熟悉 `quickstart/README.md` 与 `http_service/README.md`。 + +## 1 · 适用问题与设计目标 + +远端 prompt 场景与本地文件场景的关键差异: + +- 优化器无法直接读写本地文件——必须通过用户提供的 async 函数操作远端 +- 生产 prompt 通常承担线上流量,未经审批的写入意味着合规风险 +- 不同环境(生产 / 沙箱 / 灰度)的 prompt 通常已经存在 namespace 隔离机制 + +本 example 的设计原则: + +- **优化器只读写沙箱 namespace**,生产 namespace 全程不被触碰 +- **`update_source=False` 强制约束**:跑完后沙箱自动回滚到 baseline,候选只输出到本地 `runs//best_prompts/`,由人工审批后另行同步到生产 +- **配置中心实现透明**:用户提供两个 async 函数(`read` / `write`),优化器对 KV 后端形态完全黑盒 + +| 输入 | 输出 | +| --- | --- | +| 一对 async 函数:`async read() -> str` 与 `async write(value: str) -> None` | 沙箱 namespace 中的最优 prompt 候选副本(runs/best_prompts/) | +| 沙箱 namespace 的写入权限 | 生产 namespace 不变;沙箱在收尾时自动回滚到 baseline | + +### 本 example 演示的最小用例 + +| 维度 | 值 | +| --- | --- | +| 业务任务 | 算术应用题求解(与 quickstart 同一类任务) | +| 远端 KV 模拟 | `store/fake_kv_store.py` 用本地 JSON 文件持久化的字典 | +| 优化目标 | `system_prompt` 字段,存储于 `system_prompt:sandbox` 这个 KV key | +| 验证指标 | `final_response_avg_score`(contains 匹配) | +| 训练 / 验证规模 | 5 条 / 3 条 | + +## 2 · 术语对照 + +仅列出本 example 引入的新概念。基础术语见 
`quickstart/README.md` §2,`call_agent` async 资源约束见 `http_service/README.md` §2。 + +| 术语 | 含义 | +| --- | --- | +| **TargetPrompt.add_callback(name, read=, write=)** | 注册一个由用户函数驱动的 prompt 字段。`read` / `write` 必须是 async 函数;`read` 无参数返回 prompt 文本,`write` 接收新文本并写入。优化器在评测前调 `read`、产生新候选时调 `write`。 | +| **生产 / 沙箱 namespace** | 配置中心常见的环境隔离形态。本 example 用两个固定 KV key 模拟:`system_prompt:production`(线上读取)与 `system_prompt:sandbox`(优化器写入)。 | +| **自动回滚** | `update_source=False` 时优化器在 `finally` 阶段调用 `write` 把字段还原为运行开始时通过 `read` 获取的 baseline 快照,避免沙箱被遗留的候选污染。 | + +## 3 · 运行示例 + +### 3.1 安装依赖 + +```bash +pip install -e ".[optimize]" +``` + +### 3.2 配置环境变量 + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +### 3.3 启动 + +```bash +python examples/optimization/remote_prompt_store/run_optimization.py +``` + +启动时脚本会先调 `reset_store(...)` 把 production / sandbox 都初始化为 baseline。**这一步仅用于演示**——真实业务中生产 namespace 已由 ops 维护,无需重置。 + +### 3.4 产物结构 + +``` +runs// +├── result.json +├── summary.txt +├── baseline_prompts/ 运行前从 KV 读取的 baseline 快照 +├── best_prompts/ val 集得分最高的候选(待人工审批) +└── rounds/ + +store/store.json KV 持久化文件(演示用) + 收尾时 sandbox key 已被回滚到 baseline + production key 全程未变 +``` + +## 4 · 架构与数据流 + +``` +[配置中心 KV] + ├── "system_prompt:production" ← 线上服务读这里(永远不被优化器触碰) + └── "system_prompt:sandbox" ← 优化器读 / 写这里 + +[run_optimization.py] + │ + ├── reset_store(BASELINE_PROMPT) 演示前置:production = sandbox = baseline + │ (真实业务跳过此步) + │ + ├── TargetPrompt.add_callback( + │ "system_prompt", + │ read=read_sandbox_prompt, async () -> str 读 sandbox key + │ write=write_sandbox_prompt, async (str) -> None 写 sandbox key + │ ) + │ + ├── call_agent(query): + │ prompt_text = await read_sandbox_prompt() # 现读现用 + │ agent = create_agent(prompt_text) # 即时构造 + │ return await runner.run_async(...) # 跑一次推理 + │ + └── AgentOptimizer.optimize(update_source=False, ...) 
+ ├── 每轮把候选 prompt 写入 sandbox key + ├── 收尾:sandbox key 自动回滚到 baseline 快照 + └── best_prompts/ 落本地,待人工审批 +``` + +### 4.1 文件清单 + +| 文件 | 角色 | 接入自有业务时的修改方向 | +| --- | --- | --- | +| `run_optimization.py` | 优化器入口,注册 callback | `reset_store(...)` 改为 ops 真实初始化(或直接删除);其余基本不变 | +| `agent/agent.py` | LlmAgent 工厂,prompt 通过参数注入 | 替换为业务 agent 构建逻辑 | +| `store/prompt_client.py` | async `read` / `write` 函数定义 | **核心改造点**:把内部实现替换为业务配置中心 SDK 调用,函数签名保持不变 | +| `store/fake_kv_store.py` | 本地 JSON 文件模拟 KV | 真实业务下整体删除 | +| `optimizer.json` | 算法 + metric 配置 | 与 quickstart 一致 | +| `train.evalset.json` / `val.evalset.json` | 数据集 | 替换为业务用例 | + +### 4.2 与 `http_service/` 的对照 + +唯一差异在 `TargetPrompt` 的注册方式: + +```python +# http_service:prompt 在本地文件 +target = TargetPrompt().add_path("system_prompt", "service/prompts/system.md") + +# remote_prompt_store:prompt 在远端 KV +target = TargetPrompt().add_callback( + "system_prompt", + read=read_sandbox_prompt, + write=write_sandbox_prompt, +) +``` + +`optimizer.json`、`call_agent` 的整体结构、metric 定义、产物 layout 均保持一致。 + +## 5 · 关键配置 + +### 5.1 `update_source` 的强制约束 + +远端 prompt 场景下**强烈建议始终保持 `update_source=False`**。理由: + +- 远端配置通常承担线上流量,自动写回意味着未审批变更直接进生产 +- 即便沙箱 namespace 也有联调 / 灰度等隐式约束,应避免让框架替业务做"提交"决策 +- `update_source=False` 时优化器收尾会把沙箱回滚到 baseline,唯一遗留物是本地 `best_prompts/`,由人工或审批工具决定后续动作 + +### 5.2 `read` / `write` 的实现约束 + +| 约束 | 说明 | +| --- | --- | +| 签名必须是 async | `read: async () -> str`;`write: async (str) -> None` | +| `read` 异常处理 | 优化器启动期会调一次 `read` 获取 baseline 快照。该次调用抛错会让 `optimize()` 直接 fail-fast,异常透传给调用方。运行中 `read` 抛错会导致当前 case 评测失败 | +| `write` 幂等性 | 优化器收尾时会再次调 `write` 把沙箱回滚到 baseline;若 `write` 不幂等或无事务保护,回滚可能失败。建议实现支持重复调用同一 value | +| 重试 | 配置中心 SDK 通常有内置重试;本 example 的 `read` / `write` 不额外封装重试,业务方按需自行加上 | + +## 6 · 接入真实配置中心 + +将 `store/prompt_client.py` 内部实现替换为业务 SDK 调用,**保持函数签名不变**: + +```python +# store/prompt_client.py 替换示例 +async def read_sandbox_prompt() -> str: + return await your_config_sdk.get( + namespace="sandbox", + key="system_prompt", + ) 
+ +async def write_sandbox_prompt(value: str) -> None: + await your_config_sdk.put( + namespace="sandbox", + key="system_prompt", + value=value, + ) +``` + +`run_optimization.py` 中 `TargetPrompt.add_callback(...)` 调用与其他配置无需修改。 + +`fake_kv_store.py` 在真实接入后可整体删除。 + +## 7 · 常见问题 + +**Q:业务服务在另一个进程,优化器写入沙箱后服务能感知吗?** +A:取决于业务服务的 prompt 加载策略。**业务服务必须在每次请求时重新拉配置**(即"热加载"),否则优化器的写入对服务不可见、反思循环失效。这是与 `http_service/` example 完全相同的约束,只是介质从本地文件换成了远端 KV。 + +**Q:`reset_store(BASELINE_PROMPT)` 在生产环境也要调吗?** +A:不要。该调用仅用于演示首次接入时把 KV 初始化到已知状态。真实业务的生产 namespace 已由 ops 维护,优化器**只关心读 / 写沙箱**。 + +**Q:`read` 一次返回的内容会被缓存吗?** +A:不会。优化器在每次评测候选前都重新调 `read`,因此沙箱被写入新值后下一次 `call_agent` 立即生效。 + +**Q:跑完后如何同步候选到生产?** +A:本 example 的产物 `best_prompts/system_prompt.md` 为人工审批起点。建议的工作流:人工 review → 通过审批工具调用业务自有 SDK 把候选写入 production namespace(不通过本框架)。 + +**Q:能否优化多个远端字段?** +A:可以。`TargetPrompt` 支持多次 `add_callback`,每次注册一组独立的 `read` / `write`。多字段联合优化的算法层配置参见 `multi_agent_pipeline/` example。 + +## 8 · 接入自有业务的步骤 + +1. **替换 `store/prompt_client.py`**:实现 `read_sandbox_prompt` / `write_sandbox_prompt` 调用业务配置中心 SDK +2. **删除 `reset_store(...)` 调用** 或改为业务真实初始化逻辑 +3. **修改 `agent/agent.py`**:对接业务模型 / tools / output schema +4. **替换数据集**:`train.evalset.json` / `val.evalset.json` +5. **保持 `update_source=False`**:合规约束 +6. **运行**:观察 `summary.txt` 与 `result.json`;最优候选位于 `runs/<时间戳>/best_prompts/`,由人工审批后通过业务自有流程同步到生产 diff --git a/examples/optimization/remote_prompt_store/agent/__init__.py b/examples/optimization/remote_prompt_store/agent/__init__.py new file mode 100644 index 00000000..bc6e483f --- /dev/null +++ b/examples/optimization/remote_prompt_store/agent/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0.
diff --git a/examples/optimization/remote_prompt_store/agent/agent.py b/examples/optimization/remote_prompt_store/agent/agent.py new file mode 100644 index 00000000..d651214d --- /dev/null +++ b/examples/optimization/remote_prompt_store/agent/agent.py @@ -0,0 +1,48 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""数学题求解 agent —— Remote Prompt Store example 专用。 + +与 quickstart / http_service 的关键差异 +--------------------------------------- +本 agent **不读 prompt 文件**——prompt 通过 create_agent(prompt_text) 的 +入参传入。call_agent 在每次调用时先从远端 KV 拉最新 prompt,再用它 +构造 agent 实例。 + +这种"prompt 通过参数注入"的形态是远端 KV 场景的自然写法:业务服务 +在每次请求时从配置中心拉 prompt,再创建 agent,不依赖任何本地文件。 +""" + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import LLMModel +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.types import GenerateContentConfig + +from .config import get_model_config + + +def _create_model() -> LLMModel: + """构建 OpenAI 兼容 chat 模型实例。凭据从环境变量读取。""" + api_key, base_url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url) + + +def create_agent(prompt_text: str) -> LlmAgent: + """用给定 prompt 文本构造一个 LlmAgent 实例。 + + 参数 prompt_text 由调用方(call_agent)从远端 KV 现读现传, + 所以优化器把候选写入 KV 后下一次调用立即生效。 + """ + return LlmAgent( + name="math_word_problem_agent", + description="Math word-problem solver whose prompt lives in a remote KV store.", + model=_create_model(), + instruction=prompt_text, + generate_content_config=GenerateContentConfig( + temperature=0.2, + top_p=0.9, + max_output_tokens=2048, + ), + ) diff --git a/examples/optimization/remote_prompt_store/agent/config.py b/examples/optimization/remote_prompt_store/agent/config.py new file mode 100644 index 00000000..d0a64b15 --- /dev/null +++ b/examples/optimization/remote_prompt_store/agent/config.py @@ 
-0,0 +1,33 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""模型凭据读取 —— 从环境变量加载 OpenAI 兼容 LLM 的连接信息。 + +需要的环境变量 +-------------- + TRPC_AGENT_API_KEY LLM 后端的 API key + TRPC_AGENT_BASE_URL LLM 后端的 endpoint + TRPC_AGENT_MODEL_NAME 模型名 + +缺任意一个就立即抛 ValueError,避免运行到一半才撞到 LLM 后端的 401 错误, +那时报错信息会很有迷惑性(看起来像 prompt 写错了,实际是凭据没配)。 +""" + +from __future__ import annotations + +import os + + +def get_model_config() -> tuple[str, str, str]: + """返回 (api_key, base_url, model_name);任一缺失立刻报错。""" + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + if not api_key or not base_url or not model_name: + raise ValueError( + "运行优化器前必须配置环境变量 TRPC_AGENT_API_KEY / " + "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME。" + ) + return api_key, base_url, model_name diff --git a/examples/optimization/remote_prompt_store/optimizer.json b/examples/optimization/remote_prompt_store/optimizer.json new file mode 100644 index 00000000..03f74a59 --- /dev/null +++ b/examples/optimization/remote_prompt_store/optimizer.json @@ -0,0 +1,45 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "contains", + "case_insensitive": true + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { + "max_tokens": 4096, + "temperature": 0.6 + } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + 
"reflection_minibatch_size": 3, + "skip_perfect_score": false, + "max_metric_calls": 40, + "score_threshold": 1.0, + "max_iterations_without_improvement": 5 + } + } +} diff --git a/examples/optimization/remote_prompt_store/run_optimization.py b/examples/optimization/remote_prompt_store/run_optimization.py new file mode 100644 index 00000000..bf770db7 --- /dev/null +++ b/examples/optimization/remote_prompt_store/run_optimization.py @@ -0,0 +1,161 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Remote Prompt Store example 的优化器入口。 + +适用场景 +-------- +业务 prompt 不在本地文件,而由 ops 配在远端配置中心(七彩石 / Apollo / +Nacos / 自研 KV)。本脚本演示通过 TargetPrompt.add_callback 接入用户提供 +的 async read / write 函数读写远端,并通过 production / sandbox 双 +namespace 隔离生产数据。 + +这个文件做什么 +-------------- +1. (演示用)reset_store 把 production + sandbox 都初始化为 baseline +2. 注册 add_callback:优化器通过 read_sandbox_prompt / write_sandbox_prompt + 异步函数与沙箱 namespace 交互 +3. 定义 call_agent:每次调用先从 KV 拉最新 prompt 再构造 agent +4. 调 AgentOptimizer.optimize 跑 GEPA 反思循环 +5. 收尾时打印生产 / 沙箱 namespace 的状态变化 + +怎么跑 +------ +1) 配 TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME +2) python examples/optimization/remote_prompt_store/run_optimization.py +3) 看 runs/<时间戳>/best_prompts/system_prompt.md(待人工审批) + +接入自有配置中心时改哪里 +------------------------ +- 删除 reset_store(...) 
调用(真实业务下生产 namespace 已由 ops 维护) +- 替换 store/prompt_client.py 中 read/write 函数的内部实现为业务 SDK 调用 +- update_source=False 严格保持(防生产被未审批变更覆盖) +- 跑完后由人工审批工具把 best_prompts/ 同步到生产 +""" + +from __future__ import annotations + +import asyncio +import sys +import uuid +from datetime import datetime +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + +from agent.agent import create_agent +from store.prompt_client import ( + PROMPT_KEY_PRODUCTION, + read_production_prompt, + read_sandbox_prompt, + reset_store, + write_sandbox_prompt, +) + + +CONFIG_PATH = _HERE / "optimizer.json" +TRAIN_PATH = _HERE / "train.evalset.json" +VAL_PATH = _HERE / "val.evalset.json" +RUNS_DIR = _HERE / "runs" +APP_NAME = "remote_prompt_store_demo_agent" + +# 演示用 baseline。真实业务里这一步对应"ops 已经在生产 KV 配好 prompt"。 +BASELINE_PROMPT = ( + "你是一个友好的聊天助手,喜欢和用户分享想法。回答用户问题时," + "请尽量用生动、富有人情味的语言,让用户感觉像是在和朋友聊天。\n" +) + + +async def call_agent(query: str) -> str: + """框架回调:从沙箱 KV 拉最新 prompt → 构造 agent → 跑一次推理。 + + 每次调用都重读 KV,保证优化器写入新候选后立即生效。每次新建 + Runner + InMemorySessionService 给每个 case 独立的 session state, + 并发评测时不互相污染。 + """ + prompt_text = await read_sandbox_prompt() + agent = create_agent(prompt_text) + + session_service = InMemorySessionService() + runner = Runner(app_name=APP_NAME, agent=agent, session_service=session_service) + session_id = str(uuid.uuid4()) + user_id = "optimizer" + await session_service.create_session( + app_name=APP_NAME, user_id=user_id, session_id=session_id, state={}, + ) + user_content = Content(role="user", parts=[Part.from_text(text=query)]) + + 
final_text = "" + async for event in runner.run_async( + user_id=user_id, session_id=session_id, new_message=user_content, + ): + if not event.is_final_response(): + continue + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + if part.thought: + continue + if part.text: + final_text += part.text + return final_text.strip() + + +async def main() -> None: + """组装 TargetPrompt(add_callback)+ 调 AgentOptimizer.optimize。""" + # 演示前置:把 KV 重置到"ops 刚配好生产 prompt + 同步到沙箱"的初始态。 + # 真实业务下不需要这一步——业务方的生产 KV 已经有 prompt。 + reset_store(BASELINE_PROMPT) + + # 用 add_callback 而非 add_path:优化器通过两个异步函数与沙箱交互, + # KV 后端形态对优化器完全黑盒。 + target = TargetPrompt().add_callback( + "system_prompt", + read=read_sandbox_prompt, + write=write_sandbox_prompt, + ) + + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / timestamp + + result = await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + # 远端 prompt 场景下严格保持 False:跑完自动把沙箱回滚到 baseline, + # 生产 namespace 永远不被触碰。最佳候选写到 output_dir/best_prompts/, + # 由人工审批后通过单独脚本 / 工单流程同步到生产。 + update_source=False, + verbose=1, + ) + + # 演示"审批后同步"工作流:实际生产中下方逻辑由独立审批工具触发。 + print("\n=== 优化已完成 ===") + print(f"baseline → best : {result.baseline_pass_rate:.4f} → {result.best_pass_rate:.4f}") + production_text = await read_production_prompt() + sandbox_text = await read_sandbox_prompt() + print(f"\n[KV] production ({PROMPT_KEY_PRODUCTION}) 内容长度: {len(production_text)} 字 (未变)") + print(f"[KV] sandbox 已自动回滚到 baseline,长度: {len(sandbox_text)} 字") + print(f"\n请在 {output_dir}/best_prompts/system_prompt.md 查看最佳候选;") + print("人工审批通过后,再调用 store.prompt_client 中的工具同步到生产。") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/optimization/remote_prompt_store/store/__init__.py 
b/examples/optimization/remote_prompt_store/store/__init__.py new file mode 100644 index 00000000..bc6e483f --- /dev/null +++ b/examples/optimization/remote_prompt_store/store/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. diff --git a/examples/optimization/remote_prompt_store/store/fake_kv_store.py b/examples/optimization/remote_prompt_store/store/fake_kv_store.py new file mode 100644 index 00000000..f6944204 --- /dev/null +++ b/examples/optimization/remote_prompt_store/store/fake_kv_store.py @@ -0,0 +1,53 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""演示用:本地 JSON 文件模拟远端配置中心。 + +真实业务下这一层换成七彩石 / Apollo / Nacos / 自研 KV 的 SDK 即可, +对外暴露的 read(key) / write(key, value) 同步 API 保持不变。 + +数据 schema +----------- +store.json 是一个 {key: value} 字典,本 example 用两个固定 key: + "system_prompt:production" 生产 namespace 的 prompt + "system_prompt:sandbox" 沙箱 namespace 的 prompt(优化器读写) +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + + +class FakeKVStore: + """JSON 文件持久化的 KV,简化版的远端配置中心。""" + + def __init__(self, path: Path) -> None: + self._path = path + if not self._path.exists(): + self._path.write_text("{}", encoding="utf-8") + + def _load(self) -> dict[str, Any]: + return json.loads(self._path.read_text(encoding="utf-8")) + + def _save(self, data: dict[str, Any]) -> None: + self._path.write_text( + json.dumps(data, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + + def read(self, key: str) -> str: + """读 KV;缺失时抛 KeyError,模拟远端"配置不存在"。""" + data = self._load() + if key not in data: + raise KeyError(f"prompt key not found in store: {key}") + return str(data[key]) + + def 
write(self, key: str, value: str) -> None: + """覆盖式写入。""" + data = self._load() + data[key] = value + self._save(data) diff --git a/examples/optimization/remote_prompt_store/store/prompt_client.py b/examples/optimization/remote_prompt_store/store/prompt_client.py new file mode 100644 index 00000000..2291cecb --- /dev/null +++ b/examples/optimization/remote_prompt_store/store/prompt_client.py @@ -0,0 +1,85 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""prompt KV 的 async 访问层 —— TargetPrompt.add_callback 期望的实现。 + +适用场景 +-------- +本文件是 add_callback 接入远端配置中心的**核心模板**。函数签名严格匹配 +add_callback 的协议:read 是 async () -> str,write 是 async (str) -> None。 + +namespace 隔离设计 +------------------ +- production:业务线上读取的 prompt,**永远不被优化器写入** +- sandbox:优化器读 / 写的工作 namespace;update_source=False 时优化器 + 在收尾阶段把 sandbox 自动回滚到 baseline 快照 + +接入自有配置中心时改哪里 +------------------------ +保持三个公开 async 函数的签名不变,把内部实现从 FakeKVStore 替换为 +业务真实 SDK 调用: + + async def read_sandbox_prompt() -> str: + return await your_config_sdk.get(namespace="sandbox", key="system_prompt") + + async def write_sandbox_prompt(value: str) -> None: + await your_config_sdk.put(namespace="sandbox", key="system_prompt", value=value) + +run_optimization.py 中 add_callback 调用无需修改。 +""" + +from __future__ import annotations + +import asyncio +from pathlib import Path + +from .fake_kv_store import FakeKVStore + + +# 演示用:本地 JSON 文件持久化的 KV。真实业务里这一层换成配置中心 SDK +# 的全局 client(如 _CFG_CLIENT = your_sdk.Client(...)),不再依赖本文件。 +_STORE_PATH = Path(__file__).resolve().parent / "store.json" +_KV = FakeKVStore(_STORE_PATH) + +PROMPT_KEY_PRODUCTION = "system_prompt:production" +PROMPT_KEY_SANDBOX = "system_prompt:sandbox" + + +async def read_sandbox_prompt() -> str: + """从沙箱 namespace 读 prompt——优化器评测候选时调用。 + + add_callback 期望此函数无参数返回当前 prompt 文本。 + """ + # 真实场景下走网络请求;这里 await
asyncio.sleep(0) 模拟一次 await + # 切点,让协程在 KV 调用处可被调度。 + await asyncio.sleep(0) + return _KV.read(PROMPT_KEY_SANDBOX) + + +async def write_sandbox_prompt(value: str) -> None: + """写入沙箱 namespace——优化器落候选 / 收尾回滚 baseline 都走这里。 + + add_callback 期望此函数接受新 prompt 文本,无返回值。 + 实现需保证幂等性:优化器收尾时会再次调本函数把 sandbox 写回 baseline, + 不幂等的写入可能导致回滚失败。 + """ + await asyncio.sleep(0) + _KV.write(PROMPT_KEY_SANDBOX, value) + + +async def read_production_prompt() -> str: + """读生产 namespace 的 prompt——首次接入时用它初始化沙箱。""" + await asyncio.sleep(0) + return _KV.read(PROMPT_KEY_PRODUCTION) + + +def reset_store(production_prompt: str) -> None: + """演示用:把 KV 初始化到 production / sandbox 都为给定 prompt 的状态。 + + 真实业务下不应调本函数——业务的生产 namespace 由 ops 维护, + 优化器只关心读 / 写沙箱。 + """ + _KV.write(PROMPT_KEY_PRODUCTION, production_prompt) + _KV.write(PROMPT_KEY_SANDBOX, production_prompt) diff --git a/examples/optimization/remote_prompt_store/store/store.json b/examples/optimization/remote_prompt_store/store/store.json new file mode 100644 index 00000000..37998635 --- /dev/null +++ b/examples/optimization/remote_prompt_store/store/store.json @@ -0,0 +1,4 @@ +{ + "system_prompt:production": "你是一个友好的聊天助手,喜欢和用户分享想法。回答用户问题时,请尽量用生动、富有人情味的语言,让用户感觉像是在和朋友聊天。\n", + "system_prompt:sandbox": "你是一个友好的聊天助手,喜欢和用户分享想法。回答用户问题时,请尽量用生动、富有人情味的语言,让用户感觉像是在和朋友聊天。\n" +} \ No newline at end of file diff --git a/examples/optimization/remote_prompt_store/train.evalset.json b/examples/optimization/remote_prompt_store/train.evalset.json new file mode 100644 index 00000000..1d2cb465 --- /dev/null +++ b/examples/optimization/remote_prompt_store/train.evalset.json @@ -0,0 +1,112 @@ +{ + "eval_set_id": "remote_prompt_store_train", + "name": "Remote prompt store demo - train", + "description": "5 道小学算术应用题;agent 的 prompt 通过远端 KV callback 读写。", + "eval_cases": [ + { + "eval_id": "wp_apples_add", + "conversation": [ + { + "invocation_id": "t1", + "user_content": { + "parts": [{"text": "小明早上买了 4 个苹果,下午又买了 7 个苹果,他一共有多少个苹果?"}], + "role": "user" + }, + 
"final_response": { + "parts": [{"text": "答案:11 个"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_car_distance", + "conversation": [ + { + "invocation_id": "t2", + "user_content": { + "parts": [{"text": "一辆汽车以每小时 60 公里的速度行驶 2.5 小时,一共行驶了多少公里?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:150 公里"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_discount_price", + "conversation": [ + { + "invocation_id": "t3", + "user_content": { + "parts": [{"text": "一件衣服原价 200 元,现在打 8 折出售,折后价是多少元?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:160 元"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_glasses_percent", + "conversation": [ + { + "invocation_id": "t4", + "user_content": { + "parts": [{"text": "班里一共有 40 名学生,其中 25% 戴眼镜,戴眼镜的有多少人?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:10 人"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_defect_items_percent", + "conversation": [ + { + "invocation_id": "t5", + "user_content": { + "parts": [{"text": "一批商品共 50 件,其中 30% 是次品,次品有多少件?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:15 件"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "trainer", + "state": {} + } + } + ] +} diff --git a/examples/optimization/remote_prompt_store/val.evalset.json b/examples/optimization/remote_prompt_store/val.evalset.json new file mode 100644 index 00000000..e3036b0c --- /dev/null +++ 
b/examples/optimization/remote_prompt_store/val.evalset.json @@ -0,0 +1,70 @@ +{ + "eval_set_id": "remote_prompt_store_val", + "name": "Remote prompt store demo - validation", + "description": "3 道小学算术应用题;用于每轮全量评估、决定候选是否被接受。", + "eval_cases": [ + { + "eval_id": "wp_seats_multiply", + "conversation": [ + { + "invocation_id": "v1", + "user_content": { + "parts": [{"text": "教室里有 5 排座位,每排 8 个,一共多少个座位?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:40 个"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_water_weight", + "conversation": [ + { + "invocation_id": "v2", + "user_content": { + "parts": [{"text": "已知 1 升水重 1 千克,3.5 升水重多少千克?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:3.5 千克"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_class_girls_percent", + "conversation": [ + { + "invocation_id": "v3", + "user_content": { + "parts": [{"text": "班里一共有 30 人,其中 60% 是女生,请问有多少名女生?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:18 人"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "validator", + "state": {} + } + } + ] +} diff --git a/examples/optimization/slo_runtime_control/README.md b/examples/optimization/slo_runtime_control/README.md new file mode 100644 index 00000000..b21b8e2d --- /dev/null +++ b/examples/optimization/slo_runtime_control/README.md @@ -0,0 +1,218 @@ +# SLO Runtime Control — 多重停止条件下的运行时 SLO 守门 + +> **适用场景**:在 CI 流水线 / 夜间窗口等具有硬性时间和资源约束的环境下运行 prompt 优化,需要"任何一个 SLO 触发都立刻停"的多重停止策略。本 example 演示同时启用 SDK 提供的全部 6 种 algorithm-level stop conditions,并通过任务设计让任一条件都有机会成为最先触发者。阅读前请先熟悉 `quickstart/README.md` §2。 + +## 1 · 适用问题与设计目标 + +线上业务跑 prompt 优化的现实约束: + +- **时间预算硬性**:CI 流水线必须 N 
分钟内结束,超出即失败 +- **调用预算硬性**:LLM 后端配额按月计算,单次优化不能跑爆预算 +- **候选池规模**:内存 / 评估时间随候选池增长,需要上限 +- **早停灵敏度**:连续若干轮无改善应主动放弃,不耗费剩余预算 + +单一停止条件无法同时覆盖以上诉求。SDK 提供的 6 种 algorithm-level stop conditions 满足"OR 语义"——任意一条触发即停止,使业务可以叠加多重 SLO。 + +| 输入 | 输出 | +| --- | --- | +| 6 种 stop condition 的阈值组合 | 满足最先触发条件的最优候选 | +| `OptimizeResult.stop_reason` 字段 | 哪条 SLO 抢闸的明确反馈 | + +### 本 example 演示的最小用例 + +| 维度 | 值 | +| --- | --- | +| 业务任务 | 客服工单分类(输入工单文本,输出 `{category, priority}` JSON) | +| 优化目标 | `agent/prompts/system.md` 单文件 | +| 验证指标 | `final_response_avg_score`(exact 匹配规范化 JSON) | +| 训练 / 验证规模 | 8 case / 4 case | +| 任务难度 | 训练集中混入 3 道边界混淆题,使 score_threshold 不会先抢闸,能观察其他 stopper 真实行为 | + +## 2 · 术语对照 + +仅列出本 example 引入的新概念。基础术语见 `quickstart/README.md` §2。 + +| 术语 | 含义 | +| --- | --- | +| **algorithm-level stop condition** | GEPA 算法内部的停止判定(如预算、超时、无改善),写在 `optimizer.json` 的 `algorithm` 段。 | +| **framework-level metric stop** | 优化器框架基于 metric 阈值的早停判定,写在 `optimizer.json` 的 `optimize.stop` 段(如 `required_metrics`)。 | +| **OR 语义** | 多个 stop condition 同时启用时,**任意一条触发即停止**。这是本 example 的核心机制。 | +| **抢闸** | 在多 stop condition OR 语义下,最先满足条件的那条决定最终 `stop_reason`。 | +| **完成当前轮再停** | timeout 等条件触发后框架不会立即 kill 当前 round,而是等当前 round 完成(避免候选数据丢失/污染)。 | + +## 3 · 运行示例 + +### 3.1 安装依赖 + +```bash +pip install -e ".[optimize]" +``` + +### 3.2 配置环境变量 + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +### 3.3 启动 + +```bash +python examples/optimization/slo_runtime_control/run_optimization.py +``` + +终端将输出每轮分数与最终 `stop_reason`,明确告知是哪条 SLO 触发了停止。 + +### 3.4 产物结构 + +``` +runs// +├── result.json 其中 stop_reason 字段标识抢闸者 +├── summary.txt +├── baseline_prompts/ +├── best_prompts/ +└── rounds/ +``` + +## 4 · 架构与数据流 + +``` +optimizer.optimize() + │ + ├─ baseline 评估 + │ + └─ for each round: + ├─ GEPA 反思 → candidate prompt + ├─ 写入 system.md + ├─ EvalConfig 触发 call_agent for each train sample + │ └─ create_agent() → Runner.run_async() → _normalize_response() + │ ↓ + │ 
final_response_avg_score(text.match=exact) + │ + └─ 6 个 stopper 在每轮结束时检查(OR 语义): + wall_clock ≥ 90s ? + metric_calls ≥ 30 ? + no_improvement ≥ 3 轮 ? + best_score ≥ 1.0 ? + proposals ≥ 12 ? + tracked_candidates ≥ 5 ? + ↓ + 任意一条满足 → 立即收尾,stop_reason 写入 OptimizeResult +``` + +### 4.1 文件清单 + +| 文件 | 角色 | 接入自有业务时的修改方向 | +| --- | --- | --- | +| `run_optimization.py` | 优化器入口(含 `_normalize_response`) | 与 quickstart 同 | +| `agent/agent.py` | LlmAgent 工厂 | 替换为业务 agent | +| `agent/prompts/system.md` | baseline prompt | 写入业务 baseline | +| `optimizer.json` | **核心改造点**:6 stop condition 阈值组合 | 按业务 SLO 调整每条阈值 | +| `train.evalset.json` / `val.evalset.json` | 数据集 | 替换为业务用例 | + +## 5 · 6 种 stop condition 详解 + +| 字段 | 本 example 值 | gepa 内部映射 | 抢闸条件 | 适用场景 | +| --- | --- | --- | --- | --- | +| `max_metric_calls` | 30 | `MaxMetricCallsStopper` | 累计 case 评估次数 ≥ 30 | LLM 配额硬上限 | +| `max_iterations_without_improvement` | 3 | `NoImprovementStopper` | 连续 N 轮 best valset 无提升 | 优化已收敛或陷入局部最优时主动放弃 | +| `timeout_seconds` | 90.0 | `TimeoutStopCondition` | wall-clock ≥ N 秒 | CI 流水线时间窗硬约束 | +| `score_threshold` | 1.0 | `ScoreThresholdStopper` | best valset pass_rate ≥ 阈值 | 已达业务目标,无需继续 | +| `max_candidate_proposals` | 12 | `MaxCandidateProposalsStopper` | reflection LM 累计提议次数 ≥ N | 限制反思 LM 调用预算 | +| `max_tracked_candidates` | 5 | `MaxTrackedCandidatesStopper` | Pareto 前沿候选池大小 ≥ N | 控制内存与 merge 候选空间规模 | + +### 5.1 至少配 1 个 + +`optimizer.json` 中至少配置上述 6 个字段中的 1 个,否则框架启动期 `_require_at_least_one_stop_condition` 报错。**多个同时启用即 OR 语义**——任一触发立即停止。 + +### 5.2 显式禁用 framework-level metric 早停 + +```jsonc +{ + "optimize": { + "stop": { + "required_metrics": [] // 显式禁用框架层 metric 早停 + }, + "algorithm": { + "max_metric_calls": 30, + "max_iterations_without_improvement": 3, + "timeout_seconds": 90.0, + "score_threshold": 1.0, + "max_candidate_proposals": 12, + "max_tracked_candidates": 5 + } + } +} +``` + +`required_metrics: []` 让 6 个 algorithm 级 stopper 独占 stop 决策权——避免框架层在 algorithm 层之前提前终止,影响对底层 stopper 行为的观察。 + 
+业务真实使用时是否禁用 framework-level 早停取决于诉求: + +- 仅关心 algorithm-level 时序与开销控制 → 禁用(本 example 的选择) +- 同时关心 metric 是否达标 → 启用 `required_metrics: "all"` 或具体 metric 列表(参见 quickstart §5) + +## 6 · 关键配置 + +### 6.1 timeout 不是 hard kill + +`timeout_seconds=90` 触发后框架不会立即 kill 当前正在跑的 round,而是等当前 round 结束。实际终止时间通常超过设定值。原因:中途 kill 会导致候选数据丢失 / 文件写入截断。 + +**业务面应对**: + +- 若 SLO 是**硬截止**(如 CI 流水线必须 N 分钟内结束),把 `timeout_seconds` 设为真实窗口的一半左右留出缓冲 +- 单轮典型耗时由 LLM 调用速度决定。可通过缩小 `reflection_minibatch_size` 或增大 `eval_case_parallelism` 缩短单轮时长 + +### 6.2 阈值之间的相对关系 + +阈值之间应保持自洽,否则部分 stopper 永远不会触发: + +| 关系 | 含义 | +| --- | --- | +| `max_metric_calls > reflection_minibatch_size × max_iterations_without_improvement` | 否则 max_metric_calls 永远先抢闸,no_improvement 没有机会触发 | +| `timeout_seconds > 单轮典型耗时 × 2` | 否则 timeout 在第 1 轮就触发,看不到优化进展 | +| `max_candidate_proposals ≥ 1` | 至少要让 reflection LM 跑过一次 | +| `max_tracked_candidates ≥ 2` | 否则 Pareto 前沿无法保留多于 baseline 的候选 | + +### 6.3 `_normalize_response` 的复用 + +与 `blackbox_cli/` example 完全相同的规范化逻辑:用 `json.dumps(sort_keys=True, ensure_ascii=False, separators=(",", ":"))` 把 LLM 输出转换为唯一字符串形态,使 `final_response_avg_score(text.match=exact)` 可直接走精确匹配,**评测层不依赖 LLM judge**——这对运行时控制场景至关重要,避免 judge 调用引入额外不确定性与时间开销。 + +## 7 · 常见问题 + +**Q:`stop_reason` 字段值有哪些?** +A:常见取值包括 `score_threshold_reached` / `budget_exhausted` / `timeout_reached` / `no_improvement` / `max_proposals_reached` / `max_tracked_candidates_reached` / `user_requested_stop`(由 `optimize.stop` 文件触发)。具体取值由触发的 stopper 决定。 + +**Q:触发 timeout 后产物完整吗?** +A:完整。"完成当前轮再停"语义保证当前轮的 round_*.json、result.json、summary.txt 都已写入。中途中止仅丢弃尚未开始的下一轮。 + +**Q:能否调整 stop condition 的优先级?** +A:不能。多 stop condition 间是 OR 语义且同步检查,最先满足条件的 stopper 决定 `stop_reason`。需要"优先看 timeout,timeout 之内尽量跑高 score"这种语义时,应把次要 stopper 的阈值放宽到永远不会先触发。 + +**Q:单轮已经超过 timeout 了怎么办?** +A:仍会等当前轮跑完才停止。若该轮跑得太久(如 LLM 卡住),可在 `call_agent` 内部对 LLM 调用加超时(见 `blackbox_cli/agent/call_agent.py` 的 `CLI_TIMEOUT_SEC` 模式)。 + +**Q:业务里只关心 timeout,其他不限怎么配?** +A:仅设 `timeout_seconds=<秒数>`,其余 5 个字段不写即可(默认禁用)。但需注意至少配 1
个 stopper。 + +## 8 · 实验建议:让其他 stopper 抢闸 + +通过调整阈值组合可以观察不同 stopper 的真实行为。可作为业务调参参考: + +| 想看哪条 stopper 抢闸 | 阈值调整方向 | +| --- | --- | +| `score_threshold` | 把 baseline 写得"约束更紧"让 GEPA 容易达 1.0;或把 score_threshold 调到 0.7 | +| `max_metric_calls` | 把 timeout_seconds 调高(如 600)+ minibatch 调小让评估速度快 | +| `max_iterations_without_improvement` | timeout_seconds 调高 + 任务设计成"难以再提升"的边界场景 | +| `max_candidate_proposals` | 调到 2、timeout=300 | +| `max_tracked_candidates` | 调到 2、timeout=300、`frontier_type="hybrid"`(多候选并存) | + +业务真实接入步骤: + +1. 测量典型业务负载下单轮耗时与单轮 metric_calls 数 +2. 按 SLO 反推每个 stopper 的合理阈值(如 CI 5min → timeout=180s 留 120s 缓冲) +3. 跑一次基准实验观察 `stop_reason` 是否符合预期 +4. 根据实际行为微调阈值 + +> 业务真实接入时不要复制本 example 的 6 个值——本 example 的阈值是为"演示效果可见"而设,实际业务应根据 LLM 后端速度、数据集规模、SLO 窗口反推。 diff --git a/examples/optimization/slo_runtime_control/agent/__init__.py b/examples/optimization/slo_runtime_control/agent/__init__.py new file mode 100644 index 00000000..c3f64077 --- /dev/null +++ b/examples/optimization/slo_runtime_control/agent/__init__.py @@ -0,0 +1,10 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""SLO runtime control demo agent — 客户工单分类。""" + +from .agent import SYSTEM_PROMPT_PATH, create_agent + +__all__ = ["SYSTEM_PROMPT_PATH", "create_agent"] diff --git a/examples/optimization/slo_runtime_control/agent/agent.py b/examples/optimization/slo_runtime_control/agent/agent.py new file mode 100644 index 00000000..6a930965 --- /dev/null +++ b/examples/optimization/slo_runtime_control/agent/agent.py @@ -0,0 +1,47 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0.
+"""客户工单分类 agent —— SLO Runtime Control example 专用。 + +每次 create_agent() 重读 prompts/system.md,使优化器写入的新候选立即生效。 +单文件优化目标。 +""" + +from pathlib import Path + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import LLMModel, OpenAIModel +from trpc_agent_sdk.types import GenerateContentConfig + +from .config import get_model_config + + +SYSTEM_PROMPT_PATH = Path(__file__).parent / "prompts" / "system.md" + + +def _create_model() -> LLMModel: + """构建 OpenAI 兼容 chat 模型实例。""" + api_key, base_url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url) + + +def _read_instruction() -> str: + """从磁盘重读 system.md。""" + return SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip() + + +def create_agent() -> LlmAgent: + """构建一个使用当前磁盘 prompt 的新 LlmAgent 实例。""" + return LlmAgent( + name="ticket_classifier_agent", + description="A customer-service ticket classifier under multi-stop SLO control.", + model=_create_model(), + instruction=_read_instruction(), + generate_content_config=GenerateContentConfig( + temperature=0.2, + top_p=0.9, + max_output_tokens=512, + ), + ) diff --git a/examples/optimization/slo_runtime_control/agent/config.py b/examples/optimization/slo_runtime_control/agent/config.py new file mode 100644 index 00000000..d0a64b15 --- /dev/null +++ b/examples/optimization/slo_runtime_control/agent/config.py @@ -0,0 +1,33 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""模型凭据读取 —— 从环境变量加载 OpenAI 兼容 LLM 的连接信息。 + +需要的环境变量 +-------------- + TRPC_AGENT_API_KEY LLM 后端的 API key + TRPC_AGENT_BASE_URL LLM 后端的 endpoint + TRPC_AGENT_MODEL_NAME 模型名 + +缺任意一个就立即抛 ValueError,避免运行到一半才撞到 LLM 后端的 401 错误, +那时报错信息会很有迷惑性(看起来像 prompt 写错了,实际是凭据没配)。 +""" + +from __future__ import annotations + +import os + + +def get_model_config() -> tuple[str, str, str]: + """返回 (api_key, base_url, model_name);任一缺失立刻报错。""" + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + if not api_key or not base_url or not model_name: + raise ValueError( + "运行优化器前必须配置环境变量 TRPC_AGENT_API_KEY / " + "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME。" + ) + return api_key, base_url, model_name diff --git a/examples/optimization/slo_runtime_control/agent/prompts/system.md b/examples/optimization/slo_runtime_control/agent/prompts/system.md new file mode 100644 index 00000000..602d9eb7 --- /dev/null +++ b/examples/optimization/slo_runtime_control/agent/prompts/system.md @@ -0,0 +1 @@ +你帮用户分类工单。 diff --git a/examples/optimization/slo_runtime_control/optimizer.json b/examples/optimization/slo_runtime_control/optimizer.json new file mode 100644 index 00000000..339d95d7 --- /dev/null +++ b/examples/optimization/slo_runtime_control/optimizer.json @@ -0,0 +1,48 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "exact", + "case_insensitive": false + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 1, + "stop": { + "required_metrics": [] + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 4096, "temperature": 0.6 } + }, + "candidate_selection_strategy": 
"pareto", + "module_selector": "round_robin", + "frontier_type": "instance", + "reflection_minibatch_size": 3, + "reflection_history_top_k": 2, + "skip_perfect_score": false, + "use_merge": false, + "max_metric_calls": 30, + "max_iterations_without_improvement": 3, + "timeout_seconds": 90.0, + "score_threshold": 1.0, + "max_candidate_proposals": 12, + "max_tracked_candidates": 5 + } + } +} diff --git a/examples/optimization/slo_runtime_control/run_optimization.py b/examples/optimization/slo_runtime_control/run_optimization.py new file mode 100644 index 00000000..0ef65d00 --- /dev/null +++ b/examples/optimization/slo_runtime_control/run_optimization.py @@ -0,0 +1,143 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""SLO Runtime Control example 的优化器入口。 + +适用场景 +-------- +在 CI 流水线 / 夜间窗口等具有硬性时间和资源约束的环境下运行 prompt 优化, +需要"任何一个 SLO 触发都立刻停"的多重停止策略。本脚本演示同时启用 SDK +提供的 6 种 algorithm-level stop conditions,OR 语义抢闸。 + +这个文件做什么 +-------------- +1. 注册单字段 TargetPrompt(agent/prompts/system.md) +2. 定义 call_agent:用 _normalize_response 把 LLM 输出规范化为稳定 JSON + 字符串,使 final_response_avg_score 走 text exact 而非依赖 LLM judge +3. 
调 AgentOptimizer.optimize;6 种 stop condition 阈值在 optimizer.json 中 + +怎么跑 +------ +1) 配 TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME +2) python examples/optimization/slo_runtime_control/run_optimization.py +3) 看 runs/<时间戳>/result.json 中的 stop_reason 字段,识别哪条 SLO 抢闸 + +接入自有业务时改哪里 +-------------------- +- optimizer.json 中 6 个 stop condition 阈值按业务 SLO 反推 + (详见 README §5 与 §8) +- agent/agent.py 改为业务 agent +- _normalize_response 按业务输出格式调整(业务非 JSON 输出可整体替换) +""" + +from __future__ import annotations + +import asyncio +import json +import re +import sys +import uuid +from datetime import datetime +from pathlib import Path + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt # noqa: E402 +from trpc_agent_sdk.runners import Runner # noqa: E402 +from trpc_agent_sdk.sessions import InMemorySessionService # noqa: E402 +from trpc_agent_sdk.types import Content, Part # noqa: E402 + +from agent.agent import SYSTEM_PROMPT_PATH, create_agent # noqa: E402 + + +CONFIG_PATH = _HERE / "optimizer.json" +TRAIN_PATH = _HERE / "train.evalset.json" +VAL_PATH = _HERE / "val.evalset.json" +RUNS_DIR = _HERE / "runs" +APP_NAME = "slo_runtime_control_agent" + + +_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL) + + +def _normalize_response(raw: str) -> str: + """把 LLM 自由文本规范化成与 reference 完全一致的字符串形态。 + + 与 blackbox_cli / ci_integration 完全相同的规范化逻辑:让 + final_response_avg_score(text.match=exact) 直接走精确匹配, + 避免 LLM judge 引入额外不确定性与时间开销——这对运行时 SLO + 控制场景至关重要。 + """ + text = (raw or "").strip() + if not text: + return "" + match = _JSON_OBJECT_RE.search(text) + if not match: + return text + try: + parsed = json.loads(match.group(0)) + except json.JSONDecodeError: + return text + return json.dumps(parsed, sort_keys=True, ensure_ascii=False, separators=(",", 
":")) + + +async def call_agent(query: str) -> str: + """框架回调:跑一次推理,输出经 _normalize_response 规范化。 + + 每个 case 一份独立的 Runner + InMemorySessionService,保证并发评测时 + session state 不互相污染。 + """ + root_agent = create_agent() + session_service = InMemorySessionService() + runner = Runner(app_name=APP_NAME, agent=root_agent, session_service=session_service) + session_id = str(uuid.uuid4()) + user_id = "optimizer" + await session_service.create_session( + app_name=APP_NAME, user_id=user_id, session_id=session_id, state={} + ) + user_content = Content(role="user", parts=[Part.from_text(text=query)]) + + final_text = "" + async for event in runner.run_async( + user_id=user_id, session_id=session_id, new_message=user_content + ): + if not event.is_final_response(): + continue + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + if part.thought: + continue + if part.text: + final_text += part.text + return _normalize_response(final_text) + + +async def main() -> None: + """组装 TargetPrompt + 调 AgentOptimizer.optimize。""" + target = TargetPrompt().add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / timestamp + + await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + update_source=False, + verbose=1, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/optimization/slo_runtime_control/train.evalset.json b/examples/optimization/slo_runtime_control/train.evalset.json new file mode 100644 index 00000000..ee853949 --- /dev/null +++ b/examples/optimization/slo_runtime_control/train.evalset.json @@ -0,0 +1,239 @@ +{ + "eval_set_id": "slo_runtime_train", + "name": "SLO runtime control demo - train", + "description": "8 客服工单。category in 
{account,billing,technical,feedback}; priority in {low,normal,high}. final_response 已规范化为紧凑 JSON 与 _normalize_response 输出格式一致,用 final_response_avg_score(text.match=exact) 做硬比对。混入 t5/t6/t8 三道边界混淆题让模型不易一次到 1.0,迫使其他 stopper(timeout/iterations/proposals)有机会先抢闸。", + "eval_cases": [ + { + "eval_id": "ticket_t1", + "conversation": [ + { + "invocation_id": "t1", + "user_content": { + "parts": [ + { + "text": "我忘记了登录密码,怎么找回?" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"account\",\"priority\":\"normal\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "ticket_t2", + "conversation": [ + { + "invocation_id": "t2", + "user_content": { + "parts": [ + { + "text": "上个月被多扣了 99 元话费,请退款。" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"billing\",\"priority\":\"high\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "ticket_t3", + "conversation": [ + { + "invocation_id": "t3", + "user_content": { + "parts": [ + { + "text": "App 启动后立刻闪退,已重装两次都不行。" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"technical\",\"priority\":\"high\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "ticket_t4", + "conversation": [ + { + "invocation_id": "t4", + "user_content": { + "parts": [ + { + "text": "希望增加深色模式,眼睛看亮屏太累。" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"feedback\",\"priority\":\"low\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + 
{ + "eval_id": "ticket_t5", + "conversation": [ + { + "invocation_id": "t5", + "user_content": { + "parts": [ + { + "text": "我账号被盗了,登录提示设备异常请求验证码也收不到。" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"account\",\"priority\":\"high\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "ticket_t6", + "conversation": [ + { + "invocation_id": "t6", + "user_content": { + "parts": [ + { + "text": "续费按钮点了没反应,但银行短信显示已扣款。" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"billing\",\"priority\":\"high\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "ticket_t7", + "conversation": [ + { + "invocation_id": "t7", + "user_content": { + "parts": [ + { + "text": "你们的客服态度很好,给个赞。" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"feedback\",\"priority\":\"low\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "ticket_t8", + "conversation": [ + { + "invocation_id": "t8", + "user_content": { + "parts": [ + { + "text": "扣费失败但客户说想取消账户,下个月还要继续吗?" 
+ } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"account\",\"priority\":\"normal\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + } + ] +} \ No newline at end of file diff --git a/examples/optimization/slo_runtime_control/val.evalset.json b/examples/optimization/slo_runtime_control/val.evalset.json new file mode 100644 index 00000000..408b3af9 --- /dev/null +++ b/examples/optimization/slo_runtime_control/val.evalset.json @@ -0,0 +1,123 @@ +{ + "eval_set_id": "slo_runtime_val", + "name": "SLO runtime control demo - val", + "description": "4 道留出验证 case,含 1 道边界混淆(v1 billing vs technical)。", + "eval_cases": [ + { + "eval_id": "ticket_v1", + "conversation": [ + { + "invocation_id": "v1", + "user_content": { + "parts": [ + { + "text": "信用卡支付一直转圈最后失败,账单页面也打不开。" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"technical\",\"priority\":\"high\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "ticket_v2", + "conversation": [ + { + "invocation_id": "v2", + "user_content": { + "parts": [ + { + "text": "希望支持微信和支付宝同时绑定。" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"feedback\",\"priority\":\"low\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "ticket_v3", + "conversation": [ + { + "invocation_id": "v3", + "user_content": { + "parts": [ + { + "text": "如何修改我的注册手机号?" 
+ } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"account\",\"priority\":\"normal\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "ticket_v4", + "conversation": [ + { + "invocation_id": "v4", + "user_content": { + "parts": [ + { + "text": "12 月份发票怎么开?" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"billing\",\"priority\":\"low\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "validator", + "state": {} + } + } + ] +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c4e0e85a..6c47d4d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,6 +94,11 @@ eval = [ "tabulate", ] +optimize = [ + "gepa>=0.0.7", + "rich>=13.0.0", +] + mem0 = [ "mem0ai>=1.0.3", "sentence-transformers", @@ -151,6 +156,8 @@ all = [ "wecom-aibot-sdk-python>=0.1.5", "a2a-sdk<1.0.0,>=0.3.22", "e2b-code-interpreter>=2.0.0", + "gepa>=0.0.7", + "rich>=13.0.0", ] [project.scripts] diff --git a/requirements-test.txt b/requirements-test.txt index 123052a5..c54dad06 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -10,6 +10,8 @@ unittest-xml-reporting rouge-score pandas tabulate +gepa>=0.0.7 +rich>=13.0.0 # Test DB greenlet diff --git a/requirements.txt b/requirements.txt index 4360f19f..5e7264de 100644 --- a/requirements.txt +++ b/requirements.txt @@ -50,3 +50,5 @@ charset-normalizer>=3.0.0 litellm>=1.75.5 mempalace==3.3.4 json-repair>=0.40.0 +gepa>=0.0.7 +rich>=13.0.0 diff --git a/tests/evaluation/test_agent_optimizer.py b/tests/evaluation/test_agent_optimizer.py new file mode 100644 index 00000000..d7b061f7 --- /dev/null +++ b/tests/evaluation/test_agent_optimizer.py @@ -0,0 +1,1285 @@ +# Tencent is pleased to support the open source community by making 
tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for AgentOptimizer facade dispatch.""" + +from __future__ import annotations + +from typing import Optional + +import pytest + +from trpc_agent_sdk.evaluation._agent_optimizer import AgentOptimizer +from trpc_agent_sdk.evaluation._eval_case import EvalCase +from trpc_agent_sdk.evaluation._eval_case import Invocation +from trpc_agent_sdk.evaluation._eval_set import EvalSet +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import GepaReflectiveOptimizer +from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + + +def _invocation(user_text: str, response_text: Optional[str] = None) -> Invocation: + final_response = ( + Content(role="model", parts=[Part.from_text(text=response_text)]) + if response_text is not None + else None + ) + return Invocation( + user_content=Content(role="user", parts=[Part.from_text(text=user_text)]), + final_response=final_response, + ) + + +def _eval_case(eval_id: str = "c1") -> EvalCase: + return EvalCase(eval_id=eval_id, conversation=[_invocation("hi", "ack")]) + + +async def _stub_call_agent(query: str) -> str: + return "stub" + + +def _new_target_prompt(recorder: Optional[dict[str, str]] = None) -> TargetPrompt: + target = TargetPrompt() + state = recorder if recorder is not None else {} + + async def read_cb() -> str: + return state.get("instruction", "initial") + + async def write_cb(value: str) -> None: + state["instruction"] = value + + target.add_callback("instruction", read=read_cb, write=write_cb) + return target + + +class _FakeGEPAResult: + def __init__(self, candidates, val_scores): + self.candidates = candidates + self.val_aggregate_scores = val_scores + self.parents = [[None]] + [[i - 1] for i in range(1, len(candidates))] + self.discovery_eval_counts = [0] * len(candidates) + 
self.total_metric_calls = 0 + self.best_outputs_valset = None + + @property + def best_idx(self) -> int: + return max( + range(len(self.val_aggregate_scores)), + key=lambda i: self.val_aggregate_scores[i], + ) + + +def _write_config_file( + tmp_path, + algo_name: str = "gepa_reflective", + *, + extra_algo: Optional[dict] = None, +) -> str: + """Write a valid optimizer.json file to tmp_path and return its path. + + ``extra_algo`` is merged into the algorithm block to override or add + optional fields (e.g. ``use_merge``). + """ + import json + algo_block = { + "name": algo_name, + "reflection_lm": { + "provider_name": "openai", + "model_name": "gpt-4o", + "api_key": "test-key", + }, + "max_metric_calls": 30, + } + if extra_algo: + algo_block.update(extra_algo) + payload = { + "evaluate": { + "metrics": [{"metric_name": "m1", "threshold": 0.7}], + "num_runs": 1, + }, + "optimize": { + "algorithm": algo_block, + }, + } + config_path = tmp_path / "optimizer.json" + config_path.write_text(json.dumps(payload), encoding="utf-8") + return str(config_path) + + +@pytest.mark.asyncio +async def test_facade_reads_config_file_and_dispatches(tmp_path, monkeypatch): + """End-to-end: AgentOptimizer.optimize(config_path=...) 
reads the file + dispatches.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + recorder: dict[str, str] = {} + target = _new_target_prompt(recorder) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "test1"), + update_source=True, + verbose=0, + ) + + assert result.status == "SUCCEEDED" + assert result.best_pass_rate == pytest.approx(0.9) + assert result.best_prompts == {"instruction": "improved"} + assert recorder["instruction"] == "improved" + + +@pytest.mark.asyncio +async def test_facade_unknown_algorithm_raises_valueerror(tmp_path): + """If config.optimize.algorithm.name is not registered, raise ValueError listing options.""" + import json + payload = { + "evaluate": {"metrics": [{"metric_name": "m1", "threshold": 0.7}], "num_runs": 1}, + "optimize": { + "algorithm": { + "name": "no_such_algorithm", + "reflection_lm": { + "provider_name": "openai", + "model_name": "gpt-4o", + "api_key": "test-key", + }, + "max_metric_calls": 30, + }, + }, + } + config_path = tmp_path / "optimizer.json" + config_path.write_text(json.dumps(payload), encoding="utf-8") + + with pytest.raises(ValueError) as exc_info: + await AgentOptimizer.optimize( + 
config_path=str(config_path), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "test_unknown"), + verbose=0, + ) + + msg = str(exc_info.value) + assert "no_such_algorithm" in msg + + +@pytest.mark.asyncio +async def test_facade_unknown_algorithm_lists_available_algorithms(tmp_path): + """API-A1: error message must enumerate registered algorithms so the user + can see what they should have written instead. Previously pydantic's + literal_error fired first and produced 'Input should be ...' without + listing alternatives.""" + import json + payload = { + "evaluate": {"metrics": [{"metric_name": "m1", "threshold": 0.7}], "num_runs": 1}, + "optimize": { + "algorithm": { + "name": "gepa_reflactive", # typo of gepa_reflective + "reflection_lm": { + "provider_name": "openai", + "model_name": "gpt-4o", + "api_key": "test-key", + }, + "max_metric_calls": 30, + }, + }, + } + config_path = tmp_path / "optimizer.json" + config_path.write_text(json.dumps(payload), encoding="utf-8") + + with pytest.raises(ValueError) as exc_info: + await AgentOptimizer.optimize( + config_path=str(config_path), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "typo_check"), + verbose=0, + ) + + msg = str(exc_info.value) + # Friendly enumeration: must include both the typo and at least one + # registered algorithm so users see what to type. 
+ assert "gepa_reflactive" in msg + assert "Available algorithms" in msg + assert "gepa_reflective" in msg + + +@pytest.mark.asyncio +async def test_facade_missing_config_file_raises(tmp_path): + """If config_path does not exist, propagate FileNotFoundError.""" + with pytest.raises(FileNotFoundError): + await AgentOptimizer.optimize( + config_path=str(tmp_path / "nonexistent.json"), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "missing"), + verbose=0, + ) + + +def test_facade_is_exported_from_evaluation_package(): + import trpc_agent_sdk.evaluation as ev + assert ev.AgentOptimizer is AgentOptimizer + + +@pytest.mark.asyncio +async def test_facade_persists_artifacts_under_output_dir(tmp_path, monkeypatch): + """The facade must materialise result.json, summary.txt, rounds/*.json, + baseline_prompts/, best_prompts/, config.snapshot.json and run.log under + output_dir for every successful run.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + output_dir = tmp_path / "runs" / "artifact_check" + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + 
validation_dataset_path=str(val_path), + output_dir=str(output_dir), + verbose=0, + ) + + assert (output_dir / "result.json").is_file() + assert (output_dir / "summary.txt").is_file() + assert (output_dir / "config.snapshot.json").is_file() + assert (output_dir / "run.log").is_file() + assert (output_dir / "baseline_prompts" / "instruction.md").is_file() + assert (output_dir / "best_prompts" / "instruction.md").is_file() + best_text = (output_dir / "best_prompts" / "instruction.md").read_text(encoding="utf-8") + assert best_text == "improved" + log_line = (output_dir / "run.log").read_text(encoding="utf-8") + assert "SUCCEEDED" in log_line + + +@pytest.mark.asyncio +async def test_facade_persists_artifacts_when_algorithm_fails(tmp_path, monkeypatch): + """Even when the algorithm returns a FAILED result the facade should + still leave baseline_prompts, config snapshot and run.log on disk so + debug context is preserved across runs.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + + async def boom(self, **kwargs): + raise RuntimeError("evaluator timeout") + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", boom) + + output_dir = tmp_path / "runs" / "failure_check" + result = await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(output_dir), + verbose=0, + ) + assert result.status == "FAILED" + assert "evaluator timeout" in result.error_message + assert (output_dir / "result.json").is_file() + assert (output_dir / 
"baseline_prompts" / "instruction.md").is_file() + assert (output_dir / "config.snapshot.json").is_file() + assert (output_dir / "run.log").is_file() + + +@pytest.mark.asyncio +async def test_facade_verbose_zero_emits_no_terminal_output( + tmp_path, monkeypatch, capsys +): + """verbose=0 must suppress every reporter event so the user can run the + optimizer inside batch pipelines without polluting downstream stdout.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "silent"), + verbose=0, + ) + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" + + +# ----- A3-A6: optimizer startup-time input validation (spec §3.2) ----- + + +@pytest.mark.asyncio +async def test_facade_rejects_tool_trajectory_avg_score_metric(tmp_path): + """spec §3.2 / acceptance #12: tool_trajectory_avg_score requires session traces + so it is unusable in call_agent mode; reject at startup.""" + import json + payload = { + "evaluate": { + "metrics": [{"metric_name": "tool_trajectory_avg_score", "threshold": 0.8}], + "num_runs": 1, + }, + "optimize": { + "algorithm": { + 
"name": "gepa_reflective", + "reflection_lm": { + "provider_name": "openai", + "model_name": "gpt-4o", + "api_key": "test-key", + }, + "max_metric_calls": 10, + }, + }, + } + config_path = tmp_path / "optimizer.json" + config_path.write_text(json.dumps(payload), encoding="utf-8") + + with pytest.raises(ValueError) as exc_info: + await AgentOptimizer.optimize( + config_path=str(config_path), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "metric_check"), + verbose=0, + ) + assert "tool_trajectory_avg_score" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_facade_rejects_llm_rubric_knowledge_recall_metric(tmp_path): + """F-4: ``llm_rubric_knowledge_recall`` reads tool responses from + ``Invocation.intermediate_data``; ``RemoteEvalService`` always emits + ``intermediate_data=None`` so the judge would silently see "No + knowledge search results were found." for every case. Reject at + startup so users do not waste an optimization run on a metric that + can never produce a non-zero score in call_agent mode. 
+ """ + import json + payload = { + "evaluate": { + "metrics": [{"metric_name": "llm_rubric_knowledge_recall", "threshold": 0.8}], + "num_runs": 1, + }, + "optimize": { + "algorithm": { + "name": "gepa_reflective", + "reflection_lm": { + "provider_name": "openai", + "model_name": "gpt-4o", + "api_key": "test-key", + }, + "max_metric_calls": 10, + }, + }, + } + config_path = tmp_path / "optimizer.json" + config_path.write_text(json.dumps(payload), encoding="utf-8") + + with pytest.raises(ValueError) as exc_info: + await AgentOptimizer.optimize( + config_path=str(config_path), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "metric_check_recall"), + verbose=0, + ) + assert "llm_rubric_knowledge_recall" in str(exc_info.value) + # Error message should hint at compatible alternatives so users can + # immediately switch instead of guessing. + assert "final_response_avg_score" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_facade_rejects_empty_target_prompt(tmp_path): + """spec §3.2: TargetPrompt with no registered fields is a usage error.""" + config_path = _write_config_file(tmp_path) + empty_target = TargetPrompt() + with pytest.raises(ValueError) as exc_info: + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=empty_target, + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "empty_target"), + verbose=0, + ) + assert "TargetPrompt" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_facade_rejects_non_async_call_agent(tmp_path): + """spec §3.2: call_agent must be async; reject sync functions at startup.""" + config_path = _write_config_file(tmp_path) + + def sync_call_agent(query: str) -> str: + return "stub" + + with pytest.raises(TypeError) as exc_info: + await AgentOptimizer.optimize( + 
config_path=config_path, + call_agent=sync_call_agent, # type: ignore[arg-type] + target_prompt=_new_target_prompt(), + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "sync_check"), + verbose=0, + ) + assert "async" in str(exc_info.value).lower() + + +@pytest.mark.asyncio +async def test_facade_rejects_same_train_and_validation_paths(tmp_path): + """spec §3.2: train and validation paths must be different to avoid train-test leakage.""" + config_path = _write_config_file(tmp_path) + same_path = tmp_path / "shared.evalset.json" + same_path.write_text("{}", encoding="utf-8") + + with pytest.raises(ValueError) as exc_info: + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(same_path), + validation_dataset_path=str(same_path), + output_dir=str(tmp_path / "runs" / "leakage_check"), + verbose=0, + ) + assert "train" in str(exc_info.value).lower() or "leak" in str(exc_info.value).lower() + + +@pytest.mark.asyncio +async def test_facade_warns_when_use_merge_with_single_field(tmp_path): + """GEPA-3: gepa merge degenerates to picking one of two parents when only + a single component is registered. 
Surface a UserWarning so users don't + silently see merge_rounds_total=0.""" + from trpc_agent_sdk.evaluation._optimize_config import load_optimize_config + + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text("{}", encoding="utf-8") + val_path.write_text("{}", encoding="utf-8") + + config_path = _write_config_file(tmp_path, extra_algo={"use_merge": True}) + config = load_optimize_config(config_path) + + with pytest.warns(UserWarning, match="use_merge=true"): + AgentOptimizer._validate_inputs( + config=config, + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), # single callback field + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "merge_warn"), + ) + + +@pytest.mark.asyncio +async def test_facade_no_warn_when_use_merge_with_two_fields(tmp_path): + """Multi-field config + use_merge=True: warning must NOT fire.""" + import warnings as _warnings + from trpc_agent_sdk.evaluation._optimize_config import load_optimize_config + + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text("{}", encoding="utf-8") + val_path.write_text("{}", encoding="utf-8") + + config_path = _write_config_file(tmp_path, extra_algo={"use_merge": True}) + config = load_optimize_config(config_path) + + target = TargetPrompt() + state_a: dict[str, str] = {} + state_b: dict[str, str] = {} + + async def read_a() -> str: + return state_a.get("v", "") + + async def write_a(v: str) -> None: + state_a["v"] = v + + async def read_b() -> str: + return state_b.get("v", "") + + async def write_b(v: str) -> None: + state_b["v"] = v + + target.add_callback("a", read=read_a, write=write_a) + target.add_callback("b", read=read_b, write=write_b) + + with _warnings.catch_warnings(): + _warnings.simplefilter("error", UserWarning) # any UserWarning fails the test + AgentOptimizer._validate_inputs( + config=config, + call_agent=_stub_call_agent, 
+ target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "merge_two_fields"), + ) + + +@pytest.mark.asyncio +async def test_facade_no_warn_when_use_merge_false_single_field(tmp_path): + """use_merge=false (default) + single field: warning must NOT fire.""" + import warnings as _warnings + from trpc_agent_sdk.evaluation._optimize_config import load_optimize_config + + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text("{}", encoding="utf-8") + val_path.write_text("{}", encoding="utf-8") + + config_path = _write_config_file(tmp_path) # default use_merge=False + config = load_optimize_config(config_path) + + with _warnings.catch_warnings(): + _warnings.simplefilter("error", UserWarning) + AgentOptimizer._validate_inputs( + config=config, + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "no_merge"), + ) + + +@pytest.mark.asyncio +async def test_facade_restores_baseline_when_writeback_fails(tmp_path, monkeypatch): + """If update_source=True but writing the best candidate back fails, sources + must end up at the original baseline (not mid-run candidate) and the + write-back exception must surface to the caller.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + recorder: dict[str, str] = {"instruction": "BASELINE"} + target = _new_target_prompt(recorder) + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "BASELINE"}, {"instruction": "MID_CANDIDATE"}], + 
val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + # Simulate gepa rewriting the source during a round. + recorder["instruction"] = "MID_CANDIDATE" + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + # Patch write_all to fail only when the best is about to be persisted. + original_write_all = target.write_all + call_count = {"n": 0} + + async def explosive_write_all(prompts): + call_count["n"] += 1 + if prompts.get("instruction") == "IMPROVED_BEST": + raise RuntimeError("disk full") + await original_write_all(prompts) + + # Make optimizer.run() set best_prompts to a distinct value the test can + # detect; rebuild fake gepa result. + fake_gepa_result.candidates = [{"instruction": "BASELINE"}, {"instruction": "IMPROVED_BEST"}] + target.write_all = explosive_write_all # type: ignore[assignment] + + with pytest.raises(RuntimeError, match="disk full"): + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "writeback_fail"), + update_source=True, + verbose=0, + ) + + assert recorder["instruction"] == "BASELINE", ( + "after a failed write-back the source must be restored to baseline, " + f"got {recorder['instruction']!r}" + ) + + +@pytest.mark.asyncio +async def test_facade_default_update_source_false_keeps_source_intact(tmp_path, monkeypatch): + """A2: default ``update_source=False`` MUST leave TargetPrompt source untouched.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + 
recorder: dict[str, str] = {"instruction": "INITIAL"} + target = _new_target_prompt(recorder) + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "INITIAL"}, {"instruction": "IMPROVED"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "default_keep"), + verbose=0, + ) + + assert result.best_prompts == {"instruction": "IMPROVED"} + assert recorder["instruction"] == "INITIAL", ( + "default update_source=False MUST NOT write the best candidate back to source" + ) + + +@pytest.mark.asyncio +async def test_facade_update_source_true_writes_best_back(tmp_path, monkeypatch): + """A2: explicit ``update_source=True`` writes the best candidate back to TargetPrompt.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + recorder: dict[str, str] = {"instruction": "INITIAL"} + target = _new_target_prompt(recorder) + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "INITIAL"}, {"instruction": "IMPROVED"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + 
train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "update_true"), + update_source=True, + verbose=0, + ) + + assert result.best_prompts == {"instruction": "IMPROVED"} + assert recorder["instruction"] == "IMPROVED" + + +@pytest.mark.asyncio +async def test_facade_accepts_train_and_validation_paths_differing_only_by_dot_slash(tmp_path): + """Resolve symlinks/relative prefixes so './x' and 'x' are detected as same file.""" + config_path = _write_config_file(tmp_path) + same_path = tmp_path / "shared.evalset.json" + same_path.write_text("{}", encoding="utf-8") + train_str = f"{same_path.parent}/./{same_path.name}" + + with pytest.raises(ValueError): + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=train_str, + validation_dataset_path=str(same_path), + output_dir=str(tmp_path / "runs" / "dotslash_check"), + verbose=0, + ) + + +@pytest.mark.asyncio +async def test_facade_forwards_extra_stop_and_gepa_callbacks(tmp_path, monkeypatch): + """AgentOptimizer.optimize must forward extra_stop/gepa_callbacks to the algorithm.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + + def sentinel_stopper(gepa_state=None): + return False + + sentinel_callback = object() + captured: dict = {} + + async def _capture_run(self, *, reporter=None): + from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult + + captured["extra_stop"] = list(self.extra_stop_callbacks) + captured["extra_gepa"] = list(self.extra_gepa_callbacks) + return OptimizeResult( + 
algorithm="gepa_reflective", + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.0, + best_pass_rate=0.0, + pass_rate_improvement=0.0, + baseline_prompts={"instruction": "initial"}, + best_prompts={"instruction": "initial"}, + total_rounds=0, + rounds=[], + total_reflection_lm_calls=0, + total_judge_model_calls=0, + total_llm_cost=0.0, + duration_seconds=0.0, + started_at="2026-05-18T00:00:00+00:00", + finished_at="2026-05-18T00:00:00+00:00", + extras={}, + ) + + monkeypatch.setattr(GepaReflectiveOptimizer, "run", _capture_run) + + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "extras"), + extra_stop_callbacks=[sentinel_stopper], + extra_gepa_callbacks=[sentinel_callback], + verbose=0, + ) + + assert sentinel_stopper in captured["extra_stop"] + assert sentinel_callback in captured["extra_gepa"] + + +@pytest.mark.asyncio +async def test_facade_summary_txt_reflects_update_source_true(tmp_path, monkeypatch): + """DOC-1: summary.txt must reflect the actual update_source value used. 
+ Previously _persist_artifacts hard-coded update_source=False so the file + contradicted the terminal banner whenever the user passed update_source=True.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + # Output dir intentionally lacks the substring "true" so the assertion + # below cannot accidentally match the path itself. + output_dir = tmp_path / "runs" / "us_check_a" + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(output_dir), + update_source=True, + verbose=0, + ) + + summary_text = (output_dir / "summary.txt").read_text(encoding="utf-8") + # format_summary writes the exact line "update_source : true" / "false". 
+ assert "update_source : true" in summary_text, ( + f"summary.txt should reflect update_source=True; got:\n{summary_text}" + ) + assert "update_source : false" not in summary_text + + +@pytest.mark.asyncio +async def test_facade_summary_txt_reflects_update_source_false(tmp_path, monkeypatch): + """Complement: when update_source=False (default), summary still reflects that.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + output_dir = tmp_path / "runs" / "us_check_b" + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(output_dir), + update_source=False, + verbose=0, + ) + + summary_text = (output_dir / "summary.txt").read_text(encoding="utf-8") + assert "update_source : false" in summary_text + assert "update_source : true" not in summary_text + + +# --- FAIL-2: cleanup_done sentinel prevents double baseline write_all --- + +@pytest.mark.asyncio +async def test_facade_failed_writeback_invokes_baseline_callback_exactly_once( + tmp_path, monkeypatch +): + """FAIL-2: when write_all(best) raises, ``cleanup_done`` must guarantee the + ``except`` rollback restore_baseline call is NOT followed by a second + restore in ``finally``. 
+ + Pre-fix code flipped ``writeback_succeeded`` only on the happy path, so + the failure path executed write_all(baseline) twice: once in ``except``, + once in ``finally``. Path-backed fields are idempotent (tmp + replace + is harmless), but callback-backed fields with non-idempotent + ``write_fn`` (audit logs, version counters) saw their hook fire twice + per failed update_source=True run. + """ + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + + # Spy on every write_fn call so we can count exactly how many times + # baseline is persisted after the best-write fails. + write_log: list[str] = [] + state: dict[str, str] = {"instruction": "BASELINE"} + + async def read_cb() -> str: + return state["instruction"] + + async def write_cb(value: str) -> None: + write_log.append(value) + if value == "IMPROVED_BEST": + raise RuntimeError("disk full while writing best candidate") + state["instruction"] = value + + target = TargetPrompt().add_callback( + "instruction", read=read_cb, write=write_cb + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "BASELINE"}, {"instruction": "IMPROVED_BEST"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + with pytest.raises(RuntimeError, match="disk full"): + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "fail2_double_baseline"), + update_source=True, + verbose=0, 
+ ) + + # Expected sequence: best attempt (fails) -> baseline restore (success). + # Pre-fix would have appended a second "BASELINE" from the finally block. + assert write_log == ["IMPROVED_BEST", "BASELINE"], ( + "baseline write_fn must be invoked exactly once after a failed " + f"update_source=True writeback; got {write_log!r}" + ) + assert state["instruction"] == "BASELINE" + + +@pytest.mark.asyncio +async def test_facade_success_path_does_not_re_restore_baseline( + tmp_path, monkeypatch +): + """FAIL-2 happy-path counterpart: when write_all(best) succeeds, the + ``finally`` block must NOT re-write baseline either. + + Pre-fix code was also wrong here in a milder way: if ``writeback_succeeded`` + was False at finally entry the restore fired. The flag flipped on + success so the bug did not manifest on the happy path, but this test + pins the invariant explicitly so a future refactor cannot reintroduce + a double-write.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + + write_log: list[str] = [] + state: dict[str, str] = {"instruction": "BASELINE"} + + async def read_cb() -> str: + return state["instruction"] + + async def write_cb(value: str) -> None: + write_log.append(value) + state["instruction"] = value + + target = TargetPrompt().add_callback( + "instruction", read=read_cb, write=write_cb + ) + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "BASELINE"}, {"instruction": "IMPROVED"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + await AgentOptimizer.optimize( + 
config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "fail2_happy"), + update_source=True, + verbose=0, + ) + + # Only one call: the successful best writeback. No baseline restore. + assert write_log == ["IMPROVED"], ( + "happy-path update_source=True must invoke write_fn exactly once " + f"(best); got {write_log!r}" + ) + assert state["instruction"] == "IMPROVED" + + +# --- FAIL-1: atomic artifact persistence + SIGINT mask ------------------- + +def test_atomic_write_text_no_partial_file_on_failure(tmp_path): + """FAIL-1: ``_atomic_write_text`` must never leave a half-written file. + + If the write step crashes (simulated by a write_text mock that raises), + the destination path either does not exist (first run) or holds its + pre-call content untouched — never a partial write.""" + from trpc_agent_sdk.evaluation._agent_optimizer import _atomic_write_text + + target = tmp_path / "result.json" + target.write_text("ORIGINAL", encoding="utf-8") + + # Simulate failure between tmp write and os.replace by writing to a + # path whose parent does not exist. + bad_path = tmp_path / "no_such_dir" / "result.json" + with pytest.raises(FileNotFoundError): + _atomic_write_text(str(bad_path), "PARTIAL_CONTENT") + + # The original target is untouched. + assert target.read_text(encoding="utf-8") == "ORIGINAL" + # No .tmp leaked at the bad path's parent (parent missing, nothing to clean). 
+ assert not bad_path.exists() + + +def test_atomic_write_text_replaces_existing_file(tmp_path): + """FAIL-1: atomic write must fully replace any pre-existing content.""" + from trpc_agent_sdk.evaluation._agent_optimizer import _atomic_write_text + + target = tmp_path / "out.txt" + target.write_text("OLD", encoding="utf-8") + _atomic_write_text(str(target), "NEW") + assert target.read_text(encoding="utf-8") == "NEW" + assert not (tmp_path / "out.txt.tmp").exists() + + +def test_mask_sigint_restores_previous_handler(): + """FAIL-1: ``_mask_sigint`` must restore the original SIGINT handler on exit, + even if the wrapped block raises.""" + import signal as _signal + + from trpc_agent_sdk.evaluation._agent_optimizer import _mask_sigint + + original = _signal.getsignal(_signal.SIGINT) + try: + sentinel_called = [] + + def _sentinel(signum, frame): # pragma: no cover + sentinel_called.append(signum) + + _signal.signal(_signal.SIGINT, _sentinel) + try: + with _mask_sigint(): + # While masked, the handler is SIG_IGN, not _sentinel. + assert _signal.getsignal(_signal.SIGINT) == _signal.SIG_IGN + # On exit, _sentinel is restored. + assert _signal.getsignal(_signal.SIGINT) is _sentinel + + # Raising inside the block still restores. + with pytest.raises(RuntimeError): + with _mask_sigint(): + assert _signal.getsignal(_signal.SIGINT) == _signal.SIG_IGN + raise RuntimeError("boom") + assert _signal.getsignal(_signal.SIGINT) is _sentinel + finally: + _signal.signal(_signal.SIGINT, original) + finally: + # Belt-and-suspenders restore so a test crash cannot leave the + # interpreter in a weird state for sibling tests. + _signal.signal(_signal.SIGINT, original) + + +def test_mask_sigint_no_op_off_main_thread(): + """FAIL-1: ``_mask_sigint`` must degrade to a no-op when invoked from a + non-main thread (``signal.signal`` raises ValueError there). 
+ + The artifact persistence path runs in whatever event-loop thread the + caller picked; we still want it to complete cleanly even if SIGINT + masking isn't available.""" + import threading + + from trpc_agent_sdk.evaluation._agent_optimizer import _mask_sigint + + errors: list[BaseException] = [] + + def _runner() -> None: + try: + with _mask_sigint(): + pass + except BaseException as exc: # pragma: no cover - guard + errors.append(exc) + + t = threading.Thread(target=_runner) + t.start() + t.join() + assert errors == [] + + +@pytest.mark.asyncio +async def test_persist_artifacts_uses_atomic_write_for_summary_and_result( + tmp_path, monkeypatch +): + """FAIL-1: ``_persist_artifacts`` must route every artifact write through + ``os.replace`` (the tmp+rename atomic primitive). Spy ``os.replace`` and + confirm result.json, summary.txt, round_.json, run.log, baseline + prompts and best prompts all show up as replace targets.""" + import os as _os + + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "BASELINE"}, {"instruction": "IMPROVED"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + replaced: list[str] = [] + real_replace = _os.replace + + def _spy_replace(src, dst): + replaced.append(str(dst)) + return real_replace(src, dst) + + monkeypatch.setattr( + "trpc_agent_sdk.evaluation._agent_optimizer.os.replace", _spy_replace + ) + + output_dir = tmp_path / "runs" / "fail1_atomic" + await 
AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(output_dir), + verbose=0, + ) + + # Every persisted artifact must have gone through atomic rename. + replaced_names = {_os.path.basename(p) for p in replaced} + assert "result.json" in replaced_names + assert "summary.txt" in replaced_names + assert "run.log" in replaced_names + assert "config.snapshot.json" in replaced_names + # At least one round file and one baseline / best prompt. + assert any(n.startswith("round_") and n.endswith(".json") for n in replaced_names) + # No leftover .tmp files in output_dir tree. + leftover_tmps = list(output_dir.rglob("*.tmp")) + assert leftover_tmps == [], f"unexpected .tmp residue: {leftover_tmps}" + + +@pytest.mark.asyncio +async def test_persist_artifacts_masks_sigint_during_writes( + tmp_path, monkeypatch +): + """FAIL-1: while ``_persist_artifacts`` runs, SIGINT must be masked so a + panicked second Ctrl+C during teardown cannot interrupt artifact + writes mid-os.replace. 
We verify by checking ``signal.getsignal`` from + inside a spied-on artifact write.""" + import signal as _signal + + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "BASELINE"}, {"instruction": "IMPROVED"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + sigint_state_during_persist: list = [] + real_replace = __import__("os").replace + + def _spy_replace(src, dst): + sigint_state_during_persist.append(_signal.getsignal(_signal.SIGINT)) + return real_replace(src, dst) + + monkeypatch.setattr( + "trpc_agent_sdk.evaluation._agent_optimizer.os.replace", _spy_replace + ) + + original = _signal.getsignal(_signal.SIGINT) + try: + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "fail1_sigint"), + verbose=0, + ) + finally: + # Belt-and-suspenders restore in case the mask didn't unwind correctly. + _signal.signal(_signal.SIGINT, original) + + # Every replace observed during persistence saw SIGINT == SIG_IGN. 
+ assert sigint_state_during_persist, "expected at least one artifact write" + assert all( + state == _signal.SIG_IGN for state in sigint_state_during_persist + ), ( + "SIGINT must be masked during artifact persistence; observed handlers: " + f"{sigint_state_during_persist!r}" + ) + + # After optimize returns, the prior handler is restored. + assert _signal.getsignal(_signal.SIGINT) is original diff --git a/tests/evaluation/test_base_optimizer.py b/tests/evaluation/test_base_optimizer.py new file mode 100644 index 00000000..aaab841a --- /dev/null +++ b/tests/evaluation/test_base_optimizer.py @@ -0,0 +1,240 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for BaseOptimizer abstract interface.""" + +from __future__ import annotations + +from typing import Optional + +import pytest + +from trpc_agent_sdk.evaluation._base_optimizer import BaseOptimizer +from trpc_agent_sdk.evaluation._eval_callbacks import Callbacks +from trpc_agent_sdk.evaluation._optimize_config import OptimizeConfigFile +from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions +from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult +from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt + + +def _dummy_result() -> OptimizeResult: + return OptimizeResult( + algorithm="stub", + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.0, + best_pass_rate=0.0, + pass_rate_improvement=0.0, + total_rounds=0, + total_reflection_lm_calls=0, + total_judge_model_calls=0, + duration_seconds=0.0, + started_at="1970-01-01T00:00:00Z", + finished_at="1970-01-01T00:00:00Z", + ) + + +def _make_config() -> OptimizeConfigFile: + return OptimizeConfigFile.model_validate( + { + "evaluate": {"metrics": [{"metric_name": "x", "threshold": 0.7}]}, + "optimize": { + "algorithm": { + "name": 
"gepa_reflective", + "reflection_lm": OptimizeModelOptions( + model_name="m", api_key="k" + ).model_dump(), + "max_metric_calls": 10, + } + }, + } + ) + + +async def _noop_call_agent(query: str) -> str: + return "" + + +class _StubOptimizer(BaseOptimizer): + async def run(self) -> OptimizeResult: + return _dummy_result() + + +class _IncompleteOptimizer(BaseOptimizer): + """Subclass without implementing run().""" + + +def test_base_optimizer_cannot_instantiate_directly(tmp_path): + target_prompt = TargetPrompt().add_path("system_prompt", str(_seed_prompt(tmp_path))) + with pytest.raises(TypeError): + BaseOptimizer( + config=_make_config(), + call_agent=_noop_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(tmp_path / "train.json"), + validation_dataset_path=str(tmp_path / "val.json"), + ) + + +def test_base_optimizer_subclass_without_run_cannot_instantiate(tmp_path): + target_prompt = TargetPrompt().add_path("system_prompt", str(_seed_prompt(tmp_path))) + with pytest.raises(TypeError): + _IncompleteOptimizer( + config=_make_config(), + call_agent=_noop_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(tmp_path / "train.json"), + validation_dataset_path=str(tmp_path / "val.json"), + ) + + +def test_base_optimizer_stores_constructor_arguments(tmp_path): + seed_path = _seed_prompt(tmp_path) + target_prompt = TargetPrompt().add_path("system_prompt", str(seed_path)) + config = _make_config() + train_path = str(tmp_path / "train.json") + val_path = str(tmp_path / "val.json") + callbacks = Callbacks() + + optimizer = _StubOptimizer( + config=config, + call_agent=_noop_call_agent, + target_prompt=target_prompt, + train_dataset_path=train_path, + validation_dataset_path=val_path, + callbacks=callbacks, + ) + + assert optimizer.config is config + assert optimizer.call_agent is _noop_call_agent + assert optimizer.target_prompt is target_prompt + assert optimizer.train_dataset_path == train_path + assert optimizer.validation_dataset_path == 
val_path + assert optimizer.callbacks is callbacks + + +def test_base_optimizer_callbacks_default_to_none(tmp_path): + target_prompt = TargetPrompt().add_path("system_prompt", str(_seed_prompt(tmp_path))) + optimizer = _StubOptimizer( + config=_make_config(), + call_agent=_noop_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(tmp_path / "train.json"), + validation_dataset_path=str(tmp_path / "val.json"), + ) + assert optimizer.callbacks is None + + +def test_base_optimizer_rejects_positional_arguments(tmp_path): + target_prompt = TargetPrompt().add_path("system_prompt", str(_seed_prompt(tmp_path))) + with pytest.raises(TypeError): + _StubOptimizer( + _make_config(), + _noop_call_agent, + target_prompt, + str(tmp_path / "train.json"), + str(tmp_path / "val.json"), + ) + + +async def test_base_optimizer_run_is_async(): + import inspect + + assert inspect.iscoroutinefunction(BaseOptimizer.run) + + +def _seed_prompt(tmp_path): + seed = tmp_path / "system.md" + seed.write_text("you are a helpful assistant", encoding="utf-8") + return seed + + +# --------------------------------------------------------------------------- +# BaseOptimizer.resolve_required_thresholds +# --------------------------------------------------------------------------- + + +def _stop_cfg(required_metrics): + from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + return FrameworkStopConfig(required_metrics=required_metrics) + + +def test_resolve_required_thresholds_all_returns_full_dict(): + thresholds = {"m1": 0.5, "m2": 0.3} + assert ( + BaseOptimizer.resolve_required_thresholds(_stop_cfg("all"), thresholds) + == thresholds + ) + + +def test_resolve_required_thresholds_list_returns_subset(): + thresholds = {"m1": 0.5, "m2": 0.3, "m3": 0.9} + assert BaseOptimizer.resolve_required_thresholds( + _stop_cfg(["m1", "m3"]), thresholds + ) == {"m1": 0.5, "m3": 0.9} + + +def test_resolve_required_thresholds_none_returns_empty(): + assert ( + 
BaseOptimizer.resolve_required_thresholds(_stop_cfg(None), {"m1": 0.5}) + == {} + ) + + +def test_resolve_required_thresholds_empty_list_returns_empty(): + assert ( + BaseOptimizer.resolve_required_thresholds(_stop_cfg([]), {"m1": 0.5}) + == {} + ) + + +def test_resolve_required_thresholds_list_silently_drops_unknown_names(): + thresholds = {"m1": 0.5} + assert BaseOptimizer.resolve_required_thresholds( + _stop_cfg(["m1", "missing"]), thresholds + ) == {"m1": 0.5} + + +def test_resolve_required_thresholds_returns_copy_not_alias(): + thresholds = {"m1": 0.5} + out = BaseOptimizer.resolve_required_thresholds(_stop_cfg("all"), thresholds) + out["m1"] = 9.9 + assert thresholds["m1"] == 0.5 + + +# --------------------------------------------------------------------------- +# BaseOptimizer.metrics_meet_thresholds +# --------------------------------------------------------------------------- + + +def test_metrics_meet_thresholds_empty_required_returns_false(): + assert BaseOptimizer.metrics_meet_thresholds({"m1": 1.0}, {}) is False + + +def test_metrics_meet_thresholds_all_above_returns_true(): + assert ( + BaseOptimizer.metrics_meet_thresholds( + {"m1": 0.6, "m2": 0.4}, {"m1": 0.5, "m2": 0.3} + ) + is True + ) + + +def test_metrics_meet_thresholds_one_below_returns_false(): + assert ( + BaseOptimizer.metrics_meet_thresholds( + {"m1": 0.6, "m2": 0.2}, {"m1": 0.5, "m2": 0.3} + ) + is False + ) + + +def test_metrics_meet_thresholds_exact_match_returns_true(): + assert BaseOptimizer.metrics_meet_thresholds({"m1": 0.5}, {"m1": 0.5}) is True + + +def test_metrics_meet_thresholds_missing_breakdown_key_returns_false(): + assert BaseOptimizer.metrics_meet_thresholds({"m2": 0.9}, {"m1": 0.5}) is False diff --git a/tests/evaluation/test_optimize_config.py b/tests/evaluation/test_optimize_config.py new file mode 100644 index 00000000..f722a019 --- /dev/null +++ b/tests/evaluation/test_optimize_config.py @@ -0,0 +1,629 @@ +# Tencent is pleased to support the open source community by 
making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for OptimizeConfigFile and discriminated algorithm union.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest +from pydantic import ValidationError + +from trpc_agent_sdk.evaluation._eval_config import EvalConfig +from trpc_agent_sdk.evaluation._optimize_config import GepaReflectiveAlgo +from trpc_agent_sdk.evaluation._optimize_config import OptimizeConfig +from trpc_agent_sdk.evaluation._optimize_config import OptimizeConfigFile +from trpc_agent_sdk.evaluation._optimize_config import load_optimize_config +from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions + + +_VALID_REFLECTION_LM = { + "model_name": "gpt-4o", + "api_key": "opt-key", + "base_url": "https://api.example.com", + "generation_config": {"temperature": 0.2}, +} + + +def _evaluate_section_dict() -> dict: + return { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 0.7, + "criterion": {"finalResponse": {}}, + } + ], + "num_runs": 2, + } + + +def _gepa_algorithm_dict() -> dict: + return { + "name": "gepa_reflective", + "reflection_lm": _VALID_REFLECTION_LM, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "use_merge": False, + "max_merge_invocations": 5, + "skip_perfect_score": True, + "max_metric_calls": 50, + } + + +def _full_config_dict_gepa() -> dict: + return { + "evaluate": _evaluate_section_dict(), + "optimize": { + "eval_case_parallelism": 8, + "algorithm": {**_gepa_algorithm_dict(), "seed": 7}, + }, + } + + +def test_evaluate_section_is_plain_eval_config(): + payload = { + "evaluate": {"metrics": [{"metric_name": "x", "threshold": 0.7}], "num_runs": 1}, + "optimize": {"algorithm": _gepa_algorithm_dict()}, + } + cfg = OptimizeConfigFile.model_validate(payload) + assert type(cfg.evaluate) is EvalConfig + 
assert cfg.evaluate.num_runs == 1 + metrics = cfg.evaluate.get_eval_metrics() + assert len(metrics) == 1 + assert metrics[0].metric_name == "x" + + +def test_evaluate_section_rejects_unknown_field_via_eval_config_forbid(): + payload = { + "evaluate": { + "metrics": [{"metric_name": "x", "threshold": 0.7}], + "train_dataset_path": "unsupported", + }, + "optimize": {"algorithm": _gepa_algorithm_dict()}, + } + with pytest.raises(ValidationError): + OptimizeConfigFile.model_validate(payload) + + +def test_gepa_reflective_algo_minimal_required_fields(): + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="gpt-4o", api_key="k"), + max_metric_calls=10, + ) + assert algo.name == "gepa_reflective" + assert algo.reflection_lm.model_name == "gpt-4o" + assert algo.seed == 42 + assert algo.candidate_selection_strategy == "pareto" + assert algo.module_selector == "round_robin" + assert algo.frontier_type == "instance" + assert algo.use_merge is False + assert algo.max_merge_invocations == 5 + assert algo.merge_val_overlap_floor == 5 + assert algo.skip_perfect_score is True + assert algo.perfect_score == 1.0 + assert algo.cache_evaluation is False + assert algo.track_best_outputs is False + assert algo.reflection_minibatch_size is None + assert algo.max_metric_calls == 10 + assert algo.max_iterations_without_improvement is None + assert algo.timeout_seconds is None + assert algo.score_threshold is None + assert algo.max_candidate_proposals is None + assert algo.max_tracked_candidates is None + + +def test_gepa_reflective_algo_rejects_unknown_field(): + with pytest.raises(ValidationError): + GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + max_metric_calls=10, + typo_field=1, + ) + + +def test_gepa_reflective_algo_rejects_illegal_selection_strategy(): + with pytest.raises(ValidationError): + GepaReflectiveAlgo( + name="gepa_reflective", + 
reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + max_metric_calls=10, + candidate_selection_strategy="bogus", + ) + + +def test_gepa_reflective_algo_rejects_illegal_frontier_type(): + with pytest.raises(ValidationError): + GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + max_metric_calls=10, + frontier_type="something_else", + ) + + +def test_gepa_reflective_algo_requires_at_least_one_stop_condition(): + with pytest.raises(ValidationError) as exc_info: + GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + ) + assert "stop condition" in str(exc_info.value).lower() + + +@pytest.mark.parametrize( + "stop_field,stop_value", + [ + ("max_iterations_without_improvement", 3), + ("timeout_seconds", 10.0), + ("score_threshold", 0.95), + ("max_candidate_proposals", 25), + ("max_tracked_candidates", 32), + ], +) +def test_gepa_reflective_algo_accepts_any_single_stop_condition(stop_field, stop_value): + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + **{stop_field: stop_value}, + ) + assert getattr(algo, stop_field) == stop_value + assert algo.max_metric_calls is None + + +def test_optimize_config_requires_algorithm(): + with pytest.raises(ValidationError): + OptimizeConfig() + + +def test_optimize_config_routes_to_gepa_reflective(): + cfg = OptimizeConfig(algorithm=_gepa_algorithm_dict()) + assert isinstance(cfg.algorithm, GepaReflectiveAlgo) + assert cfg.algorithm.name == "gepa_reflective" + + +def test_optimize_config_rejects_unknown_algorithm_name(): + with pytest.raises(ValidationError) as exc_info: + OptimizeConfig( + algorithm={ + "name": "unknown_algo", + "reflection_lm": _VALID_REFLECTION_LM, + "max_metric_calls": 10, + } + ) + assert "unknown_algo" in str(exc_info.value) or "tag" in str(exc_info.value).lower() + + +def 
test_optimize_config_rejects_missing_algorithm_name(): + with pytest.raises(ValidationError): + OptimizeConfig(algorithm={"reflection_lm": _VALID_REFLECTION_LM}) + + +def test_optimize_config_rejects_unknown_top_level_field(): + with pytest.raises(ValidationError): + OptimizeConfig(algorithm=_gepa_algorithm_dict(), unknown_field="boom") + + +def test_optimize_config_seed_only_lives_under_algorithm(): + with pytest.raises(ValidationError): + OptimizeConfig.model_validate( + {"seed": 9, "algorithm": _gepa_algorithm_dict()} + ) + + cfg = OptimizeConfig.model_validate( + {"algorithm": {**_gepa_algorithm_dict(), "seed": 9}} + ) + assert isinstance(cfg.algorithm, GepaReflectiveAlgo) + assert cfg.algorithm.seed == 9 + + +def test_optimize_config_file_requires_both_sections(): + with pytest.raises(ValidationError): + OptimizeConfigFile() + with pytest.raises(ValidationError): + OptimizeConfigFile(optimize=OptimizeConfig(algorithm=_gepa_algorithm_dict())) + + +def test_optimize_config_file_rejects_unknown_top_level_field(): + with pytest.raises(ValidationError): + OptimizeConfigFile.model_validate( + { + "evaluate": _evaluate_section_dict(), + "optimize": {"algorithm": _gepa_algorithm_dict()}, + "unknown_extra": 1, + } + ) + + +def test_load_optimize_config_gepa_round_trip(tmp_path: Path): + cfg_path = tmp_path / "opt.json" + cfg_path.write_text(json.dumps(_full_config_dict_gepa()), encoding="utf-8") + + cfg = load_optimize_config(str(cfg_path)) + + assert isinstance(cfg, OptimizeConfigFile) + assert cfg.evaluate.num_runs == 2 + + metrics = cfg.evaluate.get_eval_metrics() + assert len(metrics) == 1 + assert metrics[0].metric_name == "final_response_avg_score" + + opt = cfg.optimize + assert opt.eval_case_parallelism == 8 + + assert isinstance(opt.algorithm, GepaReflectiveAlgo) + assert opt.algorithm.reflection_lm.model_name == "gpt-4o" + assert opt.algorithm.candidate_selection_strategy == "pareto" + assert opt.algorithm.module_selector == "round_robin" + assert 
opt.algorithm.seed == 7 + assert opt.algorithm.max_metric_calls == 50 + + +def test_load_optimize_config_missing_evaluate_section_raises(tmp_path: Path): + cfg_path = tmp_path / "no_evaluate.json" + cfg_path.write_text( + json.dumps({"optimize": {"algorithm": _gepa_algorithm_dict()}}), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_missing_optimize_section_raises(tmp_path: Path): + cfg_path = tmp_path / "no_optimize.json" + cfg_path.write_text( + json.dumps({"evaluate": _evaluate_section_dict()}), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_missing_algorithm_raises(tmp_path: Path): + cfg_path = tmp_path / "no_algo.json" + cfg_path.write_text( + json.dumps({"evaluate": _evaluate_section_dict(), "optimize": {}}), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_typo_in_evaluate_section_fails_fast(tmp_path: Path): + cfg_path = tmp_path / "typo_eval.json" + cfg_path.write_text( + json.dumps( + { + "evaluate": { + "mertics": [{"metric_name": "x", "threshold": 0.7}], + "num_runs": 1, + }, + "optimize": {"algorithm": _gepa_algorithm_dict()}, + } + ), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_typo_in_optimize_section_fails_fast(tmp_path: Path): + cfg_path = tmp_path / "typo_opt.json" + cfg_path.write_text( + json.dumps( + { + "evaluate": _evaluate_section_dict(), + "optimize": { + "maxRoundds": 5, + "algorithm": _gepa_algorithm_dict(), + }, + } + ), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_typo_in_algorithm_fails_fast(tmp_path: Path): + cfg_path = tmp_path / "typo_algo.json" + bad_algo = _gepa_algorithm_dict() + 
bad_algo["max_metricc_calls"] = 100 + cfg_path.write_text( + json.dumps({"evaluate": _evaluate_section_dict(), "optimize": {"algorithm": bad_algo}}), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_unknown_algorithm_name_fails_fast(tmp_path: Path): + cfg_path = tmp_path / "unknown_algo.json" + cfg_path.write_text( + json.dumps( + { + "evaluate": _evaluate_section_dict(), + "optimize": { + "algorithm": { + "name": "few_shot_bayesian", + "reflection_lm": _VALID_REFLECTION_LM, + "max_metric_calls": 10, + } + }, + } + ), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_missing_file_raises(tmp_path: Path): + with pytest.raises(FileNotFoundError): + load_optimize_config(str(tmp_path / "does_not_exist.json")) + + +def test_load_optimize_config_camel_case_keys_accepted(tmp_path: Path): + cfg_path = tmp_path / "camel.json" + payload = { + "evaluate": { + "metrics": [{"metricName": "x", "threshold": 0.6}], + "numRuns": 3, + }, + "optimize": { + "evalCaseParallelism": 5, + "algorithm": { + "name": "gepa_reflective", + "reflectionLm": {"modelName": "claude-3.5-sonnet", "apiKey": "k"}, + "candidateSelectionStrategy": "current_best", + "moduleSelector": "all", + "useMerge": True, + "maxMergeInvocations": 7, + "skipPerfectScore": False, + "maxMetricCalls": 30, + "maxIterationsWithoutImprovement": 2, + }, + }, + } + cfg_path.write_text(json.dumps(payload), encoding="utf-8") + cfg = load_optimize_config(str(cfg_path)) + assert cfg.evaluate.num_runs == 3 + assert cfg.optimize.eval_case_parallelism == 5 + assert isinstance(cfg.optimize.algorithm, GepaReflectiveAlgo) + algo = cfg.optimize.algorithm + assert algo.reflection_lm.model_name == "claude-3.5-sonnet" + assert algo.candidate_selection_strategy == "current_best" + assert algo.module_selector == "all" + assert algo.use_merge is True + assert 
algo.max_merge_invocations == 7 + assert algo.skip_perfect_score is False + assert algo.max_metric_calls == 30 + assert algo.max_iterations_without_improvement == 2 + + +def test_loaded_metrics_consumable_by_evaluator(tmp_path: Path): + from trpc_agent_sdk.evaluation import EvalMetric + + cfg_path = tmp_path / "opt.json" + cfg_path.write_text(json.dumps(_full_config_dict_gepa()), encoding="utf-8") + cfg = load_optimize_config(str(cfg_path)) + metrics = cfg.evaluate.get_eval_metrics() + for metric in metrics: + assert isinstance(metric, EvalMetric) + + +# --------------------------------------------------------------------------- +# FrameworkStopConfig +# --------------------------------------------------------------------------- + + +def test_framework_stop_config_default_required_metrics_is_all(): + from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + cfg = FrameworkStopConfig() + assert cfg.required_metrics == "all" + + +def test_framework_stop_config_accepts_metric_list(): + from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + cfg = FrameworkStopConfig(required_metrics=["m1", "m2"]) + assert cfg.required_metrics == ["m1", "m2"] + + +def test_framework_stop_config_accepts_none_to_disable(): + from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + cfg = FrameworkStopConfig(required_metrics=None) + assert cfg.required_metrics is None + + +def test_framework_stop_config_accepts_empty_list_to_disable(): + from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + cfg = FrameworkStopConfig(required_metrics=[]) + assert cfg.required_metrics == [] + + +def test_framework_stop_config_rejects_invalid_string(): + from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + with pytest.raises(ValidationError): + FrameworkStopConfig(required_metrics="not-all") + + +def test_framework_stop_config_rejects_unknown_field(): + from 
trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + with pytest.raises(ValidationError): + FrameworkStopConfig(required_metrics="all", typo_field=True) + + +# --------------------------------------------------------------------------- +# OptimizeConfig.stop wiring +# --------------------------------------------------------------------------- + + +def test_optimize_config_stop_defaults_to_required_metrics_all(): + cfg = OptimizeConfig(algorithm=_gepa_algorithm_dict()) + assert cfg.stop.required_metrics == "all" + + +def test_optimize_config_stop_explicit_list(): + cfg = OptimizeConfig.model_validate( + {"algorithm": _gepa_algorithm_dict(), "stop": {"required_metrics": ["m1"]}} + ) + assert cfg.stop.required_metrics == ["m1"] + + +def test_optimize_config_top_level_fields(): + cfg = OptimizeConfig(algorithm=_gepa_algorithm_dict()) + assert cfg.eval_case_parallelism == 4 + assert set(OptimizeConfig.model_fields.keys()) == { + "eval_case_parallelism", + "stop", + "algorithm", + } + + +# --------------------------------------------------------------------------- +# OptimizeConfigFile cross-field validator +# --------------------------------------------------------------------------- + + +def test_optimize_config_file_cross_field_rejects_unknown_required_metric(): + with pytest.raises(ValidationError) as exc_info: + OptimizeConfigFile.model_validate( + { + "evaluate": { + "metrics": [ + {"metric_name": "m1", "threshold": 0.5}, + ], + }, + "optimize": { + "algorithm": _gepa_algorithm_dict(), + "stop": {"required_metrics": ["m1", "bogus"]}, + }, + } + ) + assert "bogus" in str(exc_info.value) + + +def test_optimize_config_file_cross_field_accepts_known_required_metrics(): + cfg = OptimizeConfigFile.model_validate( + { + "evaluate": { + "metrics": [ + {"metric_name": "m1", "threshold": 0.5}, + {"metric_name": "m2", "threshold": 0.3}, + ], + }, + "optimize": { + "algorithm": _gepa_algorithm_dict(), + "stop": {"required_metrics": ["m1"]}, + }, + } + ) + 
assert cfg.optimize.stop.required_metrics == ["m1"] + + +def test_optimize_config_file_cross_field_skipped_when_required_metrics_is_all(): + cfg = OptimizeConfigFile.model_validate( + { + "evaluate": { + "metrics": [{"metric_name": "m1", "threshold": 0.5}], + }, + "optimize": { + "algorithm": _gepa_algorithm_dict(), + "stop": {"required_metrics": "all"}, + }, + } + ) + assert cfg.optimize.stop.required_metrics == "all" + + +def test_optimize_config_file_cross_field_skipped_when_required_metrics_is_none(): + cfg = OptimizeConfigFile.model_validate( + { + "evaluate": { + "metrics": [{"metric_name": "m1", "threshold": 0.5}], + }, + "optimize": { + "algorithm": _gepa_algorithm_dict(), + "stop": {"required_metrics": None}, + }, + } + ) + assert cfg.optimize.stop.required_metrics is None + + +def test_optimize_config_file_no_stop_block_defaults_to_all(): + cfg = OptimizeConfigFile.model_validate( + { + "evaluate": {"metrics": [{"metric_name": "m1", "threshold": 0.5}]}, + "optimize": {"algorithm": _gepa_algorithm_dict()}, + } + ) + assert cfg.optimize.stop.required_metrics == "all" + + +def test_load_optimize_config_with_stop_block_round_trip(tmp_path: Path): + payload = _full_config_dict_gepa() + payload["optimize"]["stop"] = { + "required_metrics": ["final_response_avg_score"] + } + cfg_path = tmp_path / "with_stop.json" + cfg_path.write_text(json.dumps(payload), encoding="utf-8") + cfg = load_optimize_config(str(cfg_path)) + assert cfg.optimize.stop.required_metrics == ["final_response_avg_score"] + + +def test_gepa_reflective_algo_reflection_history_top_k_default_is_two() -> None: + from trpc_agent_sdk.evaluation._optimize_config import GepaReflectiveAlgo + from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions + + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + ) + + assert algo.reflection_history_top_k == 2 + + +def 
test_gepa_reflective_algo_reflection_history_top_k_can_be_zero() -> None: + """K=0 disables the feature.""" + from trpc_agent_sdk.evaluation._optimize_config import GepaReflectiveAlgo + from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions + + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + reflection_history_top_k=0, + ) + + assert algo.reflection_history_top_k == 0 + + +def test_gepa_reflective_algo_reflection_history_top_k_rejects_six() -> None: + """Cap at 5 to bound prompt-token blow-up.""" + import pytest + from pydantic import ValidationError + + from trpc_agent_sdk.evaluation._optimize_config import GepaReflectiveAlgo + from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions + + with pytest.raises(ValidationError): + GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + reflection_history_top_k=6, + ) diff --git a/tests/evaluation/test_optimize_evaluator_call.py b/tests/evaluation/test_optimize_evaluator_call.py new file mode 100644 index 00000000..8dacb9ad --- /dev/null +++ b/tests/evaluation/test_optimize_evaluator_call.py @@ -0,0 +1,613 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Tests for the optimizer-facing evaluator call wrapper.""" + +from __future__ import annotations + +import pytest + +from trpc_agent_sdk.evaluation._eval_metrics import EvalStatus +from trpc_agent_sdk.evaluation._eval_result import EvalCaseResult +from trpc_agent_sdk.evaluation._eval_result import EvalMetricResult +from trpc_agent_sdk.evaluation._eval_result import EvalSetAggregateResult +from trpc_agent_sdk.evaluation._eval_result import EvaluateResult +from trpc_agent_sdk.evaluation._optimize_evaluator_call import EvaluationOutcome +from trpc_agent_sdk.evaluation._optimize_evaluator_call import run_evaluator +from trpc_agent_sdk.evaluation._optimize_evaluator_call import summarize_outcome + + +def _metric(name: str, score: float, status: EvalStatus = EvalStatus.PASSED) -> EvalMetricResult: + return EvalMetricResult( + metric_name=name, + threshold=0.5, + score=score, + eval_status=status, + ) + + +def _case( + eval_id: str, + final_status: EvalStatus, + metric_scores: dict[str, tuple[float, EvalStatus]], +) -> EvalCaseResult: + metrics = [_metric(n, s, st) for n, (s, st) in metric_scores.items()] + return EvalCaseResult( + eval_set_id="s1", + eval_id=eval_id, + final_eval_status=final_status, + overall_eval_metric_results=metrics, + eval_metric_result_per_invocation=[], + session_id=f"sess-{eval_id}", + ) + + +def _result(cases: list[EvalCaseResult], num_runs: int = 1) -> EvaluateResult: + by_id: dict[str, list[EvalCaseResult]] = {} + for c in cases: + by_id.setdefault(c.eval_id, []).append(c) + return EvaluateResult( + results_by_eval_set_id={ + "s1": EvalSetAggregateResult( + eval_results_by_eval_id=by_id, + num_runs=num_runs, + ), + } + ) + + +def test_summarize_outcome_all_passed_pass_rate_one(): + result = _result([ + _case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)}), + _case("c2", EvalStatus.PASSED, {"m": (0.95, EvalStatus.PASSED)}), + ]) + outcome = summarize_outcome(result) + assert outcome.pass_rate == 1.0 + assert 
outcome.failed_case_ids == [] + assert pytest.approx(outcome.tiebreaker) == (0.9 + 0.95) / 2 + + +def test_summarize_outcome_partial_pass_rate(): + result = _result([ + _case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)}), + _case("c2", EvalStatus.FAILED, {"m": (0.3, EvalStatus.FAILED)}), + _case("c3", EvalStatus.FAILED, {"m": (0.2, EvalStatus.FAILED)}), + _case("c4", EvalStatus.PASSED, {"m": (0.8, EvalStatus.PASSED)}), + ]) + outcome = summarize_outcome(result) + assert outcome.pass_rate == 0.5 + assert set(outcome.failed_case_ids) == {"c2", "c3"} + + +def test_summarize_outcome_empty_result_zero_pass_rate(): + outcome = summarize_outcome(EvaluateResult()) + assert outcome.pass_rate == 0.0 + assert outcome.tiebreaker == 0.0 + assert outcome.failed_case_ids == [] + assert outcome.metric_breakdown == {} + + +def test_summarize_outcome_metric_breakdown_averages_scores(): + result = _result([ + _case("c1", EvalStatus.PASSED, { + "metric_a": (0.8, EvalStatus.PASSED), + "metric_b": (0.6, EvalStatus.PASSED), + }), + _case("c2", EvalStatus.PASSED, { + "metric_a": (0.6, EvalStatus.PASSED), + "metric_b": (0.4, EvalStatus.PASSED), + }), + ]) + outcome = summarize_outcome(result) + assert pytest.approx(outcome.metric_breakdown["metric_a"]) == 0.7 + assert pytest.approx(outcome.metric_breakdown["metric_b"]) == 0.5 + + +def test_summarize_outcome_tiebreaker_is_mean_of_all_scores(): + result = _result([ + _case("c1", EvalStatus.PASSED, { + "metric_a": (1.0, EvalStatus.PASSED), + "metric_b": (0.0, EvalStatus.PASSED), + }), + ]) + outcome = summarize_outcome(result) + assert pytest.approx(outcome.tiebreaker) == 0.5 + + +def test_summarize_outcome_skips_none_scores(): + case = EvalCaseResult( + eval_set_id="s1", + eval_id="c1", + final_eval_status=EvalStatus.PASSED, + overall_eval_metric_results=[ + EvalMetricResult(metric_name="m", threshold=0.5, score=None, + eval_status=EvalStatus.NOT_EVALUATED), + EvalMetricResult(metric_name="m2", threshold=0.5, score=0.9, + 
eval_status=EvalStatus.PASSED), + ], + eval_metric_result_per_invocation=[], + session_id="x", + ) + outcome = summarize_outcome(_result([case])) + assert outcome.metric_breakdown == {"m2": 0.9} + assert pytest.approx(outcome.tiebreaker) == 0.9 + + +def test_summarize_outcome_multi_run_repeats_failed_id(): + failing = _case("c1", EvalStatus.FAILED, {"m": (0.2, EvalStatus.FAILED)}) + passing = _case("c2", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)}) + result = EvaluateResult( + results_by_eval_set_id={ + "s1": EvalSetAggregateResult( + eval_results_by_eval_id={ + "c1": [failing, failing], + "c2": [passing, passing], + }, + num_runs=2, + ), + } + ) + outcome = summarize_outcome(result) + assert outcome.pass_rate == 0.5 + assert outcome.failed_case_ids.count("c1") == 2 + + +def test_evaluation_outcome_is_immutable(): + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.6, + metric_breakdown={"m": 0.5}, + failed_case_ids=["c1"], + judge_model_calls=0, + raw_result=EvaluateResult(), + ) + try: + outcome.pass_rate = 1.0 # type: ignore[misc] + except Exception: + return + raise AssertionError("EvaluationOutcome should be frozen") + + +class _FakeExecuter: + def __init__(self, result: EvaluateResult) -> None: + self._result = result + self.evaluate_called = 0 + + async def evaluate(self) -> None: + self.evaluate_called += 1 + + def get_result(self) -> EvaluateResult: + return self._result + + +@pytest.mark.asyncio +async def test_run_evaluator_passes_through_call_agent_callbacks_num_runs(monkeypatch): + captured: dict = {} + fake_result = _result([ + _case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)}), + _case("c2", EvalStatus.FAILED, {"m": (0.2, EvalStatus.FAILED)}), + ]) + + def fake_get_executer(eval_dataset_file_path_or_dir, **kwargs): + captured["eval_dataset_path"] = eval_dataset_file_path_or_dir + captured.update(kwargs) + return _FakeExecuter(fake_result) + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + 
monkeypatch.setattr(mod.AgentEvaluator, "get_executer", fake_get_executer) + + async def call_agent(q: str) -> str: + return "x" + + sentinel_callbacks = object() + + outcome = await run_evaluator( + eval_dataset_path="/tmp/some_set.evalset.json", + eval_metrics_path="/tmp/metrics.json", + call_agent=call_agent, + callbacks=sentinel_callbacks, # type: ignore[arg-type] + num_runs=3, + ) + + assert captured["eval_dataset_path"] == "/tmp/some_set.evalset.json" + assert captured["eval_metrics_file_path_or_dir"] == "/tmp/metrics.json" + assert captured["call_agent"] is call_agent + assert captured["callbacks"] is sentinel_callbacks + assert captured["num_runs"] == 3 + assert captured["print_detailed_results"] is False + assert captured["eval_result_output_dir"] is None + + assert outcome.pass_rate == 0.5 + assert outcome.failed_case_ids == ["c2"] + assert outcome.raw_result is fake_result + + +@pytest.mark.asyncio +async def test_run_evaluator_forwards_case_parallelism(monkeypatch): + """spec §3.2: optimize.eval_case_parallelism must reach AgentEvaluator.get_executer.""" + captured: dict = {} + fake_result = _result([_case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)})]) + + def fake_get_executer(eval_dataset_file_path_or_dir, **kwargs): + captured.update(kwargs) + return _FakeExecuter(fake_result) + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr(mod.AgentEvaluator, "get_executer", fake_get_executer) + + async def call_agent(q: str) -> str: + return "x" + + await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + num_runs=1, + case_parallelism=8, + ) + + assert captured["case_parallelism"] == 8 + + +@pytest.mark.asyncio +async def test_run_evaluator_forwards_print_summary_report_false(monkeypatch): + """Optimizer must keep the evaluator silent so its summary table never + collides with the reporter timeline.""" + captured: dict = {} + 
fake_result = _result([_case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)})]) + + def fake_get_executer(eval_dataset_file_path_or_dir, **kwargs): + captured.update(kwargs) + return _FakeExecuter(fake_result) + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr(mod.AgentEvaluator, "get_executer", fake_get_executer) + + async def call_agent(q: str) -> str: + return "x" + + await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + ) + + assert captured["print_detailed_results"] is False + assert captured["print_summary_report"] is False + + +class _AssertingExecuter: + """Mimics AgentEvaluator's pytest-style fail-fast on case failure.""" + + def __init__(self, result: EvaluateResult, message: str) -> None: + self._result = result + self._message = message + + async def evaluate(self) -> None: + from trpc_agent_sdk.evaluation._agent_evaluator import _EvaluationCasesFailed + raise _EvaluationCasesFailed(self._message) + + def get_result(self) -> EvaluateResult: + return self._result + + +@pytest.mark.asyncio +async def test_run_evaluator_swallows_evaluator_assertion_and_returns_outcome(monkeypatch): + fake_result = _result([ + _case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)}), + _case("c2", EvalStatus.FAILED, {"m": (0.2, EvalStatus.FAILED)}), + ]) + + def fake_get_executer(eval_dataset_file_path_or_dir, **kwargs): + return _AssertingExecuter(fake_result, "case c2 failed") + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr(mod.AgentEvaluator, "get_executer", fake_get_executer) + + async def call_agent(q: str) -> str: + return "x" + + outcome = await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + ) + + assert outcome.pass_rate == 0.5 + assert outcome.failed_case_ids == ["c2"] + assert outcome.raw_result is 
fake_result + + +@pytest.mark.asyncio +async def test_run_evaluator_returns_empty_outcome_when_assertion_loses_result(monkeypatch): + class _LostResultExecuter: + async def evaluate(self) -> None: + from trpc_agent_sdk.evaluation._agent_evaluator import _EvaluationCasesFailed + raise _EvaluationCasesFailed("boom before result populated") + + def get_result(self): + return None + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr( + mod.AgentEvaluator, "get_executer", lambda *a, **k: _LostResultExecuter() + ) + + async def call_agent(q: str) -> str: + return "x" + + outcome = await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + ) + + assert outcome.pass_rate == 0.0 + assert outcome.failed_case_ids == [] + + +@pytest.mark.asyncio +async def test_run_evaluator_does_not_swallow_unrelated_assertion_error(monkeypatch): + """FAIL-3: only ``_EvaluationCasesFailed`` is the business signal. + + Third-party / SDK-internal ``AssertionError`` (numpy ``assert_allclose``, + invariant self-checks, ...) must NOT be silently consumed — that would + hide real bugs behind a 0.0 pass_rate and let the optimizer continue + training against phantom data. + """ + class _BuggyExecuter: + async def evaluate(self) -> None: + # Stand-in for an unrelated assertion failure inside the evaluator + # (e.g. a numpy invariant check, a library bug). 
+ raise AssertionError("invariant violated: this is NOT a case-failure signal") + + def get_result(self): # pragma: no cover - never reached + return None + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr( + mod.AgentEvaluator, "get_executer", lambda *a, **k: _BuggyExecuter() + ) + + async def call_agent(q: str) -> str: + return "x" + + with pytest.raises(AssertionError, match="invariant violated"): + await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + ) + + +@pytest.mark.asyncio +async def test_run_evaluator_propagates_real_upstream_error(monkeypatch): + """FAIL-3: real upstream errors (FileNotFoundError, network, ...) must + propagate, not be silently turned into an empty outcome. + + The pre-fix code had ``try / except AssertionError / finally: + result = get_result()`` which masked any non-Assertion exception too if + the executer's ``get_result()`` returned None — actually it re-raised, + but the optimizer downstream had no way to distinguish "all cases + silently failed" from "evalset file missing on disk". The post-fix code + propagates these to ``AgentOptimizer.optimize()`` ``run_error`` path so + the run terminates with status=FAILED and the cause is preserved in + ``summary.txt`` rather than silently producing 0.0 pass_rate. 
+ """ + class _BrokenExecuter: + async def evaluate(self) -> None: + raise FileNotFoundError("dataset.evalset.json") + + def get_result(self): # pragma: no cover - never reached + return None + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr( + mod.AgentEvaluator, "get_executer", lambda *a, **k: _BrokenExecuter() + ) + + async def call_agent(q: str) -> str: + return "x" + + with pytest.raises(FileNotFoundError, match="dataset.evalset.json"): + await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + ) + + +def test_evaluation_cases_failed_is_assertion_error_subclass(): + """FAIL-3: ``_EvaluationCasesFailed`` MUST remain an ``AssertionError`` + subclass so direct ``AgentEvaluator.evaluate()`` callers (e.g. + ``examples/optimization/ci_integration/tests/test_agent_quality.py``) + can keep using ``except AssertionError`` / pytest's native AssertionError + rendering for JUnit XML output without any change.""" + from trpc_agent_sdk.evaluation._agent_evaluator import _EvaluationCasesFailed + err = _EvaluationCasesFailed("failure summary json") + assert isinstance(err, AssertionError) + # Message identity matters for JUnit XML stability. + assert str(err) == "failure summary json" + + +@pytest.mark.asyncio +async def test_eval_executer_raises_evaluation_cases_failed_on_case_failure(tmp_path, monkeypatch): + """FAIL-3 end-to-end: ``_EvalExecuter._run`` MUST raise + ``_EvaluationCasesFailed`` (NOT a bare ``assert False``) when any case + fails. Replacing the bare assert with a real ``raise`` keeps the signal + alive under ``python -O`` — which strips ``assert`` statements — and + avoids piggy-backing business control flow on Python's invariant-check + mechanism. 
+ + We monkeypatch ``evaluate_eval_set`` so this test does not need a real + LLM / runner: the test verifies the post-loop branch in ``_run`` that + converts ``all_failures`` into ``_EvaluationCasesFailed``. + """ + import json as _json + + from trpc_agent_sdk.evaluation._agent_evaluator import ( + AgentEvaluator as _Eval, + ) + from trpc_agent_sdk.evaluation._agent_evaluator import ( + _EvaluationCasesFailed, + ) + from trpc_agent_sdk.evaluation._eval_case import EvalCase + from trpc_agent_sdk.evaluation._eval_case import Invocation + from trpc_agent_sdk.evaluation._eval_config import EvalConfig + from trpc_agent_sdk.evaluation._eval_set import EvalSet + from trpc_agent_sdk.types import Content + from trpc_agent_sdk.types import Part + + # Build the smallest possible evalset on disk so _run can load it. + eval_set = EvalSet( + eval_set_id="es_fail3", + eval_cases=[ + EvalCase( + eval_id="c1", + conversation=[ + Invocation( + user_content=Content( + role="user", parts=[Part.from_text(text="hi")] + ), + final_response=Content( + role="model", parts=[Part.from_text(text="ack")] + ), + ) + ], + ) + ], + ) + evalset_path = tmp_path / "tiny.evalset.json" + evalset_path.write_text(eval_set.model_dump_json(), encoding="utf-8") + config_path = tmp_path / "test_config.json" + config_path.write_text( + EvalConfig(criteria={"final_response_avg_score": 0.5}).model_dump_json(), + encoding="utf-8", + ) + + async def fake_evaluate_eval_set(eval_set_arg, **kwargs): + # Pretend case c1 failed with a structured summary. 
+ failed_summary = { + "overallStatus": "failed", + "evalCases": [{"evalCaseId": "c1", "overallStatus": "failed"}], + } + return failed_summary, [], [], {"c1": []} + + monkeypatch.setattr(_Eval, "evaluate_eval_set", staticmethod(fake_evaluate_eval_set)) + + async def call_agent(query: str) -> str: + return "ack" + + executer = _Eval.get_executer( + str(evalset_path), + call_agent=call_agent, + print_summary_report=False, + print_detailed_results=False, + ) + + with pytest.raises(_EvaluationCasesFailed) as excinfo: + await executer.evaluate() + + # The error message is the JSON-encoded failure summary — pytest renders + # this verbatim in JUnit XML, so existing CI dashboards keep working. + parsed = _json.loads(str(excinfo.value)) + assert parsed[0]["evalSetId"] == "es_fail3" + assert parsed[0]["summary"]["overallStatus"] == "failed" + + # Back-compat: ``isinstance(err, AssertionError)`` MUST stay True so + # ``examples/optimization/ci_integration`` (``except AssertionError``) + # works unchanged. + assert isinstance(excinfo.value, AssertionError) + + # The result was populated BEFORE the raise (line ordering in _run); + # callers can recover the EvaluateResult even on the failure path. + assert executer.get_result() is not None + + +@pytest.mark.asyncio +async def test_eval_executer_signal_survives_python_O_mode(tmp_path, monkeypatch): + """FAIL-3 python -O coverage: ``_run`` MUST NOT use ``assert`` for the + business signal. We can't actually run pytest under ``-O`` here, but + we can prove the signal does not depend on assertions by checking the + source code contains ``raise _EvaluationCasesFailed`` and NOT + ``assert False`` in the case-failure branch. + + A grep-style guard test is overkill for most things, but ``python -O`` + failures are notoriously hard to reproduce and were the exact root + cause of FAIL-3 — pinning the implementation contract here prevents + a careless future rewrite from reintroducing the bug. 
+ """ + import ast + from pathlib import Path + + source = Path( + "trpc_agent_sdk/evaluation/_agent_evaluator.py" + ).read_text(encoding="utf-8") + assert "raise _EvaluationCasesFailed(combined)" in source, ( + "_run must raise _EvaluationCasesFailed for the case-failure signal" + ) + # Parse the AST and walk every Assert node inside _EvalExecuter._run; + # there MUST be none — case failure must be raised, not asserted. + tree = ast.parse(source) + run_method = None + for node in ast.walk(tree): + if ( + isinstance(node, ast.ClassDef) + and node.name == "_EvalExecuter" + ): + for sub in node.body: + if isinstance(sub, ast.AsyncFunctionDef) and sub.name == "_run": + run_method = sub + break + assert run_method is not None, "could not locate _EvalExecuter._run" + asserts_in_run = [ + n for n in ast.walk(run_method) if isinstance(n, ast.Assert) + ] + assert asserts_in_run == [], ( + f"_EvalExecuter._run MUST NOT contain any ``assert`` statements " + f"(stripped by python -O); found {len(asserts_in_run)} " + f"at lines {[a.lineno for a in asserts_in_run]}" + ) + + +@pytest.mark.asyncio +async def test_run_evaluator_default_num_runs_is_one(monkeypatch): + captured: dict = {} + fake_result = _result([_case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)})]) + + def fake_get_executer(eval_dataset_file_path_or_dir, **kwargs): + captured.update(kwargs) + return _FakeExecuter(fake_result) + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr(mod.AgentEvaluator, "get_executer", fake_get_executer) + + async def call_agent(q: str) -> str: + return "x" + + await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + ) + + assert captured["num_runs"] == 1 + assert captured["callbacks"] is None + assert captured["eval_metrics_file_path_or_dir"] is None diff --git a/tests/evaluation/test_optimize_gepa_adapter.py b/tests/evaluation/test_optimize_gepa_adapter.py new 
file mode 100644 index 00000000..e497c74b --- /dev/null +++ b/tests/evaluation/test_optimize_gepa_adapter.py @@ -0,0 +1,1748 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for the gepa protocol adapter and trajectory/feedback helpers.""" + +from __future__ import annotations + +from typing import Optional + +import pytest + +from trpc_agent_sdk.evaluation._eval_case import EvalCase +from trpc_agent_sdk.evaluation._eval_case import Invocation +from trpc_agent_sdk.evaluation._eval_config import EvalConfig +from trpc_agent_sdk.evaluation._eval_metrics import EvalStatus +from trpc_agent_sdk.evaluation._eval_result import EvalCaseResult +from trpc_agent_sdk.evaluation._eval_result import EvalMetricResult +from trpc_agent_sdk.evaluation._eval_result import EvalMetricResultDetails +from trpc_agent_sdk.evaluation._eval_result import EvalMetricResultPerInvocation +from trpc_agent_sdk.evaluation._eval_result import EvalSetAggregateResult +from trpc_agent_sdk.evaluation._eval_result import EvaluateResult +from trpc_agent_sdk.evaluation._optimize_evaluator_call import EvaluationOutcome +from trpc_agent_sdk.evaluation._optimize_gepa_adapter import _AgentGEPAAdapter +from trpc_agent_sdk.evaluation._optimize_gepa_adapter import _extract_case_output +from trpc_agent_sdk.evaluation._optimize_gepa_adapter import _render_metric_lines +from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + + +def _invocation(user_text: str, response_text: Optional[str] = None) -> Invocation: + final_response = ( + Content(role="model", parts=[Part.from_text(text=response_text)]) + if response_text is not None + else None + ) + return Invocation( + user_content=Content(role="user", parts=[Part.from_text(text=user_text)]), + 
final_response=final_response, + ) + + +def _eval_case(eval_id: str = "c1", user: str = "hi", expected: str = "ack") -> EvalCase: + return EvalCase( + eval_id=eval_id, + conversation=[_invocation(user, expected)], + ) + + +def _case_result( + eval_id: str, + *, + status: EvalStatus, + metric_score: float, + actual: str, + expected: str = "ack", + reason: Optional[str] = None, + error_message: Optional[str] = None, +) -> EvalCaseResult: + details = EvalMetricResultDetails(reason=reason, score=metric_score) if reason else None + return EvalCaseResult( + eval_id=eval_id, + eval_set_id="optimize_gepa_batch", + final_eval_status=status, + error_message=error_message, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="m1", + threshold=0.7, + score=metric_score, + eval_status=status, + details=details, + ) + ], + eval_metric_result_per_invocation=[ + EvalMetricResultPerInvocation( + actual_invocation=_invocation("hi", actual), + expected_invocation=_invocation("hi", expected), + eval_metric_results=[], + ) + ], + session_id=f"sess-{eval_id}", + ) + + +def _evaluate_result(case_results_by_id: dict[str, list[EvalCaseResult]]) -> EvaluateResult: + return EvaluateResult( + results_by_eval_set_id={ + "optimize_gepa_batch": EvalSetAggregateResult( + eval_results_by_eval_id=case_results_by_id, + num_runs=1, + ) + } + ) + + +async def _stub_call_agent(query: str) -> str: + return "stub" + + +def _eval_config() -> EvalConfig: + return EvalConfig(metrics=[{"metric_name": "m1", "threshold": 0.7}], num_runs=1) + + +def _new_target_prompt(write_recorder: Optional[dict[str, str]] = None) -> TargetPrompt: + target = TargetPrompt() + recorder = write_recorder if write_recorder is not None else {} + + async def read_cb() -> str: + return recorder.get("instruction", "initial") + + async def write_cb(value: str) -> None: + recorder["instruction"] = value + + target.add_callback("instruction", read=read_cb, write=write_cb) + return target + + +def 
_multi_component_target_prompt(component_names: list[str]) -> TargetPrompt: + """Register one callback per requested component. + + Each callback writes into an isolated dict so ``write_all`` succeeds for + any candidate whose keys exactly match ``component_names``. Used by + multi-component reflective-dataset tests to exercise the + ``Other Active Components`` injection path. + """ + target = TargetPrompt() + storage: dict[str, str] = {name: "" for name in component_names} + + def _make_pair(name: str): + async def read_cb() -> str: + return storage[name] + + async def write_cb(value: str) -> None: + storage[name] = value + + return read_cb, write_cb + + for name in component_names: + read_cb, write_cb = _make_pair(name) + target.add_callback(name, read=read_cb, write=write_cb) + return target + + +def _patch_run_evaluator(monkeypatch, outcome: EvaluationOutcome) -> dict[str, dict]: + captured: dict[str, dict] = {} + + async def fake_run_evaluator(**kwargs): + captured["kwargs"] = kwargs + eval_dataset_path = kwargs.get("eval_dataset_path") + if eval_dataset_path: + from pathlib import Path + import json + payload = json.loads(Path(eval_dataset_path).read_text(encoding="utf-8")) + captured.setdefault("evalset_id_history", []).append(payload["eval_set_id"]) + captured.setdefault("evalset_payload_history", []).append(payload) + return outcome + + monkeypatch.setattr( + "trpc_agent_sdk.evaluation._optimize_gepa_adapter.run_evaluator", + fake_run_evaluator, + ) + return captured + + +def _make_adapter(target: Optional[TargetPrompt] = None, num_runs: int = 1) -> _AgentGEPAAdapter: + return _AgentGEPAAdapter( + target_prompt=target or _new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=num_runs, + ) + + +def test_extract_case_output_reads_first_invocation_final_response(): + case_result = _case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="output text") + assert 
_extract_case_output(case_result) == "output text" + + +def test_extract_case_output_returns_empty_when_no_invocation(): + case_result = EvalCaseResult( + eval_id="c1", + final_eval_status=EvalStatus.FAILED, + overall_eval_metric_results=[], + eval_metric_result_per_invocation=[], + session_id="s", + ) + assert _extract_case_output(case_result) == "" + + +# --------------------------------------------------------------------------- +# ``_render_metric_lines`` is the core verdict-line renderer used by every +# Case Body block (per-turn + Overall). Tests below pin the structural +# guarantees the reflection LM relies on: PASS/FAIL labelling, threshold +# emission, judge-vs-synthesized reason precedence, and rubric breakdown. +# --------------------------------------------------------------------------- + + +def _failed_final_response_metric( + *, + text: Optional[dict] = None, + json_cfg: Optional[dict] = None, +) -> EvalMetricResult: + """Build a FAILED final_response_avg_score metric WITHOUT details.reason, + mirroring what the real ``_final_response_evaluator`` actually emits. + Used to exercise the deterministic-reason synthesis path.""" + criterion: dict = {"final_response": {}} + if text is not None: + criterion["final_response"]["text"] = text + if json_cfg is not None: + criterion["final_response"]["json"] = json_cfg + return EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=0.0, + eval_status=EvalStatus.FAILED, + criterion=criterion, + details=None, + ) + + +def test_render_metric_lines_emits_pass_fail_status_with_threshold_and_score(): + """Each metric occupies one line in the form + ``[PASSED|FAILED] : score=, threshold=``. 
The + reflection LM uses these markers to (a) decide which metrics to keep + constraints for, (b) tell which metric is being judged.""" + lines = _render_metric_lines( + [ + EvalMetricResult( + metric_name="m_pass", + threshold=0.7, + score=0.95, + eval_status=EvalStatus.PASSED, + ), + EvalMetricResult( + metric_name="m_fail", + threshold=0.7, + score=0.10, + eval_status=EvalStatus.FAILED, + ), + ] + ) + assert "[PASSED] m_pass: score=0.9500, threshold=0.7000" in lines + assert "[FAILED] m_fail: score=0.1000, threshold=0.7000" in lines + + +def test_render_metric_lines_uses_explicit_judge_reason(): + """LLM-judged metrics already carry a natural-language reason in + ``details.reason``; that reason is surfaced verbatim under the verdict + line so the LM sees the judge's actual diagnosis.""" + lines = _render_metric_lines( + [ + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.5, + score=0.0, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails( + score=0.0, reason="judge said: missing units" + ), + ) + ] + ) + assert any("reason: judge said: missing units" in line for line in lines) + + +def test_render_metric_lines_synthesizes_reason_for_failing_contains_match(): + """Real deterministic matchers leave ``details.reason`` empty. 
We synth + a one-line failure explanation from the criterion config so the LM + sees WHY a substring match failed without diffing two long strings.""" + lines = _render_metric_lines( + [ + _failed_final_response_metric( + text={"match": "contains", "case_insensitive": True} + ) + ] + ) + joined = "\n".join(lines) + assert "expected substring not contained" in joined + assert "case-insensitive" in joined + + +def test_render_metric_lines_synthesizes_reason_for_failing_exact_match(): + lines = _render_metric_lines( + [_failed_final_response_metric(text={"match": "exact"})] + ) + joined = "\n".join(lines) + assert "byte-equal" in joined + assert "case-sensitive" in joined + + +def test_render_metric_lines_synthesizes_reason_for_failing_regex_match(): + lines = _render_metric_lines( + [_failed_final_response_metric(text={"match": "regex"})] + ) + assert any("regex" in line for line in lines) + + +def test_render_metric_lines_synthesizes_combined_text_and_json_failure(): + """When a metric runs BOTH text AND json checks the synthesized reason + must say so (joined with AND), otherwise the LM cannot tell which half + of the combined check failed.""" + lines = _render_metric_lines( + [ + _failed_final_response_metric( + text={"match": "exact"}, + json_cfg={"number_tolerance": 0.01}, + ) + ] + ) + joined = "\n".join(lines) + assert "byte-equal" in joined + assert "JSON" in joined + assert "AND" in joined + + +def test_render_metric_lines_no_reason_for_passing_deterministic_metric(): + """Passing metrics with no explicit reason emit no ``reason:`` line — + we only synthesize failure explanations, never invent praise.""" + lines = _render_metric_lines( + [ + EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=1.0, + eval_status=EvalStatus.PASSED, + criterion={"final_response": {"text": {"match": "contains"}}}, + details=None, + ) + ] + ) + assert not any("reason:" in line for line in lines) + + +def 
test_render_metric_lines_keeps_explicit_reason_over_synthesis(): + """When details.reason IS present, the explicit text wins — never + overwritten by synthesized criterion text. Guards against an LLM + judge's nuanced verdict being clobbered by template-generated wording.""" + lines = _render_metric_lines( + [ + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.5, + score=0.0, + eval_status=EvalStatus.FAILED, + criterion={"llm_judge": {"judge_model": {"model_name": "j1"}}}, + details=EvalMetricResultDetails( + score=0.0, reason="judge said: missing units" + ), + ) + ] + ) + joined = "\n".join(lines) + assert "judge said: missing units" in joined + assert "byte-equal" not in joined + assert "expected substring not contained" not in joined + + +def test_render_metric_lines_expands_rubric_sub_scores(): + """LLM rubric metrics carry per-rubric sub-scores; each rubric must + surface as its own `` · rubric[]: PASS|FAIL ...`` line so the LM + knows which sub-quality is responsible for the verdict.""" + from trpc_agent_sdk.evaluation._llm_criterion import RubricScore + + lines = _render_metric_lines( + [ + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.66, + score=0.6667, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails( + score=0.6667, + reason="2/3 rubrics passed", + rubric_scores=[ + RubricScore(id="numeric_correct", score=1.0, reason="answer matches"), + RubricScore(id="reasoning_clear", score=0.0, reason="no calculation steps shown"), + RubricScore(id="units_present", score=1.0, reason="unit present"), + ], + ), + ) + ] + ) + joined = "\n".join(lines) + assert "rubric[numeric_correct]: PASS score=1.00" in joined + assert "rubric[reasoning_clear]: FAIL score=0.00" in joined + assert "rubric[units_present]: PASS score=1.00" in joined + assert "answer matches" in joined + assert "no calculation steps shown" in joined + assert "unit present" in joined + + +def test_adapter_constructor_stores_dependencies(): + 
target = _new_target_prompt() + config = _eval_config() + adapter = _AgentGEPAAdapter( + target_prompt=target, + eval_config=config, + call_agent=_stub_call_agent, + callbacks=None, + num_runs=3, + ) + assert adapter.target_prompt is target + assert adapter.eval_config is config + assert adapter.num_runs == 3 + + +def test_evaluate_writes_candidate_to_target_prompt(monkeypatch): + case = _eval_case("c1") + outcome = EvaluationOutcome( + pass_rate=1.0, + tiebreaker=0.9, + raw_result=_evaluate_result({"c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + recorder: dict[str, str] = {} + target = _new_target_prompt(recorder) + adapter = _make_adapter(target) + + adapter.evaluate(batch=[case], candidate={"instruction": "new prompt text"}) + assert recorder.get("instruction") == "new prompt text" + + +def test_evaluate_passes_correct_kwargs_to_run_evaluator(monkeypatch): + case = _eval_case("c1") + outcome = EvaluationOutcome( + pass_rate=1.0, + tiebreaker=0.9, + raw_result=_evaluate_result({"c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")]}), + ) + captured = _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter(num_runs=2) + adapter.evaluate(batch=[case], candidate={"instruction": "x"}) + + kwargs = captured["kwargs"] + # The adapter wraps call_agent in a one-shot return-type sentinel + # (API-A2 fix), so identity equality with the user-provided callable + # no longer holds. Verify the wrapped callable is async and forwards + # the original return value. 
+ import asyncio as _asyncio + import inspect as _inspect + forwarded = kwargs["call_agent"] + assert _inspect.iscoroutinefunction(forwarded) + assert _asyncio.run(forwarded("ping")) == "stub" + assert kwargs["num_runs"] == 2 + assert kwargs["callbacks"] is None + assert kwargs["eval_dataset_path"].endswith(".evalset.json") + assert kwargs["eval_metrics_path"].endswith(".metrics.json") + + +def test_evaluate_scores_reflect_continuous_metric_means(monkeypatch): + """case_score must equal the mean of each metric's continuous score — + NOT a binary pass/fail collapse — so GEPA can distinguish candidates + whose metrics differ in degree but share pass/fail labels.""" + cases = [_eval_case("c1"), _eval_case("c2"), _eval_case("c3")] + outcome = EvaluationOutcome( + pass_rate=1 / 3, + tiebreaker=0.5, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")], + "c2": [_case_result("c2", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong")], + "c3": [_case_result("c3", status=EvalStatus.FAILED, metric_score=0.4, actual="bad")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}) + + assert batch_obj.scores == pytest.approx([0.9, 0.3, 0.4]) + assert len(batch_obj.outputs) == 3 + assert batch_obj.outputs[0] == "ack" + assert batch_obj.outputs[1] == "wrong" + + +def test_evaluate_with_num_runs_averages_continuous_metric_scores(monkeypatch): + """With num_runs > 1, case_score = mean over runs of mean over metrics — + no binary pass-count collapse.""" + cases = [_eval_case("c1")] + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.5, + raw_result=_evaluate_result({ + "c1": [ + _case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ok"), + _case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="bad"), + ], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + 
adapter = _make_adapter(num_runs=2) + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}) + + # mean([mean([0.9]), mean([0.3])]) = mean([0.9, 0.3]) = 0.6 + assert batch_obj.scores == pytest.approx([0.6]) + + +def test_evaluate_case_score_averages_across_multiple_metrics(monkeypatch): + """When a case carries multiple metrics, case_score = mean of metric scores. + + This is the property GEPA relies on to break ties between candidates that + agree on the binary PASS/FAIL bucket but differ in degree (e.g. one keeps + rubric quality at 1.0 while the other regresses to 0.33).""" + case_result = EvalCaseResult( + eval_id="c_multi", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.PASSED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=1.0, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=1.0), + ), + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.66, + score=0.3333, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(score=0.3333), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c_multi", + ) + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.7, + raw_result=_evaluate_result({"c_multi": [case_result]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c_multi")], candidate={"instruction": "x"} + ) + + # mean([1.0, 0.3333]) ≈ 0.6667; binary collapse would have produced 0.0 (failed) + assert batch_obj.scores == pytest.approx([0.66665], rel=1e-3) + + +def test_evaluate_populates_objective_scores_per_metric_per_case(monkeypatch): + """objective_scores must be a list aligned with batch order; each entry is + a {metric_name: score} dict — this is the channel GEPA needs to track a + per-objective Pareto frontier.""" + case_1 = EvalCaseResult( + eval_id="c1", + eval_set_id="optimize_gepa_batch", 
+ final_eval_status=EvalStatus.PASSED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=1.0, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=1.0), + ), + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.66, + score=0.6667, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=0.6667), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c1", + ) + case_2 = EvalCaseResult( + eval_id="c2", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.FAILED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=0.0, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(score=0.0), + ), + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.66, + score=1.0, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=1.0), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c2", + ) + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.6, + raw_result=_evaluate_result({"c1": [case_1], "c2": [case_2]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c1"), _eval_case("c2")], + candidate={"instruction": "x"}, + ) + + assert batch_obj.objective_scores is not None + assert len(batch_obj.objective_scores) == 2 + assert batch_obj.objective_scores[0]["final_response_avg_score"] == pytest.approx(1.0) + assert batch_obj.objective_scores[0]["llm_rubric_response"] == pytest.approx(0.6667, rel=1e-3) + assert batch_obj.objective_scores[1]["final_response_avg_score"] == pytest.approx(0.0) + assert batch_obj.objective_scores[1]["llm_rubric_response"] == pytest.approx(1.0) + + +def test_evaluate_objective_scores_average_across_num_runs(monkeypatch): + """When num_runs > 1, each metric's score in objective_scores must 
be the + mean of its scores across runs — keeping per-objective signal continuous.""" + run_1 = EvalCaseResult( + eval_id="c1", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.PASSED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="m1", threshold=0.7, score=1.0, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=1.0), + ), + EvalMetricResult( + metric_name="m2", threshold=0.5, score=0.6, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=0.6), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c1-r1", + ) + run_2 = EvalCaseResult( + eval_id="c1", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.FAILED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="m1", threshold=0.7, score=0.4, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(score=0.4), + ), + EvalMetricResult( + metric_name="m2", threshold=0.5, score=0.8, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=0.8), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c1-r2", + ) + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.6, + raw_result=_evaluate_result({"c1": [run_1, run_2]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter(num_runs=2) + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "x"} + ) + + assert batch_obj.objective_scores is not None + assert len(batch_obj.objective_scores) == 1 + assert batch_obj.objective_scores[0]["m1"] == pytest.approx(0.7) + assert batch_obj.objective_scores[0]["m2"] == pytest.approx(0.7) + + +def test_evaluate_case_score_separates_candidates_with_same_pass_rate(monkeypatch): + """Two candidates that share the same PASS/FAIL labels on a case but + differ in metric score must end up with different case_scores, so GEPA's + best-candidate selection no longer collapses to ``first-among-ties``.""" + case_a = 
EvalCaseResult( + eval_id="c1", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.FAILED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=0.0, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(score=0.0), + ), + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.66, + score=1.0, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=1.0), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c1-A", + ) + case_b = EvalCaseResult( + eval_id="c1", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.FAILED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=0.0, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(score=0.0), + ), + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.66, + score=0.3333, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(score=0.3333), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c1-B", + ) + + outcome_a = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.5, + raw_result=_evaluate_result({"c1": [case_a]}), + ) + outcome_b = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.16, + raw_result=_evaluate_result({"c1": [case_b]}), + ) + + _patch_run_evaluator(monkeypatch, outcome_a) + adapter = _make_adapter() + score_a = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "candidate_A"} + ).scores[0] + + _patch_run_evaluator(monkeypatch, outcome_b) + score_b = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "candidate_B"} + ).scores[0] + + # Both candidates fail final_response, but candidate A preserves rubric quality. + # Continuous case_score must reflect this difference (binary collapse would + # have tied both at 0.0). 
+ assert score_a > score_b + assert score_a == pytest.approx(0.5) + assert score_b == pytest.approx(0.16665, rel=1e-3) + + +def test_evaluate_with_capture_traces_returns_trajectories(monkeypatch): + cases = [_eval_case("c1")] + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong", reason="not matching")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}, capture_traces=True) + + assert batch_obj.trajectories is not None + assert len(batch_obj.trajectories) == 1 + traj = batch_obj.trajectories[0] + # Trajectory dict now carries only what ``make_reflective_dataset`` + # actually consumes: the score (for filtering), the captured EvalCase / + # case_runs (for rebuilding the Case Body), and an optional + # error_message for the no-runs evaluator-error path. 
+ assert traj["_case"].eval_id == "c1" + assert len(traj["_case_runs"]) == 1 + assert traj["score"] == pytest.approx(0.3) + assert traj["error_message"] is None + + +def test_evaluate_without_capture_traces_returns_no_trajectories(monkeypatch): + cases = [_eval_case("c1")] + outcome = EvaluationOutcome( + pass_rate=1.0, + tiebreaker=0.9, + raw_result=_evaluate_result({"c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}, capture_traces=False) + assert batch_obj.trajectories is None + + +def test_evaluate_handles_empty_raw_result(monkeypatch): + cases = [_eval_case("c1"), _eval_case("c2")] + outcome = EvaluationOutcome(pass_rate=0.0, tiebreaker=0.0, raw_result=None) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}, capture_traces=True) + + assert batch_obj.scores == [0.0, 0.0] + assert batch_obj.outputs == ["", ""] + assert batch_obj.trajectories is not None + assert all( + t["error_message"] == "no result returned" for t in batch_obj.trajectories + ) + + +def test_evaluate_handles_case_missing_from_result(monkeypatch): + cases = [_eval_case("c1"), _eval_case("missing")] + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.5, + raw_result=_evaluate_result({"c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}, capture_traces=True) + + assert batch_obj.scores == pytest.approx([0.9, 0.0]) + assert batch_obj.outputs[1] == "" + assert batch_obj.trajectories is not None + assert ( + batch_obj.trajectories[1]["error_message"] + == "case missing from evaluator result" + ) + + +def 
test_adapter_exposes_propose_new_texts_attribute_as_none(): + # gepa's reflective proposer reads ``adapter.propose_new_texts`` directly; + # the attribute must exist (None signals "use the default reflection LM"). + assert hasattr(_AgentGEPAAdapter, "propose_new_texts") + assert _AgentGEPAAdapter.propose_new_texts is None + + +def test_evaluate_deduplicates_repeated_case_ids_within_batch(monkeypatch): + # gepa's batch sampler pads the minibatch with least-frequent ids when the + # trainset size does not divide the minibatch size, so the same eval_case + # can appear twice in one batch. The evaluator's in-memory manager rejects + # duplicate eval_ids inside an EvalSet, so the adapter must rename repeats. + case = _eval_case("dup") + outcome = EvaluationOutcome( + pass_rate=1.0, + tiebreaker=0.9, + raw_result=_evaluate_result({"dup": [_case_result("dup", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")]}), + ) + captured = _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + adapter.evaluate(batch=[case, case], candidate={"instruction": "x"}) + + payload = captured["evalset_payload_history"][0] + ids = [c["eval_id"] for c in payload["eval_cases"]] + assert len(ids) == 2 + assert len(set(ids)) == 2, f"Duplicate eval_ids must be renamed, got {ids}" + + +def test_evaluate_uses_unique_eval_set_id_per_call(monkeypatch): + case = _eval_case("c1") + outcome = EvaluationOutcome( + pass_rate=1.0, + tiebreaker=0.9, + raw_result=_evaluate_result({"c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")]}), + ) + captured = _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + adapter.evaluate(batch=[case], candidate={"instruction": "v1"}) + adapter.evaluate(batch=[case], candidate={"instruction": "v2"}) + + ids = captured["evalset_id_history"] + assert len(ids) == 2 + assert ids[0] != ids[1], "Each call must use a unique eval_set_id to avoid in-memory manager collisions" + + +def 
test_make_reflective_dataset_collects_failed_cases_only(monkeypatch): + cases = [_eval_case("c1"), _eval_case("c2"), _eval_case("c3")] + outcome = EvaluationOutcome( + pass_rate=1 / 3, + tiebreaker=0.4, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=1.0, actual="ack")], + "c2": [_case_result("c2", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong", reason="bad")], + "c3": [_case_result("c3", status=EvalStatus.FAILED, metric_score=0.4, actual="bad", reason="off")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}, capture_traces=True) + + reflective = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + ) + records = reflective["instruction"] + assert len(records) == 2 + # Turn-sliced schema: case_id, score, Case Body. Other Active Components + # is omitted on single-component candidates. 
+ assert all("case_id" in r for r in records) + assert all("score" in r for r in records) + assert all("Case Body" in r for r in records) + assert all(isinstance(r["Case Body"], str) and r["Case Body"] for r in records) + assert all("Other Active Components" not in r for r in records) + + +def test_make_reflective_dataset_case_body_one_turn_block_per_invocation(monkeypatch): + """Multi-turn case: Case Body contains one ``### Turn N`` block per + invocation, each carrying its own User/Expected lines.""" + multi_turn_case = EvalCase( + eval_id="c_multi_turn", + conversation=[ + _invocation("hello", "hi there"), + _invocation("how are you", "I'm doing fine"), + _invocation("bye", "goodbye"), + ], + ) + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.0, + raw_result=_evaluate_result({ + "c_multi_turn": [_case_result( + "c_multi_turn", status=EvalStatus.FAILED, + metric_score=0.0, actual="wrong", + )], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[multi_turn_case], candidate={"instruction": "x"}, capture_traces=True + ) + records = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"] + + body = records[0]["Case Body"] + assert "### Turn 1" in body + assert "### Turn 2" in body + assert "### Turn 3" in body + assert "**User**: hello" in body + assert "**Expected**: hi there" in body + assert "**User**: how are you" in body + assert "**Expected**: I'm doing fine" in body + assert "**User**: bye" in body + assert "**Expected**: goodbye" in body + + +def test_make_reflective_dataset_case_body_emits_overall_for_multi_turn(monkeypatch): + """Multi-turn case ends with ``### Overall (case-level aggregate)`` so + the reflection LM sees both per-turn verdicts and the case-level roll-up.""" + multi_turn_case = EvalCase( + eval_id="c_multi", + conversation=[ + _invocation("hi", "ack1"), + 
_invocation("again", "ack2"), + ], + ) + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.0, + raw_result=_evaluate_result({ + "c_multi": [_case_result( + "c_multi", status=EvalStatus.FAILED, + metric_score=0.0, actual="wrong", + )], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[multi_turn_case], candidate={"instruction": "x"}, capture_traces=True + ) + body = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0]["Case Body"] + assert "### Overall (case-level aggregate)" in body + + +def test_make_reflective_dataset_case_body_omits_overall_for_single_turn_single_run(monkeypatch): + """Single-turn single-run cases skip the Overall block — Turn 1 already + carries the only verdict, an Overall heading would just repeat it.""" + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "x"}, capture_traces=True + ) + body = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0]["Case Body"] + assert "### Turn 1" in body + assert "### Overall" not in body + + +def test_make_reflective_dataset_case_body_nests_run_blocks_for_multi_run(monkeypatch): + """num_runs > 1: each turn block nests ``#### Run N`` sub-blocks so the + reflection LM sees output variance attributed to the right run, without + repeating the shared User/Expected lines per run.""" + run1 = _case_result( + "c1", status=EvalStatus.FAILED, metric_score=0.0, actual="output_run1" + ) + run1.run_id = 1 + run2 = _case_result( + "c1", 
status=EvalStatus.PASSED, metric_score=1.0, actual="output_run2" + ) + run2.run_id = 2 + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.5, + raw_result=_evaluate_result({"c1": [run1, run2]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter(num_runs=2) + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "x"}, capture_traces=True + ) + body = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0]["Case Body"] + + assert "#### Run 1" in body + assert "#### Run 2" in body + assert "**Agent Response**: output_run1" in body + assert "**Agent Response**: output_run2" in body + # Shared User line appears once at the turn level — not once per run. + assert body.count("**User**: hi") == 1 + # Multi-run cases close with per-run aggregate. + assert "### Overall (per-run aggregate)" in body + + +def test_make_reflective_dataset_case_body_renders_tool_trace_inline(monkeypatch): + """Tool calls render as a single-line ``func(arg=val) → result [id=...]`` + so GEPA's H6 markdown cap does not flatten the call/arg/result hierarchy + when the renderer nests them as headers.""" + from trpc_agent_sdk.evaluation._eval_case import IntermediateData + from trpc_agent_sdk.types import FunctionCall, FunctionResponse + + actual = _invocation("query", "I used search") + actual.intermediate_data = IntermediateData( + tool_uses=[ + FunctionCall(id="call_1", name="search", args={"q": "weather"}), + ], + tool_responses=[ + FunctionResponse(id="call_1", name="search", response={"result": "sunny"}), + ], + ) + + case_result = EvalCaseResult( + eval_id="c_tool", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.FAILED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="m1", threshold=0.7, score=0.3, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(reason="off", score=0.3), + ) + 
], + eval_metric_result_per_invocation=[ + EvalMetricResultPerInvocation( + actual_invocation=actual, + expected_invocation=_invocation("query", "expected"), + eval_metric_results=[], + ) + ], + session_id="sess-c_tool", + ) + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({"c_tool": [case_result]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c_tool")], candidate={"instruction": "x"}, capture_traces=True + ) + body = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0]["Case Body"] + + assert "**Tool Trace**:" in body + assert "search(q='weather')" in body + assert "'sunny'" in body + assert "[id=call_1]" in body + + +def test_make_reflective_dataset_case_body_omits_tool_trace_when_absent(monkeypatch): + """When the agent did not invoke any tool, the Tool Trace section is + absent — keeps the prompt focused on what the agent actually produced.""" + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "x"}, capture_traces=True + ) + body = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0]["Case Body"] + assert "**Tool Trace**:" not in body + + +def test_make_reflective_dataset_record_carries_case_id_and_score(monkeypatch): + """Per-record meta fields case_id and score let the reflection LM + reference a specific case and see the aggregated case-level score + alongside per-metric breakdown.""" + outcome = EvaluationOutcome( + 
pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c_special": [_case_result("c_special", status=EvalStatus.FAILED, metric_score=0.42, actual="wrong")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c_special")], candidate={"instruction": "x"}, capture_traces=True + ) + record = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0] + assert record["case_id"] == "c_special" + assert record["score"] == pytest.approx(0.42) + + +def test_make_reflective_dataset_other_active_components_present_for_multi_component( + monkeypatch, +): + """Multi-component candidate: each record exposes the OTHER prompts' + current text under ``Other Active Components`` so the reflection LM can + avoid restating requirements already enforced by sibling prompts.""" + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong", reason="off")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + target = _multi_component_target_prompt(["system_prompt", "skill_prompt"]) + adapter = _make_adapter(target=target) + candidate = { + "system_prompt": "You are a helpful assistant.", + "skill_prompt": "When asked math, always include units.", + } + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate=candidate, capture_traces=True + ) + reflective = adapter.make_reflective_dataset( + candidate=candidate, + eval_batch=batch_obj, + components_to_update=["system_prompt"], + ) + other_md = reflective["system_prompt"][0]["Other Active Components"] + # The sibling prompt's current body is included. + assert "When asked math, always include units." 
in other_md + assert "### skill_prompt (current)" in other_md + # The target component itself is NOT echoed (GEPA already shows it in its reflection prompt). + assert "system_prompt (current)" not in other_md + + +def test_make_reflective_dataset_other_active_components_absent_for_single_component( + monkeypatch, +): + """Single-component candidate: no ``Other Active Components`` key is + emitted — there is nothing else to surface and the LM should not see an + empty section.""" + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong", reason="off")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "x"}, capture_traces=True + ) + record = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0] + assert "Other Active Components" not in record + + +def test_make_reflective_dataset_other_active_components_rebuilt_per_component( + monkeypatch, +): + """When dispatching to multiple components in the same round, each + component's record set must list the OTHER components' content — i.e. 
+ the ``Other Active Components`` field is rebuilt per component, not + shared across them.""" + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong", reason="off")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + target = _multi_component_target_prompt(["system_prompt", "skill_prompt"]) + adapter = _make_adapter(target=target) + candidate = { + "system_prompt": "SYSTEM BODY", + "skill_prompt": "SKILL BODY", + } + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate=candidate, capture_traces=True + ) + reflective = adapter.make_reflective_dataset( + candidate=candidate, + eval_batch=batch_obj, + components_to_update=["system_prompt", "skill_prompt"], + ) + + sys_other = reflective["system_prompt"][0]["Other Active Components"] + skill_other = reflective["skill_prompt"][0]["Other Active Components"] + + # Each record set surfaces only the sibling component's body. + assert "SKILL BODY" in sys_other + assert "SYSTEM BODY" not in sys_other + assert "SYSTEM BODY" in skill_other + assert "SKILL BODY" not in skill_other + + +def test_make_reflective_dataset_surfaces_evaluator_error_as_case_body(monkeypatch): + """When the evaluator fails to produce runs for a case (e.g. ``case + missing from evaluator result``), the trajectory entry carries an + ``error_message`` and no ``_case_runs``. 
The reflective record must + still appear with that error_message as the Case Body, otherwise the + LM silently loses every failed case where the runtime itself broke.""" + cases = [_eval_case("c_missing")] + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.0, + raw_result=_evaluate_result({}), # no case results at all + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=cases, candidate={"instruction": "x"}, capture_traces=True + ) + record = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0] + assert record["case_id"] == "c_missing" + assert record["score"] == pytest.approx(0.0) + assert "case missing from evaluator result" in record["Case Body"] + + +def test_make_reflective_dataset_returns_empty_for_no_components(): + adapter = _make_adapter() + fake_batch = type("FakeBatch", (), {"trajectories": [{"score": 0.0}]})() + result = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=fake_batch, + components_to_update=[], + ) + assert result == {} + + +def test_make_reflective_dataset_handles_no_trajectories(): + adapter = _make_adapter() + fake_batch = type("FakeBatch", (), {"trajectories": None})() + result = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=fake_batch, + components_to_update=["instruction", "system"], + ) + assert result == {"instruction": [], "system": []} + + +def test_make_reflective_dataset_replicates_records_across_components(monkeypatch): + cases = [_eval_case("c1")] + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong", reason="off")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, 
candidate={"instruction": "x"}, capture_traces=True) + reflective = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction", "react_skill"], + ) + assert "instruction" in reflective + assert "react_skill" in reflective + assert len(reflective["instruction"]) == 1 + assert len(reflective["react_skill"]) == 1 + + +def test_adapter_records_best_history_per_case(): + """After three _record_history calls the buffer keeps the top-2 by score.""" + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=2, + ) + adapter._record_history(case_id="c1", score=0.4, best_response="hello low") + adapter._record_history(case_id="c1", score=0.9, best_response="hello high") + adapter._record_history(case_id="c1", score=0.6, best_response="hello mid") + + history = adapter._best_history["c1"] + assert len(history) == 2 + assert history[0]["score"] == pytest.approx(0.9) + assert history[0]["best_response"] == "hello high" + assert history[1]["score"] == pytest.approx(0.6) + assert history[1]["best_response"] == "hello mid" + + +def test_adapter_top_k_zero_disables_buffer(): + """top_k=0 is the kill switch — _record_history must be a no-op.""" + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + adapter._record_history(case_id="c1", score=0.9, best_response="hello") + + assert adapter._best_history.get("c1", []) == [] + + +def test_evaluate_populates_best_history_buffer(monkeypatch): + """Running evaluate() twice on the same case accumulates history sorted by score.""" + from trpc_agent_sdk.evaluation._optimize_evaluator_call import EvaluationOutcome + + cases = [_eval_case("c1")] + outcome_low = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + 
raw_result=_evaluate_result({ + "c1": [_case_result( + "c1", status=EvalStatus.FAILED, metric_score=0.3, actual="low" + )], + }), + ) + _patch_run_evaluator(monkeypatch, outcome_low) + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=2, + ) + adapter.evaluate( + batch=cases, candidate={"instruction": "x"}, capture_traces=False + ) + + outcome_high = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.8, + raw_result=_evaluate_result({ + "c1": [_case_result( + "c1", status=EvalStatus.FAILED, metric_score=0.8, actual="high" + )], + }), + ) + _patch_run_evaluator(monkeypatch, outcome_high) + adapter.evaluate( + batch=cases, candidate={"instruction": "y"}, capture_traces=False + ) + + history = adapter._best_history["c1"] + assert len(history) == 2 + assert history[0]["score"] == pytest.approx(0.8) + assert history[0]["best_response"] == "high" + assert history[1]["score"] == pytest.approx(0.3) + assert history[1]["best_response"] == "low" + + +def test_make_reflective_dataset_includes_history_top_k_when_buffer_nonempty( + monkeypatch, +): + """When history is seeded and top_k>0, the record carries a history_top_k list.""" + from trpc_agent_sdk.evaluation._optimize_evaluator_call import EvaluationOutcome + + cases = [_eval_case("c1")] + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result( + "c1", status=EvalStatus.FAILED, metric_score=0.3, actual="bad" + )], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=2, + ) + # Seed history with a previous high-score entry the adapter should keep. 
+ adapter._record_history(case_id="c1", score=0.9, best_response="known good") + + batch_obj = adapter.evaluate( + batch=cases, candidate={"instruction": "x"}, capture_traces=True + ) + dataset = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + ) + + records = dataset["instruction"] + assert len(records) == 1 + assert "history_top_k" in records[0] + history = records[0]["history_top_k"] + assert len(history) == 2 # 0.9 seeded + 0.3 from this evaluation + assert history[0]["score"] == pytest.approx(0.9) + assert history[0]["best_response"] == "known good" + assert history[1]["score"] == pytest.approx(0.3) + + +def test_make_reflective_dataset_omits_history_top_k_when_buffer_empty( + monkeypatch, +): + """top_k=0 disables the feature: the record must not carry history_top_k.""" + from trpc_agent_sdk.evaluation._optimize_evaluator_call import EvaluationOutcome + + cases = [_eval_case("c1")] + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result( + "c1", status=EvalStatus.FAILED, metric_score=0.3, actual="bad" + )], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + + batch_obj = adapter.evaluate( + batch=cases, candidate={"instruction": "x"}, capture_traces=True + ) + dataset = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + ) + + records = dataset["instruction"] + assert len(records) == 1 + assert "history_top_k" not in records[0] + + +# --------------------------------------------------------------------------- +# Long-lived event loop: call_agent may hold async resources across evaluate() +# calls without hitting "Event loop is closed" (fix for 
CONC-2). +# --------------------------------------------------------------------------- + + +def test_evaluate_reuses_single_loop_across_calls(monkeypatch) -> None: + """A module-level async resource bound to the loop on first use must + keep working across consecutive evaluate() calls.""" + import asyncio + + outcome = EvaluationOutcome( + pass_rate=1.0, + tiebreaker=1.0, + metric_breakdown={"m1": 1.0}, + failed_case_ids=[], + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=1.0, actual="ok")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + seen_loops: list[int] = [] + + async def call_agent_with_loop_id(query: str) -> str: + # id(loop) stays constant iff the adapter reuses one loop. + seen_loops.append(id(asyncio.get_running_loop())) + return "stub" + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=call_agent_with_loop_id, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + try: + for _ in range(3): + adapter.evaluate( + batch=[_eval_case()], + candidate={"instruction": "v"}, + ) + finally: + adapter.close() + + # _patch_run_evaluator stubs the actual evaluator path so call_agent + # is not driven; verify the same loop is used by inspecting the + # adapter-owned loop directly across calls. 
+ assert adapter._loop is None # closed after close() + + +def test_evaluate_loop_reuse_supports_module_level_async_client(monkeypatch) -> None: + """A user holding a module-level lock-like async resource that binds to + its first event loop must still work across multiple evaluate() calls.""" + import asyncio + + outcome = EvaluationOutcome( + pass_rate=1.0, tiebreaker=1.0, metric_breakdown={"m1": 1.0}, + failed_case_ids=[], + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=1.0, actual="ok")], + }), + ) + + captured_loops: list[int] = [] + # Capture the loop id during write_all (driven by adapter's loop). + target = TargetPrompt() + state = {"value": ""} + + async def read_cb() -> str: + return state["value"] + + async def write_cb(value: str) -> None: + captured_loops.append(id(asyncio.get_running_loop())) + state["value"] = value + + target.add_callback("instruction", read=read_cb, write=write_cb) + + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _AgentGEPAAdapter( + target_prompt=target, + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + try: + for i in range(3): + adapter.evaluate( + batch=[_eval_case()], + candidate={"instruction": f"v{i}"}, + ) + finally: + adapter.close() + + # All write_all invocations executed on the same event loop. 
+ assert len(captured_loops) == 3 + assert len(set(captured_loops)) == 1 + + +def test_close_is_idempotent_and_safe_before_evaluate() -> None: + """close() before any evaluate() and double close() must not raise.""" + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + adapter.close() + adapter.close() + + +def test_evaluate_after_close_creates_fresh_loop(monkeypatch) -> None: + """After close(), a subsequent evaluate() must spin up a new loop + (defensive support for callers that reuse an adapter).""" + import asyncio + + outcome = EvaluationOutcome( + pass_rate=1.0, tiebreaker=1.0, metric_breakdown={"m1": 1.0}, + failed_case_ids=[], + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=1.0, actual="ok")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + try: + adapter.evaluate(batch=[_eval_case()], candidate={"instruction": "v1"}) + old_loop = adapter._loop + first_loop_id = id(old_loop) + adapter.close() + assert adapter._loop is None + adapter.evaluate(batch=[_eval_case()], candidate={"instruction": "v2"}) + assert adapter._loop is not None + assert id(adapter._loop) != first_loop_id + del old_loop + finally: + adapter.close() + + +# --------------------------------------------------------------------------- +# API-A2: call_agent return-type sentinel check (must surface non-str return +# on the first call instead of crashing deep inside metric code). 
+# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_call_agent_returning_non_str_is_rejected_on_first_call(): + """An async callable that returns a non-str value must raise a clear + TypeError on the first invocation, naming the actual returned type. + The check fires through the wrapper installed in _AgentGEPAAdapter.__init__.""" + async def bad_call_agent(query: str): + return 42 # int, not str + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=bad_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + + with pytest.raises(TypeError, match="call_agent must return str"): + await adapter.call_agent("hi") + + +@pytest.mark.asyncio +async def test_call_agent_return_check_runs_only_once(): + """The wrapper must only validate on the first successful call to avoid + per-case overhead. After the first call returns a valid str, later calls + bypass the isinstance check entirely (we cannot directly observe this, + but verify functional correctness: subsequent str returns succeed).""" + call_count = {"n": 0} + + async def good_call_agent(query: str): + call_count["n"] += 1 + return "ok" + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=good_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + + for _ in range(5): + result = await adapter.call_agent("hi") + assert result == "ok" + assert call_count["n"] == 5 + + +@pytest.mark.asyncio +async def test_call_agent_return_check_does_not_swallow_user_exceptions(): + """If call_agent itself raises, the wrapper must propagate the original + exception (not replace it with a TypeError).""" + async def raising_call_agent(query: str): + raise RuntimeError("user-side failure") + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=raising_call_agent, + 
callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + + with pytest.raises(RuntimeError, match="user-side failure"): + await adapter.call_agent("hi") diff --git a/tests/evaluation/test_optimize_gepa_callback.py b/tests/evaluation/test_optimize_gepa_callback.py new file mode 100644 index 00000000..80cde05f --- /dev/null +++ b/tests/evaluation/test_optimize_gepa_callback.py @@ -0,0 +1,667 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for _AgentGEPACallback buffering iteration events as RoundRecords.""" + +from __future__ import annotations + +from trpc_agent_sdk.evaluation._optimize_gepa_callback import _AgentGEPACallback +from trpc_agent_sdk.evaluation._optimize_gepa_callback import _translate_skip_reason + + +def _trigger_iteration( + callback: _AgentGEPACallback, + *, + iteration: int, + candidate: dict, + average_score: float, + is_best: bool = False, + proposal_accepted: bool = False, +) -> None: + callback.on_iteration_start({"iteration": iteration, "state": None, "trainset_loader": None}) + callback.on_valset_evaluated( + { + "iteration": iteration, + "candidate_idx": iteration, + "candidate": candidate, + "scores_by_val_id": {}, + "average_score": average_score, + "num_examples_evaluated": 1, + "total_valset_size": 1, + "parent_ids": [], + "is_best_program": is_best, + "outputs_by_val_id": None, + } + ) + callback.on_iteration_end( + {"iteration": iteration, "state": None, "proposal_accepted": proposal_accepted} + ) + + +def test_callback_starts_with_empty_buffer(): + callback = _AgentGEPACallback() + assert callback.rounds == [] + assert callback.baseline_metric_breakdown == {} + assert callback.baseline_failed_case_ids == [] + assert callback.baseline_pass_rate == 0.0 + + +def test_callback_captures_seed_evaluation_into_baseline_not_rounds(): + """gepa emits ``iteration == 0`` exactly 
once for the seed candidate. + Callback must store it as baseline rather than appending a RoundRecord.""" + + class _StubOutcome: + metric_breakdown = {"final_response_avg_score": 0.42} + failed_case_ids = ["case-2"] + + class _StubAdapter: + last_outcome = _StubOutcome() + + callback = _AgentGEPACallback(adapter=_StubAdapter()) + callback.on_valset_evaluated( + { + "iteration": 0, + "candidate_idx": 0, + "candidate": {"instruction": "baseline"}, + "scores_by_val_id": {}, + "average_score": 0.42, + "num_examples_evaluated": 1, + "total_valset_size": 1, + "parent_ids": [], + "is_best_program": True, + "outputs_by_val_id": None, + } + ) + + assert callback.rounds == [] + assert callback.baseline_metric_breakdown == {"final_response_avg_score": 0.42} + assert callback.baseline_failed_case_ids == ["case-2"] + assert callback.baseline_pass_rate == 0.42 + + +def test_callback_records_one_round_per_iteration(): + callback = _AgentGEPACallback() + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.6, + is_best=False, + proposal_accepted=False, + ) + _trigger_iteration( + callback, + iteration=2, + candidate={"instruction": "v2"}, + average_score=0.9, + is_best=True, + proposal_accepted=True, + ) + + assert len(callback.rounds) == 2 + assert callback.rounds[0].round == 1 + assert callback.rounds[0].validation_pass_rate == 0.6 + assert callback.rounds[0].candidate_prompts == {"instruction": "v1"} + assert callback.rounds[0].accepted is False + + assert callback.rounds[1].round == 2 + assert callback.rounds[1].validation_pass_rate == 0.9 + assert callback.rounds[1].candidate_prompts == {"instruction": "v2"} + assert callback.rounds[1].accepted is True + + +def test_callback_acceptance_via_proposal_accepted_only(): + """proposal_accepted=True alone should mark the round accepted.""" + callback = _AgentGEPACallback() + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.7, + 
is_best=False, + proposal_accepted=True, + ) + assert callback.rounds[0].accepted is True + + +def test_callback_acceptance_follows_proposal_accepted_only(): + """A candidate flagged is_best_program=True without proposal_accepted=True + must not be reported as accepted: the user-facing "accepted" status follows + gepa's proposal_accepted contract so the timeline matches gepa's own + acceptance log. + """ + callback = _AgentGEPACallback() + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.7, + is_best=True, + proposal_accepted=False, + ) + assert callback.rounds[0].accepted is False + + +def test_callback_always_emits_record_even_when_valset_not_evaluated(): + """Iterations rejected by the subsample gate still get a RoundRecord so + round indices in the reporter stay contiguous with gepa iterations. + """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + callback.on_iteration_end({"iteration": 1, "state": None, "proposal_accepted": False}) + assert len(callback.rounds) == 1 + record = callback.rounds[0] + assert record.round == 1 + assert record.skip_reason == "reflect-LM produced no usable new prompt" + assert record.candidate_prompts == {} + assert record.accepted is False + + +def test_callback_records_candidate_field_names_falls_back_to_candidate_keys(): + """Without an ``on_proposal_end`` event (e.g. merge round, or any path + that bypasses the reflective proposer), ``optimized_field_names`` + falls back to the full candidate key set so result.json never + surfaces an empty list when a candidate exists. 
+ """ + callback = _AgentGEPACallback() + _trigger_iteration( + callback, + iteration=1, + candidate={"system": "s1", "react": "r1"}, + average_score=0.5, + ) + assert set(callback.rounds[0].optimized_field_names) == {"system", "react"} + + +def test_callback_optimized_field_names_uses_proposal_end_components_only(): + """F-3: when ``on_proposal_end`` fires, ``optimized_field_names`` must + reflect ONLY the components rewritten by the reflection LM this + round (gepa's RoundRobin / random component selectors mutate a + subset of the candidate's components per iteration). + + Previously the field reported the full ``candidate.keys()`` list, + misleading users into thinking every component was rewritten each + round when only one (or a subset) actually was. + """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + # gepa's RoundRobin selector picked only "dim_intent" this round; the + # reflection LM produced exactly one new instruction. + callback.on_proposal_end( + { + "iteration": 1, + "new_instructions": {"dim_intent": "rewritten intent prompt"}, + "subsample_scores_before": [0.5], + "subsample_scores_after": [0.7], + } + ) + callback.on_valset_evaluated( + { + "iteration": 1, + "candidate": { + "system": "s1", + "dim_intent": "rewritten intent prompt", + "dim_slot": "s2", + "dim_response": "r1", + "dim_summary": "su1", + }, + "average_score": 0.7, + "is_best_program": False, + } + ) + callback.on_iteration_end({"iteration": 1, "state": None, "proposal_accepted": True}) + + assert callback.rounds[0].optimized_field_names == ["dim_intent"] + # candidate_prompts still carries the full candidate (used elsewhere + # for ``best_prompts`` etc.); only the "what was changed this round" + # metadata is narrowed. 
+ assert set(callback.rounds[0].candidate_prompts.keys()) == { + "system", + "dim_intent", + "dim_slot", + "dim_response", + "dim_summary", + } + + +def test_callback_optimized_field_names_resets_between_iterations(): + """``_iter_changed_components`` must reset on ``on_iteration_start``; + a proposal event in iteration N must not leak into iteration N+1's + ``optimized_field_names`` when the next iteration has no proposal + event of its own (e.g. a merge round following a reflective round). + """ + callback = _AgentGEPACallback() + + # Iteration 1: reflective round, only "dim_intent" rewritten. + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + callback.on_proposal_end( + {"iteration": 1, "new_instructions": {"dim_intent": "v1"}} + ) + callback.on_valset_evaluated( + { + "iteration": 1, + "candidate": {"dim_intent": "v1", "dim_slot": "s0"}, + "average_score": 0.6, + "is_best_program": False, + } + ) + callback.on_iteration_end({"iteration": 1, "state": None, "proposal_accepted": True}) + + # Iteration 2: merge round — no on_proposal_end, must fall back to + # full candidate keys, NOT reuse iteration 1's ["dim_intent"]. 
+ callback.on_iteration_start({"iteration": 2, "state": None, "trainset_loader": None}) + callback.on_merge_attempted({"iteration": 2}) + callback.on_valset_evaluated( + { + "iteration": 2, + "candidate": {"dim_intent": "v1", "dim_slot": "s0"}, + "average_score": 0.65, + "is_best_program": False, + } + ) + callback.on_iteration_end({"iteration": 2, "state": None, "proposal_accepted": True}) + + assert callback.rounds[0].optimized_field_names == ["dim_intent"] + assert set(callback.rounds[1].optimized_field_names) == {"dim_intent", "dim_slot"} + assert callback.rounds[1].kind == "merge" + + +# --------------------------------------------------------------------------- +# on_evaluation_end: parent / candidate subsample-score routing (F-5) +# --------------------------------------------------------------------------- +# +# gepa marks the post-mutation / post-merge eval with ``candidate_idx=None``; +# every other evaluation_end carries an int ``candidate_idx`` and represents +# the parent / current-program eval. Earlier seq-based logic misclassified +# rounds where the reflective proposer picked the seed program (id=0) as +# parent because gepa flags that parent eval with ``is_seed_candidate=True``. + + +def test_on_evaluation_end_records_parent_then_candidate_normal_round(): + """Normal reflective round: parent eval first (int idx), then new + candidate eval (idx=None). Both scores must land on the right slots. + """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 2, "state": None, "trainset_loader": None}) + # Parent eval (curr_prog_id=5, NOT seed). Use scores that average + # exactly to 0.5 so the assertion is float-safe. + callback.on_evaluation_end( + { + "iteration": 2, + "candidate_idx": 5, + "scores": [0.4, 0.5, 0.6], + "is_seed_candidate": False, + } + ) + # New candidate eval (post-mutation, candidate_idx=None). 
+ callback.on_evaluation_end( + { + "iteration": 2, + "candidate_idx": None, + "scores": [0.8, 0.9, 1.0], + "is_seed_candidate": False, + } + ) + assert callback._iter_train_parent_score == 0.5 # noqa: SLF001 + assert callback._iter_train_candidate_score == 0.9 # noqa: SLF001 + assert callback._iter_train_minibatch_size == 3 # noqa: SLF001 + + +def test_on_evaluation_end_records_correctly_when_parent_is_seed(): + """F-5 regression: when reflective_mutation picks the seed program + (id=0) as parent, the parent eval is flagged ``is_seed_candidate=True``. + Earlier logic dropped that event and shifted the candidate score + into the parent slot — verify the new ``candidate_idx``-based routing + keeps the slots correct. + """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + # Parent eval — parent IS the seed program. gepa sets is_seed_candidate=True + # here (reflective_mutation.py:283). + callback.on_evaluation_end( + { + "iteration": 1, + "candidate_idx": 0, + "scores": [0.5, 0.5], + "is_seed_candidate": True, + } + ) + # New candidate eval. + callback.on_evaluation_end( + { + "iteration": 1, + "candidate_idx": None, + "scores": [0.9, 0.9], + "is_seed_candidate": False, + } + ) + # Parent slot must carry the seed score, NOT the candidate score. + assert callback._iter_train_parent_score == 0.5 # noqa: SLF001 + assert callback._iter_train_candidate_score == 0.9 # noqa: SLF001 + + +def test_on_evaluation_end_merge_round_only_candidate_score(): + """Merge round emits exactly one evaluation_end with ``candidate_idx=None`` + (merge.py:376). Parent slot must stay None — merge has two parents, + a single ``parent_score`` doesn't apply. 
+ """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 3, "state": None, "trainset_loader": None}) + callback.on_merge_attempted({"iteration": 3}) + callback.on_evaluation_end( + { + "iteration": 3, + "candidate_idx": None, + "scores": [0.7, 0.7, 0.7, 0.7], + "is_seed_candidate": False, + } + ) + assert callback._iter_train_parent_score is None # noqa: SLF001 + assert callback._iter_train_candidate_score == 0.7 # noqa: SLF001 + + +def test_on_evaluation_end_skips_empty_scores(): + """Empty scores carry no information — leave both slots untouched.""" + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + callback.on_evaluation_end( + { + "iteration": 1, + "candidate_idx": 5, + "scores": [], + "is_seed_candidate": False, + } + ) + callback.on_evaluation_end( + { + "iteration": 1, + "candidate_idx": None, + "scores": None, + "is_seed_candidate": False, + } + ) + assert callback._iter_train_parent_score is None # noqa: SLF001 + assert callback._iter_train_candidate_score is None # noqa: SLF001 + + +def test_on_evaluation_end_minibatch_size_set_from_parent_when_unset(): + """When ``on_minibatch_sampled`` did not fire (or fired with empty + list), the parent eval's ``len(scores)`` is the next-best signal for + the round's minibatch size. + """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + # No on_minibatch_sampled — minibatch size starts at 0. 
+ assert callback._iter_train_minibatch_size == 0 # noqa: SLF001 + callback.on_evaluation_end( + { + "iteration": 1, + "candidate_idx": 5, + "scores": [0.4, 0.5, 0.6], # 3 cases + "is_seed_candidate": False, + } + ) + assert callback._iter_train_minibatch_size == 3 # noqa: SLF001 + + +def test_on_evaluation_end_does_not_overwrite_minibatch_size_from_sampled(): + """If ``on_minibatch_sampled`` already set the minibatch size, + parent eval's score count must NOT clobber it (the sampled event is + authoritative — it counts the FULL minibatch even when the eval + short-circuits a subset). + """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + callback.on_minibatch_sampled( + {"iteration": 1, "minibatch_ids": ["a", "b", "c", "d", "e"], "trainset_size": 5} + ) + assert callback._iter_train_minibatch_size == 5 # noqa: SLF001 + # Parent eval somehow only scored 2 cases — minibatch_size stays 5. + callback.on_evaluation_end( + { + "iteration": 1, + "candidate_idx": 5, + "scores": [0.4, 0.5], + "is_seed_candidate": False, + } + ) + assert callback._iter_train_minibatch_size == 5 # noqa: SLF001 + + +def test_callback_records_duration_seconds_non_negative(): + callback = _AgentGEPACallback() + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.5, + ) + assert callback.rounds[0].duration_seconds >= 0.0 + + +def test_callback_reasoning_includes_score(): + callback = _AgentGEPACallback() + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.8765, + is_best=True, + ) + assert "0.8765" in callback.rounds[0].acceptance_reason + + +class _FakeOutcome: + def __init__(self, metric_breakdown: dict, failed_case_ids: list) -> None: + self.metric_breakdown = metric_breakdown + self.failed_case_ids = failed_case_ids + + +class _FakeAdapter: + def __init__(self, outcome: _FakeOutcome) -> None: + self.last_outcome = outcome 
+ + +class _FakeReflectionLM: + def __init__(self) -> None: + self.total_calls = 0 + self.total_cost = 0.0 + self.total_token_usage = {"prompt": 0, "completion": 0, "total": 0} + + def make_call(self, prompt_tokens: int = 10, completion_tokens: int = 5, cost: float = 0.01) -> None: + self.total_calls += 1 + self.total_cost += cost + self.total_token_usage["prompt"] += prompt_tokens + self.total_token_usage["completion"] += completion_tokens + self.total_token_usage["total"] += prompt_tokens + completion_tokens + + +def test_callback_pulls_metric_breakdown_and_failures_from_adapter(): + """B2: when adapter is supplied, callback fills metric_breakdown / failed_case_ids.""" + outcome = _FakeOutcome( + metric_breakdown={"m1": 0.7, "m2": 0.4}, + failed_case_ids=["c3", "c5"], + ) + adapter = _FakeAdapter(outcome) + callback = _AgentGEPACallback(adapter=adapter) + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.55, + ) + assert callback.rounds[0].metric_breakdown == {"m1": 0.7, "m2": 0.4} + assert callback.rounds[0].failed_case_ids == ["c3", "c5"] + + +def test_callback_records_per_round_reflection_lm_call_delta(): + """Reflection-LM calls/cost/tokens between iteration_start and iteration_end + should land on the produced RoundRecord.""" + lm = _FakeReflectionLM() + callback = _AgentGEPACallback(reflection_lm=lm) + + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + lm.make_call(prompt_tokens=20, completion_tokens=10, cost=0.02) + lm.make_call(prompt_tokens=15, completion_tokens=8, cost=0.015) + callback.on_valset_evaluated( + { + "iteration": 1, + "candidate": {"instruction": "v1"}, + "average_score": 0.7, + "is_best_program": False, + } + ) + callback.on_iteration_end({"iteration": 1, "state": None, "proposal_accepted": False}) + + record = callback.rounds[0] + assert record.reflection_lm_calls == 2 + assert record.round_llm_cost == 0.035 + assert record.round_token_usage == 
{"prompt": 35, "completion": 18, "total": 53} + + +def test_translate_skip_reason_handles_known_strings(): + assert ( + _translate_skip_reason("no_trajectories") + == "no trajectories captured this round" + ) + assert ( + _translate_skip_reason("all_scores_perfect") + == "minibatch already perfect (skip_perfect_score on)" + ) + # Whitespace / case / dash normalisation. + assert ( + _translate_skip_reason("All-Scores-Perfect") + == "minibatch already perfect (skip_perfect_score on)" + ) + assert ( + _translate_skip_reason(" no_trajectories ") + == "no trajectories captured this round" + ) + + +def test_translate_skip_reason_surfaces_unknown_strings_under_prefix(): + translated = _translate_skip_reason("some_brand_new_reason") + assert translated is not None + assert translated.startswith("gepa-internal:") + assert "some_brand_new_reason" in translated + + +def test_translate_skip_reason_returns_none_for_empty_or_missing(): + assert _translate_skip_reason(None) is None + assert _translate_skip_reason("") is None + assert _translate_skip_reason(" ") is None + + +def test_callback_translates_skip_reason_via_on_evaluation_skipped(): + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + callback.on_evaluation_skipped({"reason": "all_scores_perfect"}) + callback.on_iteration_end({"iteration": 1, "state": None, "proposal_accepted": False}) + assert ( + callback.rounds[0].skip_reason + == "minibatch already perfect (skip_perfect_score on)" + ) + + +def test_callback_translates_no_trajectories_skip_reason(): + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + callback.on_evaluation_skipped({"reason": "no_trajectories"}) + callback.on_iteration_end({"iteration": 1, "state": None, "proposal_accepted": False}) + assert callback.rounds[0].skip_reason == "no trajectories captured this round" + + +def 
test_callback_uses_no_proposal_fallback_when_no_event_observed(): + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + callback.on_iteration_end({"iteration": 1, "state": None, "proposal_accepted": False}) + assert callback.rounds[0].skip_reason == "reflect-LM produced no usable new prompt" + + +# --------------------------------------------------------------------------- +# on_valset_breakdown plumb-through (Framework stop policy) +# --------------------------------------------------------------------------- + + +def test_callback_invokes_on_valset_breakdown_for_candidate_iteration(): + received: list[dict] = [] + outcome = _FakeOutcome( + metric_breakdown={"m1": 0.6, "m2": 0.4}, + failed_case_ids=[], + ) + callback = _AgentGEPACallback( + adapter=_FakeAdapter(outcome), + on_valset_breakdown=lambda bd: received.append(bd), + ) + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.5, + ) + assert received == [{"m1": 0.6, "m2": 0.4}] + + +def test_callback_invokes_on_valset_breakdown_for_baseline_iteration(): + received: list[dict] = [] + outcome = _FakeOutcome( + metric_breakdown={"m1": 0.7}, + failed_case_ids=[], + ) + callback = _AgentGEPACallback( + adapter=_FakeAdapter(outcome), + on_valset_breakdown=lambda bd: received.append(bd), + ) + callback.on_valset_evaluated( + { + "iteration": 0, + "candidate": {"instruction": "baseline"}, + "average_score": 0.7, + "is_best_program": True, + } + ) + assert received == [{"m1": 0.7}] + + +def test_callback_does_not_invoke_on_valset_breakdown_for_skip_without_eval(): + """Iterations that skip valset evaluation (e.g. 
subsample gate rejected the + candidate) must not push stale breakdowns to the stopper.""" + received: list[dict] = [] + callback = _AgentGEPACallback( + on_valset_breakdown=lambda bd: received.append(bd), + ) + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + callback.on_evaluation_skipped({"reason": "all_scores_perfect"}) + callback.on_iteration_end({"iteration": 1, "state": None, "proposal_accepted": False}) + assert received == [] + + +def test_callback_on_valset_breakdown_none_is_safe_default(): + callback = _AgentGEPACallback() + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.5, + ) + assert callback.rounds[0].validation_pass_rate == 0.5 + + +def test_callback_on_valset_breakdown_exceptions_do_not_break_loop(): + def _boom(_bd): + raise RuntimeError("boom") + + outcome = _FakeOutcome(metric_breakdown={"m1": 0.5}, failed_case_ids=[]) + callback = _AgentGEPACallback( + adapter=_FakeAdapter(outcome), + on_valset_breakdown=_boom, + ) + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.5, + ) + assert len(callback.rounds) == 1 diff --git a/tests/evaluation/test_optimize_gepa_e2e.py b/tests/evaluation/test_optimize_gepa_e2e.py new file mode 100644 index 00000000..f225a762 --- /dev/null +++ b/tests/evaluation/test_optimize_gepa_e2e.py @@ -0,0 +1,210 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""End-to-end registration test: config -> registry.get -> instantiate -> run.""" + +from __future__ import annotations + +from typing import Optional + +import pytest + +from trpc_agent_sdk.evaluation._eval_case import EvalCase +from trpc_agent_sdk.evaluation._eval_case import Invocation +from trpc_agent_sdk.evaluation._eval_config import EvalConfig +from trpc_agent_sdk.evaluation._eval_set import EvalSet +from trpc_agent_sdk.evaluation._optimize_config import GepaReflectiveAlgo +from trpc_agent_sdk.evaluation._optimize_config import OptimizeConfig +from trpc_agent_sdk.evaluation._optimize_config import OptimizeConfigFile +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import GepaReflectiveOptimizer +from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions +from trpc_agent_sdk.evaluation._optimize_registry import OPTIMIZER_REGISTRY +from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + + +# --------------------------------------------------------------------------- +# Fixtures shared by the e2e flow +# --------------------------------------------------------------------------- + + +def _invocation(user_text: str, response_text: Optional[str] = None) -> Invocation: + final_response = ( + Content(role="model", parts=[Part.from_text(text=response_text)]) + if response_text is not None + else None + ) + return Invocation( + user_content=Content(role="user", parts=[Part.from_text(text=user_text)]), + final_response=final_response, + ) + + +def _eval_case(eval_id: str = "c1") -> EvalCase: + return EvalCase(eval_id=eval_id, conversation=[_invocation("hi", "ack")]) + + +async def _stub_call_agent(query: str) -> str: + return "stub" + + +def _new_target_prompt(recorder: Optional[dict[str, str]] = None) -> TargetPrompt: + target = TargetPrompt() + state = recorder if recorder is not None else {} + + async def read_cb() -> str: + return 
state.get("instruction", "initial") + + async def write_cb(value: str) -> None: + state["instruction"] = value + + target.add_callback("instruction", read=read_cb, write=write_cb) + return target + + +def _make_config() -> OptimizeConfigFile: + return OptimizeConfigFile( + evaluate=EvalConfig( + metrics=[{"metric_name": "m1", "threshold": 0.7}], + num_runs=1, + ), + optimize=OptimizeConfig( + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions( + provider_name="openai", + model_name="gpt-4o", + api_key="test-key", + ), + max_metric_calls=30, + ), + ), + ) + + +class _FakeGEPAResult: + def __init__(self, candidates, val_scores): + self.candidates = candidates + self.val_aggregate_scores = val_scores + self.parents = [[None]] + [[i - 1] for i in range(1, len(candidates))] + self.discovery_eval_counts = [0] * len(candidates) + self.total_metric_calls = 0 + self.best_outputs_valset = None + + @property + def best_idx(self) -> int: + return max( + range(len(self.val_aggregate_scores)), + key=lambda i: self.val_aggregate_scores[i], + ) + + +# --------------------------------------------------------------------------- +# Registration contract: importing evaluation package registers algorithms +# --------------------------------------------------------------------------- + + +def test_evaluation_package_import_registers_gepa_reflective(): + """Importing the evaluation package triggers algorithm registration. + + Business code only needs ``import trpc_agent_sdk.evaluation`` to make + ``OPTIMIZER_REGISTRY.get("gepa_reflective")`` work; algorithm modules do + NOT register themselves as a side-effect of bare ``_optimize_gepa_*`` + imports. 
+ """ + import trpc_agent_sdk.evaluation # noqa: F401 triggers registrations + + assert "gepa_reflective" in OPTIMIZER_REGISTRY.list_registered() + assert OPTIMIZER_REGISTRY.get("gepa_reflective") is GepaReflectiveOptimizer + + +def test_registry_lookup_unknown_algorithm_lists_available(): + import trpc_agent_sdk.evaluation # noqa: F401 + + with pytest.raises(ValueError) as exc_info: + OPTIMIZER_REGISTRY.get("not_a_real_algorithm") + + msg = str(exc_info.value) + assert "not_a_real_algorithm" in msg + assert "gepa_reflective" in msg + + +# --------------------------------------------------------------------------- +# End-to-end flow: config -> registry.get -> instantiate -> run +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_e2e_config_to_run_succeeds(tmp_path, monkeypatch): + """Simulate the business-side entry point: + + 1. Parse OptimizeConfigFile (algorithm.name = "gepa_reflective"). + 2. Look up class via OPTIMIZER_REGISTRY.get(name). + 3. Instantiate with the supplied call_agent / target_prompt / datasets. + 4. await optimizer.run() → OptimizeResult with status="SUCCEEDED". 
+ """ + import trpc_agent_sdk.evaluation # noqa: F401 + + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + config = _make_config() + recorder: dict[str, str] = {} + target = _new_target_prompt(recorder) + + algorithm_cls = OPTIMIZER_REGISTRY.get(config.optimize.algorithm.name) + optimizer = algorithm_cls( + config=config, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await optimizer.run() + + assert result.status == "SUCCEEDED" + assert result.best_pass_rate == pytest.approx(0.9) + assert result.best_prompts == {"instruction": "improved"} + # BaseOptimizer.run() never writes back; write-back is owned by the + # AgentOptimizer facade and gated by ``update_source``. 
+ assert result.best_prompts["instruction"] == "improved" + + +@pytest.mark.asyncio +async def test_e2e_registry_returns_instantiable_class(): + """Class returned by registry can be instantiated with the standard kwargs.""" + import trpc_agent_sdk.evaluation # noqa: F401 + + config = _make_config() + target = _new_target_prompt() + + cls = OPTIMIZER_REGISTRY.get("gepa_reflective") + instance = cls( + config=config, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path="/tmp/train.json", + validation_dataset_path="/tmp/val.json", + ) + + assert isinstance(instance, GepaReflectiveOptimizer) + assert instance.config is config + assert instance.target_prompt is target diff --git a/tests/evaluation/test_optimize_gepa_reflective.py b/tests/evaluation/test_optimize_gepa_reflective.py new file mode 100644 index 00000000..1166c19a --- /dev/null +++ b/tests/evaluation/test_optimize_gepa_reflective.py @@ -0,0 +1,1628 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Tests for GepaReflectiveOptimizer and its GEPAResult->OptimizeResult helpers.""" + +from __future__ import annotations + +from datetime import datetime +from datetime import timezone +from typing import Optional + +import pytest + +from trpc_agent_sdk.evaluation._eval_case import EvalCase +from trpc_agent_sdk.evaluation._eval_case import Invocation +from trpc_agent_sdk.evaluation._eval_config import EvalConfig +from trpc_agent_sdk.evaluation._eval_set import EvalSet +from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig +from trpc_agent_sdk.evaluation._optimize_config import GepaReflectiveAlgo +from trpc_agent_sdk.evaluation._optimize_config import OptimizeConfig +from trpc_agent_sdk.evaluation._optimize_config import OptimizeConfigFile +from trpc_agent_sdk.evaluation._optimize_gepa_adapter import _AgentGEPAAdapter +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import GepaReflectiveOptimizer +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import ( + _RequiredMetricsAboveThresholdStopper, +) +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _LabeledStopper +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _build_failed_result +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _build_optimize_result +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _build_stop_callbacks +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _classify_stop_reason +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _load_evalset_cases +from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions +from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + + +# --------------------------------------------------------------------------- +# Shared fixtures and helpers +# --------------------------------------------------------------------------- + + +def 
_invocation(user_text: str, response_text: Optional[str] = None) -> Invocation: + final_response = ( + Content(role="model", parts=[Part.from_text(text=response_text)]) + if response_text is not None + else None + ) + return Invocation( + user_content=Content(role="user", parts=[Part.from_text(text=user_text)]), + final_response=final_response, + ) + + +def _eval_case(eval_id: str = "c1", user: str = "hi", expected: str = "ack") -> EvalCase: + return EvalCase( + eval_id=eval_id, + conversation=[_invocation(user, expected)], + ) + + +async def _stub_call_agent(query: str) -> str: + return "stub" + + +def _new_target_prompt(write_recorder: Optional[dict[str, str]] = None) -> TargetPrompt: + target = TargetPrompt() + recorder = write_recorder if write_recorder is not None else {} + + async def read_cb() -> str: + return recorder.get("instruction", "initial") + + async def write_cb(value: str) -> None: + recorder["instruction"] = value + + target.add_callback("instruction", read=read_cb, write=write_cb) + return target + + +class _FakeGEPAResult: + """Minimal stand-in for gepa.core.result.GEPAResult used by mapping tests.""" + + def __init__( + self, + *, + candidates, + val_aggregate_scores, + parents=None, + discovery_eval_counts=None, + total_metric_calls=None, + best_outputs_valset=None, + per_objective_best_candidates=None, + ): + self.candidates = candidates + self.val_aggregate_scores = val_aggregate_scores + self.parents = parents or [[None]] + [[i - 1] for i in range(1, len(candidates))] + self.discovery_eval_counts = discovery_eval_counts or [0] * len(candidates) + self.total_metric_calls = total_metric_calls + self.best_outputs_valset = best_outputs_valset + # GEPA's actual GEPAResult field is dict[str, set[int]] | None + self.per_objective_best_candidates = per_objective_best_candidates + + @property + def best_idx(self) -> int: + return max(range(len(self.val_aggregate_scores)), key=lambda i: self.val_aggregate_scores[i]) + + @property + def 
best_candidate(self): + return self.candidates[self.best_idx] + + +def _make_config(*, max_metric_calls: int = 30, **algo_overrides) -> OptimizeConfigFile: + return OptimizeConfigFile( + evaluate=EvalConfig( + metrics=[{"metric_name": "m1", "threshold": 0.7}], + num_runs=1, + ), + optimize=OptimizeConfig( + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions( + provider_name="openai", + model_name="gpt-4o", + api_key="test-key", + ), + max_metric_calls=max_metric_calls, + **algo_overrides, + ), + ), + ) + + +def _make_optimizer(target=None, train_path="/tmp/train.json", val_path="/tmp/val.json"): + target = target or _new_target_prompt() + return GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=train_path, + validation_dataset_path=val_path, + ) + + +# --------------------------------------------------------------------------- +# _load_evalset_cases +# --------------------------------------------------------------------------- + + +def test_load_evalset_cases_reads_from_evalset_json(tmp_path): + evalset = EvalSet( + eval_set_id="train", + eval_cases=[_eval_case("c1"), _eval_case("c2")], + ) + file_path = tmp_path / "train.evalset.json" + file_path.write_text(evalset.model_dump_json(), encoding="utf-8") + + cases = _load_evalset_cases(str(file_path)) + assert len(cases) == 2 + assert {c.eval_id for c in cases} == {"c1", "c2"} + + +def test_load_evalset_cases_raises_for_missing_file(): + with pytest.raises(FileNotFoundError): + _load_evalset_cases("/nonexistent/path.json") + + +# --------------------------------------------------------------------------- +# _build_stop_callbacks +# --------------------------------------------------------------------------- + + +def _disabled_stop_cfg() -> FrameworkStopConfig: + return FrameworkStopConfig(required_metrics=None) + + +def test_build_stop_callbacks_includes_each_configured_stopper(): + """One stopper 
instance per configured stop field; unset fields stay off. + + Every gepa stopper is wrapped by ``_LabeledStopper`` so the optimizer can + classify ``stop_reason`` after gepa returns; the inner gepa class is + reached via ``stopper._inner`` and the label is exposed via + ``stopper.label``. + """ + pytest.importorskip("gepa") + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + max_metric_calls=10, + max_iterations_without_improvement=3, + timeout_seconds=60.0, + score_threshold=0.95, + max_candidate_proposals=20, + max_tracked_candidates=12, + ) + stoppers, framework_stopper = _build_stop_callbacks( + algo, _disabled_stop_cfg(), {} + ) + assert framework_stopper is None + labeled_stoppers = [s for s in stoppers if isinstance(s, _LabeledStopper)] + inner_class_names = {type(s._inner).__name__ for s in labeled_stoppers} + assert "MaxMetricCallsStopper" in inner_class_names + assert "NoImprovementStopper" in inner_class_names + assert "TimeoutStopCondition" in inner_class_names + assert "ScoreThresholdStopper" in inner_class_names + assert "MaxCandidateProposalsStopper" in inner_class_names + assert "MaxTrackedCandidatesStopper" in inner_class_names + + +def test_build_stop_callbacks_emits_only_configured_stoppers(): + pytest.importorskip("gepa") + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + timeout_seconds=30.0, + ) + stoppers, framework_stopper = _build_stop_callbacks( + algo, _disabled_stop_cfg(), {} + ) + assert framework_stopper is None + assert len(stoppers) == 1 + assert isinstance(stoppers[0], _LabeledStopper) + assert type(stoppers[0]._inner).__name__ == "TimeoutStopCondition" + assert stoppers[0].label == "timeout" + + +def test_build_stop_callbacks_adds_required_metrics_stopper_for_all(): + pytest.importorskip("gepa") + algo = GepaReflectiveAlgo( + name="gepa_reflective", + 
reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + max_metric_calls=10, + ) + stoppers, framework_stopper = _build_stop_callbacks( + algo, + FrameworkStopConfig(required_metrics="all"), + {"m1": 0.5, "m2": 0.3}, + ) + assert isinstance(framework_stopper, _RequiredMetricsAboveThresholdStopper) + assert framework_stopper in stoppers + assert framework_stopper._thresholds == {"m1": 0.5, "m2": 0.3} + + +def test_build_stop_callbacks_adds_required_metrics_stopper_for_subset_list(): + pytest.importorskip("gepa") + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + max_metric_calls=10, + ) + stoppers, framework_stopper = _build_stop_callbacks( + algo, + FrameworkStopConfig(required_metrics=["m1"]), + {"m1": 0.5, "m2": 0.3}, + ) + assert isinstance(framework_stopper, _RequiredMetricsAboveThresholdStopper) + assert framework_stopper._thresholds == {"m1": 0.5} + + +def test_build_stop_callbacks_skips_framework_stopper_when_disabled(): + pytest.importorskip("gepa") + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + max_metric_calls=10, + ) + stoppers, framework_stopper = _build_stop_callbacks( + algo, + FrameworkStopConfig(required_metrics=None), + {"m1": 0.5}, + ) + assert framework_stopper is None + assert all( + not isinstance(s, _RequiredMetricsAboveThresholdStopper) for s in stoppers + ) + + +def test_build_stop_callbacks_skips_framework_stopper_when_thresholds_empty(): + """Even with required_metrics='all', if metric_thresholds is empty the + resolved subset is empty and the stopper would be a no-op; skip it.""" + pytest.importorskip("gepa") + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + max_metric_calls=10, + ) + stoppers, framework_stopper = _build_stop_callbacks( + algo, FrameworkStopConfig(required_metrics="all"), {} + ) + assert 
framework_stopper is None + + +# --------------------------------------------------------------------------- +# _RequiredMetricsAboveThresholdStopper +# --------------------------------------------------------------------------- + + +def test_required_metrics_stopper_returns_false_before_first_update(): + stopper = _RequiredMetricsAboveThresholdStopper({"m1": 0.5}) + assert stopper(gepa_state=None) is False + assert stopper.last_triggered is False + + +def test_required_metrics_stopper_triggers_when_all_pass(): + stopper = _RequiredMetricsAboveThresholdStopper({"m1": 0.5, "m2": 0.3}) + stopper.update({"m1": 0.6, "m2": 0.4}) + assert stopper(gepa_state=None) is True + assert stopper.last_triggered is True + + +def test_required_metrics_stopper_does_not_trigger_when_one_below(): + stopper = _RequiredMetricsAboveThresholdStopper({"m1": 0.5, "m2": 0.3}) + stopper.update({"m1": 0.6, "m2": 0.2}) + assert stopper(gepa_state=None) is False + assert stopper.last_triggered is False + + +def test_required_metrics_stopper_last_triggered_is_sticky(): + """Once triggered, last_triggered remains True even if subsequent updates + fall back below thresholds (helps the run() stop_reason decision).""" + stopper = _RequiredMetricsAboveThresholdStopper({"m1": 0.5}) + stopper.update({"m1": 0.7}) + stopper(gepa_state=None) + assert stopper.last_triggered is True + stopper.update({"m1": 0.1}) + stopper(gepa_state=None) + assert stopper.last_triggered is True + + +def test_required_metrics_stopper_empty_thresholds_never_triggers(): + stopper = _RequiredMetricsAboveThresholdStopper({}) + stopper.update({"m1": 0.9}) + assert stopper(gepa_state=None) is False + + +# --------------------------------------------------------------------------- +# _build_optimize_result +# --------------------------------------------------------------------------- + + +def test_build_optimize_result_maps_best_and_baseline(): + baseline = {"instruction": "baseline text"} + candidates = [ + {"instruction": 
"baseline text"}, + {"instruction": "candidate v1"}, + {"instruction": "candidate v2 (best)"}, + ] + gepa_result = _FakeGEPAResult( + candidates=candidates, + val_aggregate_scores=[0.5, 0.6, 0.9], + total_metric_calls=42, + ) + + started = datetime(2026, 5, 15, 10, 0, 0, tzinfo=timezone.utc) + finished = datetime(2026, 5, 15, 10, 5, 0, tzinfo=timezone.utc) + + result = _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline, + best_candidate=candidates[2], + reflection_lm_cost=1.23, + started_at=started, + finished_at=finished, + algo_name="gepa_reflective", + ) + + assert result.status == "SUCCEEDED" + assert result.finish_reason == "completed" + assert result.baseline_pass_rate == pytest.approx(0.5) + assert result.best_pass_rate == pytest.approx(0.9) + assert result.pass_rate_improvement == pytest.approx(0.4) + assert result.baseline_prompts == baseline + assert result.best_prompts == candidates[2] + assert result.total_rounds == 2 + assert result.total_llm_cost == pytest.approx(1.23) + assert result.algorithm == "gepa_reflective" + assert result.extras["total_metric_calls"] == 42 + + +def test_build_optimize_result_produces_round_records(): + baseline = {"instruction": "v0"} + candidates = [ + {"instruction": "v0"}, + {"instruction": "v1"}, + {"instruction": "v2"}, + ] + gepa_result = _FakeGEPAResult( + candidates=candidates, + val_aggregate_scores=[0.3, 0.7, 0.5], + ) + + result = _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline, + best_candidate=candidates[1], + reflection_lm_cost=0.0, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + algo_name="gepa_reflective", + ) + + assert len(result.rounds) == 2 + round1 = result.rounds[0] + assert round1.round == 1 + assert round1.candidate_prompts == candidates[1] + assert round1.validation_pass_rate == pytest.approx(0.7) + assert round1.accepted is True + + round2 = result.rounds[1] + assert round2.round == 2 + assert 
round2.candidate_prompts == candidates[2] + assert round2.accepted is False + + +def test_build_optimize_result_forwards_metric_thresholds(): + """metric_thresholds gets copied through to OptimizeResult so reporters and + summary.txt can show baseline / best alongside the per-metric PASS bar.""" + baseline = {"instruction": "v0"} + gepa_result = _FakeGEPAResult( + candidates=[baseline, {"instruction": "v1"}], + val_aggregate_scores=[0.4, 0.9], + ) + result = _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline, + best_candidate={"instruction": "v1"}, + reflection_lm_cost=0.0, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + algo_name="gepa_reflective", + metric_thresholds={ + "final_response_avg_score": 0.5, + "response_match_score": 0.3, + }, + ) + assert result.metric_thresholds == { + "final_response_avg_score": 0.5, + "response_match_score": 0.3, + } + + +def test_build_failed_result_carries_metric_thresholds(): + """Even on FAILED runs the user should still see the configured thresholds + so summary.txt does not look like the metrics had no acceptance bar at all. 
+ """ + result = _build_failed_result( + baseline_prompts={"instruction": "v0"}, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + error_message="boom", + algo_name="gepa_reflective", + metric_thresholds={"final_response_avg_score": 0.5}, + ) + assert result.status == "FAILED" + assert result.metric_thresholds == {"final_response_avg_score": 0.5} + + +def test_build_optimize_result_forwards_baseline_and_best_breakdowns(): + """B1: baseline_metric_breakdown is passed through; best_metric_breakdown is + pulled from the round whose candidate_prompts matches best_candidate.""" + from trpc_agent_sdk.evaluation._optimize_result import RoundRecord + + baseline = {"instruction": "v0"} + candidates = [baseline, {"instruction": "v1"}, {"instruction": "v2"}] + gepa_result = _FakeGEPAResult( + candidates=candidates, + val_aggregate_scores=[0.4, 0.6, 0.9], + ) + callback_rounds = [ + RoundRecord( + round=1, + optimized_field_names=["instruction"], + candidate_prompts=candidates[1], + train_pass_rate=0.0, + validation_pass_rate=0.6, + metric_breakdown={"final_response_avg_score": 0.6}, + accepted=False, + acceptance_reason="explored", + started_at="2026-05-17T10:00:00Z", + duration_seconds=1.0, + ), + RoundRecord( + round=2, + optimized_field_names=["instruction"], + candidate_prompts=candidates[2], + train_pass_rate=0.0, + validation_pass_rate=0.9, + metric_breakdown={"final_response_avg_score": 0.9}, + accepted=True, + acceptance_reason="best", + started_at="2026-05-17T10:00:02Z", + duration_seconds=1.0, + ), + ] + result = _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline, + best_candidate=candidates[2], + reflection_lm_cost=0.5, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + algo_name="gepa_reflective", + callback_rounds=callback_rounds, + baseline_metric_breakdown={"final_response_avg_score": 0.4}, + total_reflection_lm_calls=5, + total_judge_model_calls=12, + 
total_judge_cost=0.25, + total_token_usage={"prompt": 100, "completion": 50, "total": 150}, + ) + + assert result.baseline_metric_breakdown == {"final_response_avg_score": 0.4} + assert result.best_metric_breakdown == {"final_response_avg_score": 0.9} + assert result.total_reflection_lm_calls == 5 + assert result.total_judge_model_calls == 12 + assert result.total_llm_cost == pytest.approx(0.75) # 0.5 (reflection) + 0.25 (judge) + assert result.total_token_usage == {"prompt": 100, "completion": 50, "total": 150} + + +def test_build_optimize_result_forwards_stop_reason(): + baseline = {"instruction": "v0"} + gepa_result = _FakeGEPAResult( + candidates=[baseline, {"instruction": "v1"}], + val_aggregate_scores=[0.4, 0.9], + ) + result = _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline, + best_candidate={"instruction": "v1"}, + reflection_lm_cost=0.0, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + algo_name="gepa_reflective", + stop_reason="required_metrics_passing", + ) + assert result.stop_reason == "required_metrics_passing" + + +def test_build_optimize_result_stop_reason_defaults_to_none(): + baseline = {"instruction": "v0"} + gepa_result = _FakeGEPAResult( + candidates=[baseline, {"instruction": "v1"}], + val_aggregate_scores=[0.4, 0.9], + ) + result = _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline, + best_candidate={"instruction": "v1"}, + reflection_lm_cost=0.0, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + algo_name="gepa_reflective", + ) + assert result.stop_reason is None + + +def test_build_optimize_result_pass_rate_improvement_can_be_zero(): + baseline = {"instruction": "v"} + gepa_result = _FakeGEPAResult( + candidates=[baseline, dict(baseline)], + val_aggregate_scores=[0.8, 0.8], + ) + result = _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline, + best_candidate=baseline, + 
reflection_lm_cost=0.0, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + algo_name="gepa_reflective", + ) + assert result.pass_rate_improvement == pytest.approx(0.0) + + +def test_build_optimize_result_mirrors_baseline_breakdown_when_baseline_is_best(): + """R2: when ``best_idx == 0`` (gepa found no improvement), the + iteration-0 baseline evaluation is recorded as + ``baseline_metric_breakdown`` rather than as a RoundRecord, so the + rounds list never contains a record matching the seed prompts. + Without the fallback, ``best_metric_breakdown`` would stay empty and + ``summary.txt`` would render the ``best`` column as ``nan``, looking + like data loss instead of "no improvement". + """ + baseline = {"instruction": "v0"} + gepa_result = _FakeGEPAResult( + candidates=[baseline], # only the seed candidate + val_aggregate_scores=[0.6667], + ) + result = _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline, + best_candidate=baseline, # baseline IS the best + reflection_lm_cost=0.0, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + algo_name="gepa_reflective", + baseline_metric_breakdown={ + "final_response_avg_score": 0.6667, + "tool_trajectory_avg_score": 0.5, + }, + ) + + assert result.best_metric_breakdown == { + "final_response_avg_score": 0.6667, + "tool_trajectory_avg_score": 0.5, + } + # And it should match the baseline breakdown 1:1. + assert result.best_metric_breakdown == result.baseline_metric_breakdown + + +def test_build_optimize_result_does_not_mirror_when_a_round_already_matches(): + """The mirror-from-baseline fallback must NOT overwrite a real round + breakdown — if a RoundRecord matches ``best_candidate`` (e.g. the + candidate happens to equal baseline as a string but a round still + re-evaluated it on the valset), prefer the round's actual + metric_breakdown. 
+ """ + baseline = {"instruction": "v0"} + # callback_rounds carries a record matching baseline with REAL data. + from trpc_agent_sdk.evaluation._optimize_result import RoundRecord + + callback_rounds = [ + RoundRecord( + round=1, + optimized_field_names=["instruction"], + candidate_prompts=baseline, + train_pass_rate=0.0, + validation_pass_rate=0.6667, + metric_breakdown={"final_response_avg_score": 0.7}, + accepted=False, + acceptance_reason="explored", + started_at=datetime.now(timezone.utc).isoformat(), + duration_seconds=0.1, + ), + ] + gepa_result = _FakeGEPAResult( + candidates=[baseline, baseline], + val_aggregate_scores=[0.6667, 0.6667], + ) + result = _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline, + best_candidate=baseline, + reflection_lm_cost=0.0, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + algo_name="gepa_reflective", + callback_rounds=callback_rounds, + baseline_metric_breakdown={"final_response_avg_score": 0.0}, # different! + ) + + # Round's real data wins; baseline_metric_breakdown is NOT used. + assert result.best_metric_breakdown == {"final_response_avg_score": 0.7} + + +def test_build_optimize_result_no_mirror_when_baseline_breakdown_empty(): + """When both ``baseline_metric_breakdown`` and any matching round + record are empty, ``best_metric_breakdown`` stays empty — there is + simply no data to mirror. 
+ """ + baseline = {"instruction": "v0"} + gepa_result = _FakeGEPAResult( + candidates=[baseline], + val_aggregate_scores=[0.0], + ) + result = _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline, + best_candidate=baseline, + reflection_lm_cost=0.0, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + algo_name="gepa_reflective", + # baseline_metric_breakdown is omitted (None → empty dict) + ) + + assert result.best_metric_breakdown == {} + + +# --------------------------------------------------------------------------- +# _build_failed_result +# --------------------------------------------------------------------------- + + +def test_build_failed_result_marks_status_failed(): + baseline = {"instruction": "v0"} + started = datetime(2026, 5, 15, 10, 0, 0, tzinfo=timezone.utc) + finished = datetime(2026, 5, 15, 10, 0, 1, tzinfo=timezone.utc) + + result = _build_failed_result( + baseline_prompts=baseline, + started_at=started, + finished_at=finished, + error_message="boom", + algo_name="gepa_reflective", + ) + + assert result.status == "FAILED" + assert result.finish_reason == "error" + assert result.error_message == "boom" + assert result.baseline_prompts == baseline + assert result.best_prompts == baseline + assert result.baseline_pass_rate == 0.0 + assert result.best_pass_rate == 0.0 + assert result.total_rounds == 0 + assert result.algorithm == "gepa_reflective" + + +# --------------------------------------------------------------------------- +# GepaReflectiveOptimizer construction and run +# --------------------------------------------------------------------------- + + +def test_optimizer_constructor_stores_dataset_paths(): + optimizer = _make_optimizer(train_path="/tmp/t.json", val_path="/tmp/v.json") + assert optimizer.train_dataset_path == "/tmp/t.json" + assert optimizer.validation_dataset_path == "/tmp/v.json" + + +@pytest.mark.asyncio +async def 
test_optimizer_run_returns_best_without_writing_back(tmp_path, monkeypatch): + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + recorder: dict[str, str] = {} + target = _new_target_prompt(recorder) + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.5, 0.9], + total_metric_calls=20, + ) + + captured: dict = {} + + async def fake_call_gepa(self, **kwargs): + captured["kwargs"] = kwargs + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await optimizer.run() + + assert result.status == "SUCCEEDED" + assert result.best_pass_rate == pytest.approx(0.9) + assert result.best_prompts == {"instruction": "improved"} + # BaseOptimizer.run() must not write back; the AgentOptimizer facade is the + # sole owner of the write-back path (gated by ``update_source``). + # The recorder may stay empty here because gepa.optimize is mocked and never + # actually invokes adapter.evaluate(...); what matters is that ``result`` + # exposes the best prompts without persisting them. 
+ assert recorder.get("instruction") != "improved" + + kwargs = captured["kwargs"] + assert kwargs["seed_candidate"] == {"instruction": "initial"} + assert len(kwargs["trainset"]) == 1 + assert len(kwargs["valset"]) == 1 + assert kwargs["reflection_lm"] is not None + assert isinstance(kwargs["adapter"], _AgentGEPAAdapter) + assert kwargs["candidate_selection_strategy"] == "pareto" + assert kwargs["module_selector"] == "round_robin" + assert kwargs["seed"] == 42 + # The reflection prompt template must reach gepa.optimize and keep both + # placeholders so GEPA's InstructionProposalSignature validation passes. + template = kwargs.get("reflection_prompt_template", "") + assert "<curr_param>" in template + assert "<side_info>" in template + + +@pytest.mark.asyncio +async def test_optimizer_run_injects_metric_reference_doc_into_reflection_template( + tmp_path, monkeypatch +): + """For built-in criterion-based metrics, the metric reference doc must + travel into gepa.optimize's reflection_prompt_template so the reflection + LM understands what every per-case feedback row means.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + # Use a real criterion-based built-in metric so the doc renders actual + # content (not the empty-doc fallback path covered by the previous test). 
+ config = OptimizeConfigFile( + evaluate=EvalConfig( + metrics=[{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "contains"}}}, + }], + num_runs=1, + ), + optimize=OptimizeConfig( + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions( + provider_name="openai", + model_name="gpt-4o", + api_key="test-key", + ), + max_metric_calls=30, + ), + ), + ) + optimizer = GepaReflectiveOptimizer( + config=config, + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + captured: dict = {} + + async def fake_call_gepa(self, **kwargs): + captured["kwargs"] = kwargs + return _FakeGEPAResult( + candidates=[{"instruction": "initial"}], + val_aggregate_scores=[1.0], + total_metric_calls=10, + ) + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + await optimizer.run() + + template = captured["kwargs"]["reflection_prompt_template"] + # Required GEPA placeholders preserved + assert "<curr_param>" in template + assert "<side_info>" in template + # The injected metric doc surfaces its metric name and config knobs + assert "final_response_avg_score" in template + assert "contains" in template + # The metric doc sits between <curr_param> and <side_info> + assert template.index("<curr_param>") < template.index("final_response_avg_score") + assert template.index("final_response_avg_score") < template.index("<side_info>") + + +@pytest.mark.asyncio +async def test_optimizer_run_surfaces_per_metric_best_candidates(tmp_path, monkeypatch): + """When GEPA reports per_objective_best_candidates, OptimizeResult must + forward it (converting set -> sorted list) so users can see which + candidate excels on which metric independent of the aggregated best.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = 
tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.4, 0.9], + total_metric_calls=20, + per_objective_best_candidates={ + "final_response_avg_score": {1}, + "llm_rubric_response": {0, 1}, + }, + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await optimizer.run() + + assert result.per_metric_best_candidates == { + "final_response_avg_score": [1], + "llm_rubric_response": [0, 1], + } + + +@pytest.mark.asyncio +async def test_optimizer_run_per_metric_best_candidates_empty_when_gepa_omits_it( + tmp_path, monkeypatch +): + """Older GEPA builds or algorithms without per-objective tracking return + ``per_objective_best_candidates=None``; OptimizeResult must keep an empty + dict (not raise) so consumers can rely on the field always being a dict.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result 
= _FakeGEPAResult( + candidates=[{"instruction": "x"}], + val_aggregate_scores=[0.5], + total_metric_calls=5, + per_objective_best_candidates=None, + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await optimizer.run() + assert result.per_metric_best_candidates == {} + + +@pytest.mark.asyncio +async def test_optimizer_run_returns_failed_when_baseline_evaluation_raises(tmp_path, monkeypatch): + """If the explicit baseline evaluation throws, surface a FAILED result with + the captured error message instead of propagating a raw exception.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + recorder: dict[str, str] = {} + target = _new_target_prompt(recorder) + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + def explode(self, *args, **kwargs): + raise RuntimeError("evaluator exploded during baseline") + + monkeypatch.setattr(_AgentGEPAAdapter, "evaluate", explode) + + result = await optimizer.run() + assert result.status == "FAILED" + assert result.finish_reason == "error" + assert "evaluator exploded during baseline" in result.error_message + assert result.best_prompts == result.baseline_prompts + + +@pytest.mark.asyncio +async def test_optimizer_run_stop_reason_required_metrics_passing( + tmp_path, monkeypatch +): + """When the framework stopper fires (its last_triggered flips True before + gepa returns), run() must persist stop_reason='required_metrics_passing'.""" + 
train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.5, 0.9], + total_metric_calls=15, + ) + + async def fake_call_gepa(self, **kwargs): + for s in kwargs["stop_callbacks"]: + if isinstance(s, _RequiredMetricsAboveThresholdStopper): + s.update({"m1": 0.9}) + s(gepa_state=None) + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + result = await optimizer.run() + assert result.status == "SUCCEEDED" + assert result.stop_reason == "required_metrics_passing" + + +@pytest.mark.asyncio +async def test_optimizer_run_stop_reason_completed_when_no_stopper_fires( + tmp_path, monkeypatch +): + """When gepa returns without firing any wrapped stopper (mock path), + stop_reason must be 'completed' rather than the legacy 'budget_exhausted' + catch-all so users can tell apart "loop drained naturally" from a real + budget cap hit.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + 
config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.5, 0.6], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + result = await optimizer.run() + assert result.status == "SUCCEEDED" + assert result.stop_reason == "completed" + + +@pytest.mark.asyncio +async def test_optimizer_run_stop_reason_no_improvement_when_that_stopper_fires( + tmp_path, monkeypatch +): + """When the wrapped NoImprovementStopper signals last_triggered (by gepa + polling it past the configured patience), stop_reason must be + 'no_improvement' so reporters and summary.txt can attribute the stop + correctly instead of falsely blaming the budget.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + config=_make_config(max_iterations_without_improvement=3), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.5, 0.6], + ) + + async def fake_call_gepa(self, **kwargs): + for stopper in kwargs["stop_callbacks"]: + if isinstance(stopper, _LabeledStopper) and stopper.label == "no_improvement": + stopper.last_triggered = True + break + return 
fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + result = await optimizer.run() + assert result.status == "SUCCEEDED" + assert result.stop_reason == "no_improvement" + + +@pytest.mark.asyncio +async def test_optimizer_run_stop_reason_budget_exhausted_when_max_metric_calls_fires( + tmp_path, monkeypatch +): + """When MaxMetricCallsStopper is the only fired wrapper, stop_reason is + 'budget_exhausted'. This locks the label mapping for the legacy + catch-all so a budget cap hit still carries the historical name users + see in reports.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.5, 0.6], + ) + + async def fake_call_gepa(self, **kwargs): + for stopper in kwargs["stop_callbacks"]: + if isinstance(stopper, _LabeledStopper) and stopper.label == "budget_exhausted": + stopper.last_triggered = True + break + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + result = await optimizer.run() + assert result.status == "SUCCEEDED" + assert result.stop_reason == "budget_exhausted" + + +def test_labeled_stopper_records_last_triggered_only_when_inner_returns_true(): + """``_LabeledStopper.__call__`` delegates the return value to the inner + stopper and flips ``last_triggered`` sticky once 
the inner ever returns + True; subsequent False results never clear the flag.""" + calls: list[bool] = [] + + class _ScriptedInner: + def __call__(self, *_args, **_kwargs): + return calls.pop(0) + + wrapper = _LabeledStopper(_ScriptedInner(), "no_improvement") + assert wrapper.label == "no_improvement" + assert wrapper.last_triggered is False + + calls.extend([False, True, False]) + assert wrapper() is False + assert wrapper.last_triggered is False + assert wrapper() is True + assert wrapper.last_triggered is True + assert wrapper() is False + assert wrapper.last_triggered is True + + +def test_build_stop_callbacks_wraps_each_gepa_stopper_with_a_labeled_stopper(): + """Every algorithm-side stop knob the user enables must end up wrapped in + a ``_LabeledStopper`` carrying the matching label, so the optimizer can + classify ``stop_reason`` precisely after gepa returns.""" + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(provider_name="openai", model_name="m"), + max_metric_calls=10, + max_iterations_without_improvement=3, + timeout_seconds=60.0, + score_threshold=0.95, + max_candidate_proposals=5, + max_tracked_candidates=4, + ) + stop_callbacks, _framework = _build_stop_callbacks( + algo=algo, + stop_config=FrameworkStopConfig(required_metrics=None), + metric_thresholds={"m1": 1.0}, + ) + labels = { + s.label + for s in stop_callbacks + if isinstance(s, _LabeledStopper) + } + assert labels == { + "budget_exhausted", + "no_improvement", + "timeout", + "score_threshold", + "max_candidate_proposals", + "max_tracked_candidates", + } + + +def test_classify_stop_reason_prefers_framework_stopper_over_labeled_ones(): + """When both the framework stopper and a labeled gepa stopper fired in + the same run, ``required_metrics_passing`` wins because it represents + the user's explicit opt-in stop policy.""" + framework = _RequiredMetricsAboveThresholdStopper({"m": 0.5}) + framework.last_triggered = True + labeled = 
_LabeledStopper(lambda *_: False, "no_improvement") + labeled.last_triggered = True + assert ( + _classify_stop_reason( + stop_callbacks=[labeled, framework], + framework_stopper=framework, + ) + == "required_metrics_passing" + ) + + +def test_classify_stop_reason_returns_completed_when_no_stopper_fires(): + """No stopper triggered ⇒ gepa loop ended naturally. The ``completed`` + label distinguishes this from any real stop cap so users can tell the + difference in summary.txt and the terminal banner.""" + framework = _RequiredMetricsAboveThresholdStopper({"m": 0.5}) + labeled = _LabeledStopper(lambda *_: False, "timeout") + assert ( + _classify_stop_reason( + stop_callbacks=[labeled, framework], + framework_stopper=framework, + ) + == "completed" + ) + + +@pytest.mark.asyncio +async def test_optimizer_run_wires_stopper_update_into_callback( + tmp_path, monkeypatch +): + """The callback must receive the stopper's update as on_valset_breakdown so + in a real gepa run the stopper's _latest tracks the most recent valset.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.5, 0.9], + ) + captured: dict = {} + + async def fake_call_gepa(self, **kwargs): + captured["stop_callbacks"] = kwargs["stop_callbacks"] + captured["gepa_callback"] = kwargs["callbacks"][0] + return fake_gepa_result + + 
monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + await optimizer.run() + stopper = next( + s + for s in captured["stop_callbacks"] + if isinstance(s, _RequiredMetricsAboveThresholdStopper) + ) + gepa_callback = captured["gepa_callback"] + assert gepa_callback._on_valset_breakdown == stopper.update + + +@pytest.mark.asyncio +async def test_optimizer_run_returns_failed_when_gepa_raises(tmp_path, monkeypatch): + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + recorder: dict[str, str] = {"instruction": "initial"} + target = _new_target_prompt(recorder) + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + async def fake_call_gepa(self, **kwargs): + raise RuntimeError("simulated gepa failure") + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await optimizer.run() + + assert result.status == "FAILED" + assert result.finish_reason == "error" + assert "simulated gepa failure" in result.error_message + assert recorder["instruction"] == "initial" + + +def test_stop_reason_literal_includes_user_requested_stop() -> None: + from typing import get_args + + from trpc_agent_sdk.evaluation._optimize_result import StopReason + + assert "user_requested_stop" in get_args(StopReason) + + +def test_optimizer_constructor_stores_output_dir(tmp_path) -> None: + """BaseOptimizer surfaces output_dir so subclasses can wire FileStopper.""" + config = OptimizeConfigFile( + evaluate=EvalConfig( + metrics=[{"metric_name": "m", "threshold": 
0.5}], + num_runs=1, + ), + optimize=OptimizeConfig( + stop=FrameworkStopConfig(required_metrics=None), + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + ), + ), + ) + + async def _call_agent(_q: str) -> str: + return "" + + target_prompt = TargetPrompt().add_path("p", str(tmp_path / "p.md")) + (tmp_path / "p.md").write_text("seed", encoding="utf-8") + + opt = GepaReflectiveOptimizer( + config=config, + call_agent=_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(tmp_path / "t.json"), + validation_dataset_path=str(tmp_path / "v.json"), + output_dir=str(tmp_path / "runs/x"), + ) + + assert opt.output_dir == str(tmp_path / "runs/x") + + +def test_build_stop_callbacks_installs_file_stopper_when_output_dir_set(tmp_path) -> None: + """When output_dir is provided, FileStopper labels new stops as user_requested_stop.""" + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=10, + ) + + callbacks, _ = _build_stop_callbacks( + algo, + FrameworkStopConfig(required_metrics=None), + metric_thresholds={}, + output_dir=str(tmp_path), + ) + + labels = [cb.label for cb in callbacks if isinstance(cb, _LabeledStopper)] + assert "user_requested_stop" in labels + + +def test_file_stopper_fires_after_optimize_stop_file_appears(tmp_path) -> None: + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=10, + ) + + callbacks, _ = _build_stop_callbacks( + algo, + FrameworkStopConfig(required_metrics=None), + metric_thresholds={}, + output_dir=str(tmp_path), + ) + stopper = next( + cb for cb in callbacks + if isinstance(cb, _LabeledStopper) + and cb.label == "user_requested_stop" + ) + + assert stopper(gepa_state=None) is False + (tmp_path / "optimize.stop").write_text("", encoding="utf-8") + assert stopper(gepa_state=None) is True + assert stopper.last_triggered is True + + +def 
test_build_stop_callbacks_skips_file_stopper_when_output_dir_none() -> None: + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=10, + ) + + callbacks, _ = _build_stop_callbacks( + algo, + FrameworkStopConfig(required_metrics=None), + metric_thresholds={}, + output_dir=None, + ) + + labels = [cb.label for cb in callbacks if isinstance(cb, _LabeledStopper)] + assert "user_requested_stop" not in labels + + +def test_run_forwards_reflection_history_top_k_into_adapter(tmp_path, monkeypatch): + """algo.reflection_history_top_k must reach the adapter constructor as top_k_per_case.""" + import asyncio + import json + from types import SimpleNamespace + + from trpc_agent_sdk.evaluation._eval_config import EvalConfig + from trpc_agent_sdk.evaluation._optimize_config import ( + FrameworkStopConfig, + GepaReflectiveAlgo, + OptimizeConfig, + OptimizeConfigFile, + ) + from trpc_agent_sdk.evaluation._optimize_gepa_reflective import ( + GepaReflectiveOptimizer, + ) + from trpc_agent_sdk.evaluation._optimize_model_options import ( + OptimizeModelOptions, + ) + from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt + + async def _call_agent(_q: str) -> str: + return "" + + (tmp_path / "p.md").write_text("seed", encoding="utf-8") + train_path = tmp_path / "t.json" + val_path = tmp_path / "v.json" + train_path.write_text( + json.dumps({"eval_set_id": "t", "eval_cases": []}), encoding="utf-8" + ) + val_path.write_text( + json.dumps({"eval_set_id": "v", "eval_cases": []}), encoding="utf-8" + ) + + captured_kwargs: dict = {} + + def fake_init(self, **kwargs): + captured_kwargs.update(kwargs) + self.target_prompt = kwargs["target_prompt"] + self.eval_config = kwargs["eval_config"] + self.call_agent = kwargs["call_agent"] + self.callbacks = kwargs.get("callbacks") + self.num_runs = kwargs.get("num_runs", 1) + self.case_parallelism = kwargs.get("case_parallelism") + self._top_k = int(kwargs.get("top_k_per_case", 0)) + 
self._best_history = {} + self.last_outcome = None + + monkeypatch.setattr( + "trpc_agent_sdk.evaluation._optimize_gepa_adapter._AgentGEPAAdapter.__init__", + fake_init, + ) + + async def _fake_call(self, **kwargs): + return SimpleNamespace( + best_idx=0, + candidates=[{"p": "seed"}], + val_aggregate_scores=[0.5], + ) + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", _fake_call) + + config = OptimizeConfigFile( + evaluate=EvalConfig(metrics=[{"metric_name": "m", "threshold": 0.5}]), + optimize=OptimizeConfig( + stop=FrameworkStopConfig(required_metrics=None), + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + reflection_history_top_k=3, + ), + ), + ) + target_prompt = TargetPrompt().add_path("p", str(tmp_path / "p.md")) + + opt = GepaReflectiveOptimizer( + config=config, + call_agent=_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=None, + ) + asyncio.run(opt.run()) + + assert captured_kwargs["top_k_per_case"] == 3 + + +def test_optimizer_constructor_stores_extra_callbacks(tmp_path) -> None: + """BaseOptimizer.__init__ must accept and store extra_stop/gepa_callbacks.""" + import json + + from trpc_agent_sdk.evaluation._eval_config import EvalConfig + from trpc_agent_sdk.evaluation._optimize_config import ( + FrameworkStopConfig, + GepaReflectiveAlgo, + OptimizeConfig, + OptimizeConfigFile, + ) + from trpc_agent_sdk.evaluation._optimize_model_options import ( + OptimizeModelOptions, + ) + from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt + + async def _call_agent(_q: str) -> str: + return "" + + (tmp_path / "p.md").write_text("seed", encoding="utf-8") + train_path = tmp_path / "t.json" + val_path = tmp_path / "v.json" + train_path.write_text( + json.dumps({"eval_set_id": "t", "eval_cases": []}), encoding="utf-8" + ) + val_path.write_text( + json.dumps({"eval_set_id": "v", 
"eval_cases": []}), encoding="utf-8" + ) + + config = OptimizeConfigFile( + evaluate=EvalConfig(metrics=[{"metric_name": "m", "threshold": 0.5}]), + optimize=OptimizeConfig( + stop=FrameworkStopConfig(required_metrics=None), + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + ), + ), + ) + target_prompt = TargetPrompt().add_path("p", str(tmp_path / "p.md")) + + def sentinel_stopper(gepa_state=None): + return False + + sentinel_callback = object() + + opt = GepaReflectiveOptimizer( + config=config, + call_agent=_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs/x"), + extra_stop_callbacks=[sentinel_stopper], + extra_gepa_callbacks=[sentinel_callback], + ) + + assert sentinel_stopper in opt.extra_stop_callbacks + assert sentinel_callback in opt.extra_gepa_callbacks + + +def test_run_extends_stop_callbacks_with_user_supplied_extras(tmp_path, monkeypatch): + """User-supplied extras must be appended to stop_callbacks and the callbacks list.""" + import asyncio + import json + from types import SimpleNamespace + + from trpc_agent_sdk.evaluation._eval_config import EvalConfig + from trpc_agent_sdk.evaluation._optimize_config import ( + FrameworkStopConfig, + GepaReflectiveAlgo, + OptimizeConfig, + OptimizeConfigFile, + ) + from trpc_agent_sdk.evaluation._optimize_gepa_reflective import ( + GepaReflectiveOptimizer, + ) + from trpc_agent_sdk.evaluation._optimize_model_options import ( + OptimizeModelOptions, + ) + from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt + + async def _call_agent(_q: str) -> str: + return "" + + (tmp_path / "p.md").write_text("seed", encoding="utf-8") + train_path = tmp_path / "t.json" + val_path = tmp_path / "v.json" + train_path.write_text( + json.dumps({"eval_set_id": "t", "eval_cases": []}), encoding="utf-8" + ) + val_path.write_text( + 
json.dumps({"eval_set_id": "v", "eval_cases": []}), encoding="utf-8" + ) + + def sentinel_stopper_a(gepa_state=None): + return False + + def sentinel_stopper_b(gepa_state=None): + return False + + sentinel_callback = SimpleNamespace(tag="user-cb") + + captured: dict = {} + + async def _fake_call(self, **kwargs): + captured["stop_callbacks"] = list(kwargs.get("stop_callbacks") or []) + captured["callbacks"] = list(kwargs.get("callbacks") or []) + return SimpleNamespace( + best_idx=0, + candidates=[{"p": "seed"}], + val_aggregate_scores=[0.5], + ) + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", _fake_call) + + config = OptimizeConfigFile( + evaluate=EvalConfig(metrics=[{"metric_name": "m", "threshold": 0.5}]), + optimize=OptimizeConfig( + stop=FrameworkStopConfig(required_metrics=None), + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + ), + ), + ) + target_prompt = TargetPrompt().add_path("p", str(tmp_path / "p.md")) + + opt = GepaReflectiveOptimizer( + config=config, + call_agent=_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=None, + extra_stop_callbacks=[sentinel_stopper_a, sentinel_stopper_b], + extra_gepa_callbacks=[sentinel_callback], + ) + asyncio.run(opt.run()) + + assert sentinel_stopper_a in captured["stop_callbacks"] + assert sentinel_stopper_b in captured["stop_callbacks"] + assert sentinel_callback in captured["callbacks"] diff --git a/tests/evaluation/test_optimize_metric_info.py b/tests/evaluation/test_optimize_metric_info.py new file mode 100644 index 00000000..c29ef522 --- /dev/null +++ b/tests/evaluation/test_optimize_metric_info.py @@ -0,0 +1,630 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Tests for the optimize-side metric reference doc builder. + +The doc is the static "syllabus" injected into the reflection LM's prompt +template alongside the dynamic per-case feedback. Every code path tested +here describes a knob the user can turn in optimizer.json's +``evaluate.metrics[]`` array, and the doc must render that knob so the +reflection LM understands how the metric scores its rewrites. +""" + +from __future__ import annotations + +import math + +import pytest + +from trpc_agent_sdk.evaluation._eval_config import EvalConfig +from trpc_agent_sdk.evaluation._optimize_metric_info import ( + build_metric_reference_doc, + build_metric_section, + build_reflection_prompt_template, +) + + +def _config_with(metric_dicts: list[dict]) -> EvalConfig: + """Wrap a list of metric dicts into an EvalConfig (Pydantic round-trip safe).""" + return EvalConfig(metrics=metric_dicts, num_runs=1) + + +# -------- Exclusion rules -------- + + +def test_skip_tool_trajectory_metric(): + cfg = _config_with([ + {"metric_name": "tool_trajectory_avg_score", "threshold": 1.0}, + ]) + doc = build_metric_reference_doc(cfg) + assert "tool_trajectory_avg_score" not in doc + + +def test_skip_rouge_metric(): + cfg = _config_with([ + {"metric_name": "response_match_score", "threshold": 0.5}, + ]) + doc = build_metric_reference_doc(cfg) + assert "response_match_score" not in doc + + +def test_empty_metrics_renders_placeholder(): + cfg = _config_with([]) + doc = build_metric_reference_doc(cfg) + assert doc.strip() # non-empty header at minimum + + +# -------- final_response_avg_score -------- + + +def test_final_response_text_contains_case_insensitive(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": True} + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "final_response_avg_score" in doc + assert "contains" in doc + assert "case-insensitive" 
in doc.lower() + assert "1.0000" in doc # threshold rendered + + +def test_final_response_text_exact_case_sensitive(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "exact"}}}, + }]) + doc = build_metric_reference_doc(cfg) + assert "exact" in doc + assert "case-sensitive" in doc.lower() + + +def test_final_response_text_regex_mode(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "regex"}}}, + }]) + doc = build_metric_reference_doc(cfg) + assert "regex" in doc + assert "re.search" in doc or "regular expression" in doc.lower() + + +def test_final_response_text_ignored(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "exact", "ignore": True}}}, + }]) + doc = build_metric_reference_doc(cfg) + # ignore=True means text comparison is skipped + assert "skipped" in doc.lower() or "ignore" in doc.lower() + + +def test_final_response_json_with_ignore_tree_and_tolerance(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "json": { + "ignore_tree": {"id": True, "meta": {"ts": True}}, + "number_tolerance": 0.001, + } + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "JSON" in doc + assert "ignore_tree" in doc or "ignored" in doc.lower() + assert "0.001" in doc + + +def test_final_response_text_and_json_combined_uses_and_logic(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": {"match": "exact"}, + "json": {"number_tolerance": 0.01}, + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "AND" in doc or "both" in doc.lower() + + +def test_final_response_custom_compare_overrides_text_and_json(): + """When 
the user registers a custom compare via EVALUATOR_REGISTRY, + the doc must explicitly tell the reflection LM that text/json + strategies are overridden by user code.""" + from trpc_agent_sdk.evaluation._evaluator_registry import EVALUATOR_REGISTRY + + def my_compare(actual, expected): # pragma: no cover - registered then removed + return True + + EVALUATOR_REGISTRY.set_criterion_compare("final_response_avg_score", my_compare) + try: + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "exact"}}}, + }]) + doc = build_metric_reference_doc(cfg) + assert "custom" in doc.lower() + assert "override" in doc.lower() + finally: + # cleanup: this is a global registry, leaking would affect later tests + EVALUATOR_REGISTRY._criterion_compares.pop("final_response_avg_score", None) + + +# -------- llm_rubric_response -------- + + +def test_llm_rubric_single_judge_with_rubrics(): + cfg = _config_with([{ + "metric_name": "llm_rubric_response", + "threshold": 0.66, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "glm-5.1-w4afp8", + "num_samples": 1, + "generation_config": {"max_tokens": 1024, "temperature": 0.2}, + }, + "rubrics": [ + { + "id": "numeric_correct", + "description": "数字答案与参考答案一致", + "content": {"text": "最终给出的数字答案是否与参考答案一致。"}, + }, + { + "id": "reasoning_clear", + "description": "推理步骤清晰", + "content": {"text": "回答中是否给出清晰的推理过程。"}, + }, + ], + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "llm_rubric_response" in doc + assert "glm-5.1-w4afp8" in doc + assert "numeric_correct" in doc + assert "数字答案与参考答案一致" in doc + assert "reasoning_clear" in doc + assert "0.6600" in doc + # judge config fields surfaced + assert "temperature=0.2" in doc + assert "max_tokens=1024" in doc + + +def test_llm_rubric_multi_judge_with_weighted_avg(): + cfg = _config_with([{ + "metric_name": "llm_rubric_response", + "threshold": 0.5, + "criterion": { + "llm_judge": { + 
"judge_models": [ + {"model_name": "judge-A", "weight": 2.0}, + {"model_name": "judge-B", "weight": 1.0}, + ], + "models_aggregator": "weighted_avg", + "parallel": True, + "rubrics": [ + {"id": "r1", "description": "d1", "content": {"text": "rubric text 1"}}, + ], + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "judge-A" in doc + assert "judge-B" in doc + assert "weight=2.0" in doc + assert "weighted_avg" in doc + assert "parallel" in doc.lower() + + +@pytest.mark.parametrize("aggregator,must_contain", [ + ("all_pass", "all"), + ("any_pass", "any"), + ("majority_pass", "majority"), + ("avg", "mean"), + ("weighted_avg", "weighted"), + ("weighted_majority", "weighted"), +]) +def test_each_aggregator_has_explanation(aggregator, must_contain): + cfg = _config_with([{ + "metric_name": "llm_rubric_response", + "threshold": 0.5, + "criterion": { + "llm_judge": { + "judge_models": [ + {"model_name": "j1", "weight": 1.0}, + {"model_name": "j2", "weight": 1.0}, + ], + "models_aggregator": aggregator, + "rubrics": [{"id": "r1", "description": "d", "content": {"text": "x"}}], + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert aggregator in doc + assert must_contain.lower() in doc.lower() + + +def test_llm_rubric_threshold_translates_to_min_pass_count(): + cfg = _config_with([{ + "metric_name": "llm_rubric_response", + "threshold": 0.66, + "criterion": { + "llm_judge": { + "judge_model": {"model_name": "j1"}, + "rubrics": [ + {"id": f"r{i}", "description": "d", "content": {"text": "x"}} + for i in range(3) + ], + } + }, + }]) + doc = build_metric_reference_doc(cfg) + # 0.66 * 3 = 1.98 -> ceil = 2; reflection LM needs to see this concretely + min_pass = math.ceil(0.66 * 3) + assert str(min_pass) in doc + + +# -------- llm_rubric_knowledge_recall -------- + + +def test_llm_rubric_knowledge_recall_renders_tool_names(): + cfg = _config_with([{ + "metric_name": "llm_rubric_knowledge_recall", + "threshold": 0.5, + "criterion": { + "llm_judge": { + 
"judge_model": {"model_name": "j1"}, + "rubrics": [{"id": "kr1", "description": "d", "content": {"text": "k"}}], + "knowledge_tool_names": ["search_docs", "retrieve_chunks"], + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "search_docs" in doc + assert "retrieve_chunks" in doc + assert "knowledge" in doc.lower() + + +def test_llm_rubric_knowledge_recall_default_tools_noted_when_unset(): + cfg = _config_with([{ + "metric_name": "llm_rubric_knowledge_recall", + "threshold": 0.5, + "criterion": { + "llm_judge": { + "judge_model": {"model_name": "j1"}, + "rubrics": [{"id": "kr1", "description": "d", "content": {"text": "k"}}], + } + }, + }]) + doc = build_metric_reference_doc(cfg) + # default knowledge tool set should be mentioned + assert "default" in doc.lower() + + +# -------- llm_final_response -------- + + +def test_llm_final_response_binary_judge(): + cfg = _config_with([{ + "metric_name": "llm_final_response", + "threshold": 1.0, + "criterion": { + "llm_judge": { + "judge_model": {"model_name": "j1"}, + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "llm_final_response" in doc + assert "binary" in doc.lower() or "valid" in doc.lower() + + +# -------- Cross-cutting -------- + + +def test_metrics_listed_in_user_configured_order(): + cfg = _config_with([ + { + "metric_name": "llm_rubric_response", + "threshold": 0.5, + "criterion": {"llm_judge": { + "judge_model": {"model_name": "j1"}, + "rubrics": [{"id": "r1", "description": "d", "content": {"text": "x"}}], + }}, + }, + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "exact"}}}, + }, + ]) + doc = build_metric_reference_doc(cfg) + assert doc.index("llm_rubric_response") < doc.index("final_response_avg_score") + + +def test_doc_contains_per_case_feedback_field_list(): + cfg = _config_with([{ + "metric_name": "llm_rubric_response", + "threshold": 0.5, + "criterion": {"llm_judge": { + "judge_model": {"model_name": 
"j1"}, + "rubrics": [{"id": "r1", "description": "d", "content": {"text": "x"}}], + }}, + }]) + doc = build_metric_reference_doc(cfg) + # rubric metric must tell the LM that rubric_scores appear in per-case feedback + assert "rubric_scores" in doc + assert "reason" in doc + + +def test_doc_contains_rewriting_guidelines_section(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "exact"}}}, + }]) + doc = build_metric_reference_doc(cfg) + # the footer "rewriting rules" is essential — it tells the LM how to use + # the per-metric info above when proposing changes + assert "Rewriting" in doc or "Guideline" in doc or "Preserve" in doc + + +def test_build_metric_section_returns_markdown_for_single_metric(): + from trpc_agent_sdk.evaluation._eval_metrics import EvalMetric + + metric = EvalMetric( + metric_name="final_response_avg_score", + threshold=1.0, + criterion={"final_response": {"text": {"match": "contains"}}}, + ) + section = build_metric_section(metric) + assert "final_response_avg_score" in section + assert "contains" in section + assert "1.0000" in section + + +def test_quickstart_config_renders_complete_doc(): + """End-to-end smoke test using a close clone of quickstart/optimizer.json.""" + cfg = _config_with([ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": {"text": {"match": "contains", "case_insensitive": True}} + }, + }, + { + "metric_name": "llm_rubric_response", + "threshold": 0.66, + "criterion": {"llm_judge": { + "judge_model": { + "model_name": "glm-5.1-w4afp8", + "num_samples": 1, + "generation_config": {"max_tokens": 1024, "temperature": 0.2}, + }, + "rubrics": [ + {"id": "numeric_correct", "description": "数字答案与参考答案一致", + "content": {"text": "最终给出的数字答案是否与参考答案一致。"}}, + {"id": "reasoning_clear", "description": "推理步骤清晰", + "content": {"text": "回答中是否给出清晰、可追溯的推理或计算步骤。"}}, + {"id": "units_present", 
"description": "答案带正确单位", + "content": {"text": "最终数字答案是否带有正确的单位。"}}, + ], + }}, + }, + ]) + doc = build_metric_reference_doc(cfg) + + # Both metrics surface + assert "final_response_avg_score" in doc + assert "llm_rubric_response" in doc + + # final_response_avg_score config knobs + assert "contains" in doc + assert "case-insensitive" in doc.lower() + + # llm_rubric_response judge config + assert "glm-5.1-w4afp8" in doc + assert "temperature=0.2" in doc + + # All three rubrics with their bodies + for rid in ("numeric_correct", "reasoning_clear", "units_present"): + assert rid in doc + + # Thresholds rendered + assert "1.0000" in doc + assert "0.6600" in doc + + # Min-pass count for rubric metric (ceil(0.66 * 3) = 2) + assert " 2" in doc or "2 " in doc + + +# -------- build_reflection_prompt_template -------- + + +def test_reflection_prompt_template_keeps_required_placeholders(): + """GEPA validates the template — both and + must remain or gepa.optimize raises.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + assert "" in template + assert "" in template + + +def test_reflection_prompt_template_embeds_metric_doc_between_placeholders(): + metric_doc = "## Metrics Reference\n\nMARKER_FOR_TEST\n" + template = build_reflection_prompt_template(metric_doc) + assert "MARKER_FOR_TEST" in template + # placement: metric doc sits AFTER (current prompt) so the LM + # has the current text first, then learns the metrics, then sees feedback + assert template.index("") < template.index("MARKER_FOR_TEST") + assert template.index("MARKER_FOR_TEST") < template.index("") + + +def test_reflection_prompt_template_handles_empty_metric_doc(): + """When metric_doc is empty (no eligible metrics), template still must be + a valid GEPA template — placeholders intact, no spurious markdown.""" + template = build_reflection_prompt_template("") + assert "" in template + assert "" in template + # GEPA will validate; no exception means template is well-formed + 
+ +def test_reflection_prompt_template_does_not_inline_describe_self_evident_fields(): + """GEPA's prompt_renderer emits every record-dict key as ``## `` + markdown header automatically. For keys whose meaning is self-evident + from the header alone (``case_id`` — obviously an identifier), our + static template must NOT re-narrate them ahead of ````. + + The template is allowed (and expected) to keep semantic guidance GEPA + cannot infer from markdown alone: the score's [0, 1] aggregate range, + the ``Case Body`` inner turn-sliced format, the ``Tool Trace`` line + grammar, and the ``Other Active Components`` cross-component context. + """ + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + pre_side_info = template.split("", 1)[0] + + # case_id should be left fully self-evident — the header name says it + # all, no narration needed. + forbidden_phrases = ( + "stable identifier for the case", + "stable id for the case", + "unique id for the case", + ) + for phrase in forbidden_phrases: + assert phrase not in pre_side_info, ( + f"static template still inline-describes a self-evident field " + f"via phrase {phrase!r}; GEPA's auto-rendered ``## case_id`` " + f"header already conveys this — remove the narration" + ) + + +def test_reflection_prompt_template_documents_score_aggregate_range(): + """``score`` is the case-level aggregate on [0, 1] — not a per-metric + score and not the threshold. The template must clarify this so the LM + does not confuse the case score with the per-metric scores inside the + Verdict lines.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + pre_side_info = template.split("", 1)[0] + assert "[0, 1]" in pre_side_info + assert "case-level" in pre_side_info.lower() or "case level" in pre_side_info.lower() + + +def test_reflection_prompt_template_documents_case_body_turn_layout(): + """``Case Body`` is a free-text markdown string; GEPA dumps it as-is. 
+ The static template must spell out the ``### Turn N`` header layout, + the ``**User**``/``**Expected**``/``**Agent Response**``/``**Verdict**`` + field markers, and the per-metric line grammar — otherwise the LM has + to reverse-engineer the convention from raw text.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + assert "### Turn N" in template + assert "**User**" in template + assert "**Expected**" in template + assert "**Agent Response**" in template + assert "**Verdict**" in template + assert "[PASSED|FAILED]" in template + assert "threshold=" in template + assert "rubric[" in template + + +def test_reflection_prompt_template_documents_multi_run_nested_run_blocks(): + """Multi-run cases nest ``#### Run N`` inside each turn; the template + must announce this layout up front so the LM knows variance is + attributable per run rather than averaged out.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + assert "#### Run" in template + assert "num_runs" in template.lower() or "multi-run" in template.lower() + + +def test_reflection_prompt_template_documents_tool_trace_line_grammar(): + """``Tool Trace`` lines are rendered inline (``func(arg=val) → result + [id=...]``) instead of nested dict headers — the template must + document the line grammar because GEPA's renderer cannot infer it.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + assert "Tool Trace" in template + # The line skeleton must be visible so the LM knows how to parse it. + assert "fn_name" in template or "" in template + assert "→" in template + assert "[id=" in template + + +def test_reflection_prompt_template_documents_other_active_components_semantics(): + """``Other Active Components`` is the cross-component context: every + OTHER prompt's current body, present iff the candidate has more than + one prompt. 
The template must explain that: + - the LM only sees the target prompt at the top of the message + - the verdict came from ALL prompts running together + so the LM uses these contents to avoid duplication and contradiction. + The template must NOT mention ```` by name because GEPA's + prompt_renderer substitutes that placeholder everywhere it appears + in the template, leaking the prompt text into the documentation.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + pre_side_info = template.split("", 1)[0] + assert "Other Active Components" in pre_side_info + # Regression guard: never document ```` by name in the + # static template, otherwise it gets substituted into garbage. + assert "" not in pre_side_info.replace( + "```\n\n```", "" + ).replace("", "", 1) or True # placeholder usage is fine + # The actual regression assertion: the substring shouldn't appear + # twice in the pre-side-info region (once for placeholder, never in narration). + assert pre_side_info.count("") == 1, ( + "```` should appear exactly once in the template " + "(the placeholder itself); referencing it in narration causes " + "GEPA's prompt_renderer to leak the prompt text into the docs." + ) + # The cross-component intent must surface, regardless of exact wording. + lowered = pre_side_info.lower() + assert ( + "avoid restating" in lowered + or "avoid contradicting" in lowered + or "all prompts" in lowered + ) + + +def test_reflection_prompt_template_warns_against_regressing_passing_metrics(): + """A rewrite that fixes a FAILING metric but regresses a PASSING one + is a regression, not progress. 
The template must surface this rule so + the LM treats PASSING metrics as hard constraints, not noise.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + lowered = template.lower() + assert ( + "passing metrics stay passing" in lowered + or "passing metrics as constraints" in lowered + or "regressing a passing" in lowered + ) + + +def test_reflection_prompt_template_documents_history_top_k() -> None: + """The reflection LM must be told how to read history_top_k.""" + from trpc_agent_sdk.evaluation._optimize_metric_info import build_reflection_prompt_template + + template = build_reflection_prompt_template("") + + assert "## history_top_k" in template or "``## history_top_k``" in template + assert "preserve" in template.lower() or "anchor" in template.lower() + + +def test_reflection_prompt_template_explains_history_top_k_is_optional() -> None: + from trpc_agent_sdk.evaluation._optimize_metric_info import build_reflection_prompt_template + + template = build_reflection_prompt_template("") + + assert "present iff" in template or "optional" in template.lower() diff --git a/tests/evaluation/test_optimize_model_callable.py b/tests/evaluation/test_optimize_model_callable.py new file mode 100644 index 00000000..ece86223 --- /dev/null +++ b/tests/evaluation/test_optimize_model_callable.py @@ -0,0 +1,261 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Tests for _OptimizeModelCallable (gepa-compatible LanguageModel wrapper).""" + +from __future__ import annotations + +import inspect +from unittest.mock import MagicMock + +import pytest + +from trpc_agent_sdk.evaluation._optimize_model_callable import _OptimizeModelCallable +from trpc_agent_sdk.evaluation._optimize_model_callable import _build_optimize_generation_config +from trpc_agent_sdk.evaluation._optimize_model_callable import _create_optimize_model +from trpc_agent_sdk.evaluation._optimize_model_callable import _extract_final_text +from trpc_agent_sdk.evaluation._optimize_model_callable import _flatten_messages +from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions + + +def _make_opts(**overrides) -> OptimizeModelOptions: + defaults = { + "provider_name": "openai", + "model_name": "gpt-4o", + "api_key": "test-key", + "base_url": "https://api.example.com", + "generation_config": {"temperature": 0.2, "max_tokens": 100}, + } + defaults.update(overrides) + return OptimizeModelOptions(**defaults) + + +def _stub_event(text: str): + event = MagicMock() + event.is_final_response.return_value = True + part = MagicMock() + part.text = text + part.thought = False + event.content = MagicMock() + event.content.parts = [part] + return event + + +def _install_fake_run_async(instance: _OptimizeModelCallable, return_text: str) -> list[str]: + """Replace ``_run_async`` and record the flattened user_text it received. + + The bound method swap isolates tests from LlmAgent / InvocationContext setup + while still exercising ``_flatten_messages`` via the public ``__call__`` path. 
+ """ + seen: list[str] = [] + + async def fake_run_async(user_text: str) -> str: + seen.append(user_text) + return return_text + + instance._run_async = fake_run_async # type: ignore[method-assign] + return seen + + +def test_flatten_messages_passes_through_string(): + assert _flatten_messages("hello") == "hello" + + +def test_flatten_messages_concatenates_dict_list(): + out = _flatten_messages( + [ + {"role": "system", "content": "you are helpful"}, + {"role": "user", "content": "say hi"}, + ] + ) + assert "you are helpful" in out + assert "say hi" in out + assert "[system]" in out + assert "[user]" in out + + +def test_flatten_messages_handles_content_list_parts(): + out = _flatten_messages( + [{"role": "user", "content": [{"text": "first"}, {"text": "second"}]}] + ) + assert "first" in out + assert "second" in out + + +def test_create_optimize_model_with_openai_provider(): + model = _create_optimize_model(_make_opts(provider_name="openai")) + assert model is not None + assert type(model).__name__ == "OpenAIModel" + + +def test_create_optimize_model_with_empty_provider_uses_openai(): + model = _create_optimize_model(_make_opts(provider_name="")) + assert type(model).__name__ == "OpenAIModel" + + +def test_build_generation_config_returns_tuple_with_thinking_none(): + cfg, thinking_config = _build_optimize_generation_config(_make_opts()) + assert cfg is not None + assert cfg.temperature == 0.2 + assert cfg.max_output_tokens == 100 + assert thinking_config is None + + +def test_build_generation_config_with_think_true_returns_thinking_config(): + opts = _make_opts(think=True) + cfg, thinking_config = _build_optimize_generation_config(opts) + assert thinking_config is not None + assert thinking_config.include_thoughts is True + + +def test_build_generation_config_with_think_false_returns_disabled_thinking(): + opts = _make_opts(think=False) + cfg, thinking_config = _build_optimize_generation_config(opts) + assert thinking_config is not None + assert 
thinking_config.include_thoughts is False + assert thinking_config.thinking_budget == 0 + + +def test_build_generation_config_uses_defaults_when_generation_config_missing(): + opts = OptimizeModelOptions(model_name="m", api_key="k") + cfg, _ = _build_optimize_generation_config(opts) + assert cfg.max_output_tokens == 4096 + assert cfg.temperature == 0.8 + + +def test_callable_constructor_initialises_total_cost_to_zero(): + instance = _OptimizeModelCallable(_make_opts()) + assert instance.total_cost == 0.0 + + +def test_callable_constructor_initialises_total_calls_to_zero(): + instance = _OptimizeModelCallable(_make_opts()) + assert instance.total_calls == 0 + assert instance.total_token_usage == {"prompt": 0, "completion": 0, "total": 0} + + +def test_callable_increments_total_calls_on_each_invocation(): + instance = _OptimizeModelCallable(_make_opts()) + _install_fake_run_async(instance, "reply") + instance("p1") + instance("p2") + instance("p3") + assert instance.total_calls == 3 + + +def test_callable_accumulate_usage_handles_google_style_attrs(): + instance = _OptimizeModelCallable(_make_opts()) + + class _U: + prompt_token_count = 100 + candidates_token_count = 50 + total_token_count = 150 + + instance._accumulate_usage(_U()) + assert instance.total_token_usage == {"prompt": 100, "completion": 50, "total": 150} + + +def test_callable_accumulate_usage_handles_openai_style_dict(): + instance = _OptimizeModelCallable(_make_opts()) + instance._accumulate_usage({"prompt_tokens": 20, "completion_tokens": 10, "total_tokens": 30}) + instance._accumulate_usage({"prompt_tokens": 5, "completion_tokens": 4, "total_tokens": 9}) + assert instance.total_token_usage == {"prompt": 25, "completion": 14, "total": 39} + + +def test_callable_accumulate_usage_computes_total_when_missing(): + instance = _OptimizeModelCallable(_make_opts()) + instance._accumulate_usage({"prompt_tokens": 7, "completion_tokens": 3}) + assert instance.total_token_usage == {"prompt": 7, "completion": 3, 
"total": 10} + + +def test_callable_exposes_languagemodel_protocol_surface(): + instance = _OptimizeModelCallable(_make_opts()) + assert callable(instance) + assert hasattr(instance, "total_cost") + assert isinstance(instance.total_cost, float) + + +def test_callable_invokes_agent_with_string_prompt(): + instance = _OptimizeModelCallable(_make_opts()) + seen = _install_fake_run_async(instance, "reply text") + result = instance("any prompt") + assert result == "reply text" + assert seen == ["any prompt"] + + +def test_callable_handles_messages_list_prompt(): + instance = _OptimizeModelCallable(_make_opts()) + seen = _install_fake_run_async(instance, "ok") + result = instance( + [ + {"role": "system", "content": "be helpful"}, + {"role": "user", "content": "hi"}, + ] + ) + assert result == "ok" + assert len(seen) == 1 + flattened = seen[0] + assert "be helpful" in flattened + assert "hi" in flattened + assert "[system]" in flattened + assert "[user]" in flattened + + +def test_callable_run_async_is_coroutine_function(): + instance = _OptimizeModelCallable(_make_opts()) + assert inspect.iscoroutinefunction(instance._run_async) + + +def test_extract_final_text_returns_empty_for_non_final_event(): + event = MagicMock() + event.is_final_response.return_value = False + assert _extract_final_text(event) == "" + + +def test_extract_final_text_returns_empty_when_no_content(): + event = MagicMock() + event.is_final_response.return_value = True + event.content = None + assert _extract_final_text(event) == "" + + +def test_extract_final_text_returns_empty_when_parts_missing(): + event = MagicMock() + event.is_final_response.return_value = True + event.content = MagicMock() + event.content.parts = [] + assert _extract_final_text(event) == "" + + +def test_extract_final_text_skips_thought_parts(): + event = MagicMock() + event.is_final_response.return_value = True + thought = MagicMock() + thought.text = "internal monologue" + thought.thought = True + actual = MagicMock() + 
actual.text = "user-visible" + actual.thought = False + event.content = MagicMock() + event.content.parts = [thought, actual] + result = _extract_final_text(event) + assert "internal monologue" not in result + assert "user-visible" in result + + +def test_extract_final_text_joins_multiple_non_thought_parts(): + event = MagicMock() + event.is_final_response.return_value = True + a = MagicMock() + a.text = "first" + a.thought = False + b = MagicMock() + b.text = "second" + b.thought = False + event.content = MagicMock() + event.content.parts = [a, b] + result = _extract_final_text(event) + assert "first" in result + assert "second" in result diff --git a/tests/evaluation/test_optimize_model_options.py b/tests/evaluation/test_optimize_model_options.py new file mode 100644 index 00000000..28317ca5 --- /dev/null +++ b/tests/evaluation/test_optimize_model_options.py @@ -0,0 +1,113 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Tests for OptimizeModelOptions.""" + +from __future__ import annotations + +import ast +from pathlib import Path + +import pytest + +from trpc_agent_sdk.evaluation._llm_criterion import JudgeModelOptions +from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions + + +_MODULE_PATH = ( + Path(__file__).resolve().parents[2] + / "trpc_agent_sdk" + / "evaluation" + / "_optimize_model_options.py" +) + + +def test_default_construction_matches_judge_default_field_values(): + opt = OptimizeModelOptions() + judge = JudgeModelOptions() + expected = { + "provider_name": judge.provider_name, + "model_name": judge.model_name, + "variant": judge.variant, + "base_url": judge.base_url, + "api_key": judge.api_key, + "extra_fields": judge.extra_fields, + "num_samples": judge.num_samples, + "generation_config": judge.generation_config, + "weight": judge.weight, + "think": judge.think, + } + actual = {key: getattr(opt, key) for key in expected} + assert actual == expected + + +def test_field_set_mirrors_judge_field_set(): + optimize_fields = set(OptimizeModelOptions.model_fields.keys()) + judge_fields = set(JudgeModelOptions.model_fields.keys()) + assert optimize_fields == judge_fields, ( + f"OptimizeModelOptions / JudgeModelOptions field set drift: " + f"only in optimize={optimize_fields - judge_fields}, " + f"only in judge={judge_fields - optimize_fields}" + ) + + +def test_is_distinct_class_not_judge_subclass(): + assert OptimizeModelOptions is not JudgeModelOptions + assert not issubclass(OptimizeModelOptions, JudgeModelOptions) + assert not issubclass(JudgeModelOptions, OptimizeModelOptions) + + +def test_module_file_has_no_import_of_llm_criterion(): + source = _MODULE_PATH.read_text(encoding="utf-8") + tree = ast.parse(source) + for node in ast.walk(tree): + if isinstance(node, ast.ImportFrom): + module = node.module or "" + assert "_llm_criterion" not in module, ( + f"_optimize_model_options.py must not import from {module!r}" + ) + if 
isinstance(node, ast.Import): + for alias in node.names: + assert "_llm_criterion" not in alias.name, ( + f"_optimize_model_options.py must not import {alias.name!r}" + ) + + +def test_json_serialization_uses_camel_alias_like_judge(): + opt = OptimizeModelOptions(model_name="gpt-4o", api_key="k", weight=0.5) + dumped = opt.model_dump(by_alias=True) + assert dumped["modelName"] == "gpt-4o" + assert dumped["apiKey"] == "k" + assert dumped["weight"] == 0.5 + + +def test_construction_accepts_full_field_set(): + opt = OptimizeModelOptions( + provider_name="openai", + model_name="gpt-4o", + variant="responses", + base_url="https://api.example.com", + api_key="sk-abc", + extra_fields={"x": 1}, + num_samples=3, + generation_config={"temperature": 0.2, "max_tokens": 1024}, + weight=0.7, + think=True, + ) + assert opt.provider_name == "openai" + assert opt.model_name == "gpt-4o" + assert opt.variant == "responses" + assert opt.base_url == "https://api.example.com" + assert opt.api_key == "sk-abc" + assert opt.extra_fields == {"x": 1} + assert opt.num_samples == 3 + assert opt.generation_config == {"temperature": 0.2, "max_tokens": 1024} + assert opt.weight == 0.7 + assert opt.think is True + + +def test_extra_fields_rejected_consistent_with_eval_base_model(): + with pytest.raises(Exception): + OptimizeModelOptions(unknown_extra_field="oops") diff --git a/tests/evaluation/test_optimize_quickstart_example.py b/tests/evaluation/test_optimize_quickstart_example.py new file mode 100644 index 00000000..404db819 --- /dev/null +++ b/tests/evaluation/test_optimize_quickstart_example.py @@ -0,0 +1,489 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Smoke tests for ``examples/optimization/quickstart``. 
+ +Goals: + * import the quickstart's ``agent`` package and ``run_optimization`` script + without side effects + * verify env-variable validation in ``agent.config.get_model_config`` + * verify ``agent.create_agent`` reads its instruction from + ``agent/prompts/system.md`` and ``agent/prompts/skill.md`` + * verify the script-level ``call_agent`` is async and exposes a single + ``query`` parameter (the contract the optimizer relies on) + * verify the quickstart's ``optimizer.json`` is a valid + ``OptimizeConfigFile`` and exercises the multi-metric scenario + * verify the end-to-end optimize flow wires together when the reflection + LLM, the gepa main loop, and the LLM judge are all mocked out + +The quickstart's ``agent`` and ``run_optimization`` are loaded by absolute path +because they live outside the python package tree. +""" + +from __future__ import annotations + +import importlib.util +import inspect +import sys +from pathlib import Path +from typing import Any + +import pytest + + +# --------------------------------------------------------------------------- +# Loader helpers (import quickstart files by path without polluting sys.modules) +# --------------------------------------------------------------------------- + + +_QUICKSTART_DIR = ( + Path(__file__).resolve().parents[2] + / "examples" + / "optimization" + / "quickstart" +) + + +def _load_quickstart_agent() -> Any: + """Import ``agent.agent`` from the quickstart example directory.""" + if str(_QUICKSTART_DIR) not in sys.path: + sys.path.insert(0, str(_QUICKSTART_DIR)) + if "agent" in sys.modules: + # ensure we always reimport against the freshly mutated env + for name in [k for k in sys.modules if k == "agent" or k.startswith("agent.")]: + sys.modules.pop(name, None) + import agent.agent as agent_mod # type: ignore + return agent_mod + + +def _load_quickstart_run_module() -> Any: + """Load ``run_optimization.py`` as an importable module without executing main().""" + if str(_QUICKSTART_DIR) not in 
sys.path: + sys.path.insert(0, str(_QUICKSTART_DIR)) + spec = importlib.util.spec_from_file_location( + "quickstart_run_optimization", + _QUICKSTART_DIR / "run_optimization.py", + ) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +@pytest.fixture +def fake_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("TRPC_AGENT_API_KEY", "fake-key") + monkeypatch.setenv("TRPC_AGENT_BASE_URL", "http://localhost/fake") + monkeypatch.setenv("TRPC_AGENT_MODEL_NAME", "fake-model") + + +# --------------------------------------------------------------------------- +# Structure / contract +# --------------------------------------------------------------------------- + + +def test_quickstart_directory_layout_matches_expected_structure(): + expected = { + "agent/__init__.py", + "agent/agent.py", + "agent/config.py", + "agent/prompts/system.md", + "agent/prompts/skill.md", + "optimizer.json", + "train.evalset.json", + "val.evalset.json", + "run_optimization.py", + } + for rel in expected: + path = _QUICKSTART_DIR / rel + assert path.exists(), f"missing quickstart file: {rel}" + + +def test_prompt_files_are_non_empty_markdown_files(): + for rel in ("agent/prompts/system.md", "agent/prompts/skill.md"): + text = (_QUICKSTART_DIR / rel).read_text(encoding="utf-8") + assert text.strip(), f"{rel} must not be empty" + + +def test_optimizer_json_declares_multi_metric_and_multi_prompt_setup(): + """The quickstart must showcase a multi-metric configuration so users see + the reporter handle the multi-metric scenario end to end. 
The judge LLM + metric (``llm_rubric_response``) must carry a populated rubrics list.""" + import json + payload = json.loads((_QUICKSTART_DIR / "optimizer.json").read_text(encoding="utf-8")) + metrics = payload["evaluate"]["metrics"] + assert len(metrics) >= 2, "quickstart should configure 2+ metrics" + names = {m["metric_name"] for m in metrics} + assert "final_response_avg_score" in names + assert "llm_rubric_response" in names + judge_metric = next(m for m in metrics if m["metric_name"] == "llm_rubric_response") + judge_cfg = judge_metric["criterion"]["llm_judge"] + assert judge_cfg.get("judge_model"), "llm_rubric_response must configure judge_model" + rubrics = judge_cfg.get("rubrics") or [] + assert len(rubrics) >= 2, "llm_rubric_response must list at least 2 rubrics" + + +def test_optimizer_json_validates_against_optimize_config_file(): + """Schema-level smoke: the example config must load cleanly via the SDK's + public loader so any breaking schema change surfaces here.""" + from trpc_agent_sdk.evaluation._optimize_config import load_optimize_config + + cfg = load_optimize_config(str(_QUICKSTART_DIR / "optimizer.json")) + metric_names = {m.metric_name for m in cfg.evaluate.get_eval_metrics()} + assert metric_names == {"final_response_avg_score", "llm_rubric_response"} + # Framework-level stop policy defaults to "all" via the example. 
+ assert cfg.optimize.stop.required_metrics == "all" + + +# --------------------------------------------------------------------------- +# agent.config: environment-variable validation +# --------------------------------------------------------------------------- + + +def test_get_model_config_raises_when_env_missing(monkeypatch: pytest.MonkeyPatch): + monkeypatch.delenv("TRPC_AGENT_API_KEY", raising=False) + monkeypatch.delenv("TRPC_AGENT_BASE_URL", raising=False) + monkeypatch.delenv("TRPC_AGENT_MODEL_NAME", raising=False) + agent_mod = _load_quickstart_agent() + with pytest.raises(ValueError) as exc_info: + agent_mod.get_model_config() + msg = str(exc_info.value) + assert "TRPC_AGENT_API_KEY" in msg + assert "TRPC_AGENT_BASE_URL" in msg + assert "TRPC_AGENT_MODEL_NAME" in msg + + +def test_get_model_config_returns_tuple_when_env_set(fake_env: None): + agent_mod = _load_quickstart_agent() + api_key, base_url, model_name = agent_mod.get_model_config() + assert api_key == "fake-key" + assert base_url == "http://localhost/fake" + assert model_name == "fake-model" + + +# --------------------------------------------------------------------------- +# agent.agent: LlmAgent factory +# --------------------------------------------------------------------------- + + +def test_create_agent_composes_instruction_from_both_prompt_files(fake_env: None): + agent_mod = _load_quickstart_agent() + from trpc_agent_sdk.agents import LlmAgent + + agent_instance = agent_mod.create_agent() + system_text = agent_mod.SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip() + skill_text = agent_mod.SKILL_PATH.read_text(encoding="utf-8").strip() + assert isinstance(agent_instance, LlmAgent) + assert system_text in agent_instance.instruction + assert skill_text in agent_instance.instruction + assert agent_instance.name == "math_word_problem_agent" + + +def test_create_agent_picks_up_latest_prompt_text(fake_env: None): + """Optimizer-flow sanity: rewriting any of the prompt files must be + 
visible to the next agent.""" + agent_mod = _load_quickstart_agent() + original_system = agent_mod.SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + original_skill = agent_mod.SKILL_PATH.read_text(encoding="utf-8") + try: + agent_mod.SYSTEM_PROMPT_PATH.write_text("UPDATED SYSTEM", encoding="utf-8") + agent_mod.SKILL_PATH.write_text("UPDATED SKILL", encoding="utf-8") + new_agent = agent_mod.create_agent() + assert "UPDATED SYSTEM" in new_agent.instruction + assert "UPDATED SKILL" in new_agent.instruction + finally: + agent_mod.SYSTEM_PROMPT_PATH.write_text(original_system, encoding="utf-8") + agent_mod.SKILL_PATH.write_text(original_skill, encoding="utf-8") + + +# --------------------------------------------------------------------------- +# run_optimization.py: call_agent contract +# --------------------------------------------------------------------------- + + +def test_run_optimization_module_exposes_async_call_agent(fake_env: None): + module = _load_quickstart_run_module() + assert inspect.iscoroutinefunction(module.call_agent), ( + "AgentOptimizer requires call_agent to be an async callable" + ) + sig = inspect.signature(module.call_agent) + params = list(sig.parameters.values()) + assert len(params) == 1 + assert params[0].name == "query" + + +def test_run_optimization_uses_runner_and_inmemory_session_service(fake_env: None): + """The example must build call_agent on top of framework primitives.""" + module = _load_quickstart_run_module() + src = (_QUICKSTART_DIR / "run_optimization.py").read_text(encoding="utf-8") + assert "from trpc_agent_sdk.runners import Runner" in src + assert "from trpc_agent_sdk.sessions import InMemorySessionService" in src + assert "AgentOptimizer.optimize" in src + assert "TargetPrompt" in src + assert hasattr(module, "main") + assert inspect.iscoroutinefunction(module.main) + + +# --------------------------------------------------------------------------- +# End-to-end wiring: optimizer flow with mocked gepa + mocked LLM judge +# 
--------------------------------------------------------------------------- + + +class _FakeGEPAResult: + def __init__(self, candidates: list[dict], val_scores: list[float]) -> None: + self.candidates = candidates + self.val_aggregate_scores = val_scores + self.parents = [[None]] + [[i - 1] for i in range(1, len(candidates))] + self.discovery_eval_counts = [0] * len(candidates) + self.total_metric_calls = 0 + self.best_outputs_valset = None + + @property + def best_idx(self) -> int: + return max( + range(len(self.val_aggregate_scores)), + key=lambda i: self.val_aggregate_scores[i], + ) + + +@pytest.mark.asyncio +async def test_quickstart_optimize_flow_runs_with_mocked_llm( + tmp_path: Path, + fake_env: None, + monkeypatch: pytest.MonkeyPatch, +): + """Full wiring: AgentOptimizer.optimize → adapter.evaluate → call_agent stub + → mocked gepa → mocked LLM judge → SUCCEEDED OptimizeResult. + + Real LLM calls (reflection_lm + judge_model) are short-circuited so the + test only exercises the framework's plumbing. + """ + from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt + from trpc_agent_sdk.evaluation._eval_metrics import EvalStatus + from trpc_agent_sdk.evaluation._llm_judge import LLMJudge + from trpc_agent_sdk.evaluation._optimize_gepa_reflective import ( + GepaReflectiveOptimizer, + ) + + agent_mod = _load_quickstart_agent() + original_system = agent_mod.SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + original_skill = agent_mod.SKILL_PATH.read_text(encoding="utf-8") + + # ``stub_call_agent`` returns a string that contains every reference answer + # from train + val, so the ``contains``-based ``final_response_avg_score`` + # accepts every case (baseline_pass_rate is independently zeroed below by + # the gepa stub returning a single seed candidate that passes too). 
+ expected_answers = [ + "答案:11 个", + "答案:150 公里", + "答案:160 元", + "答案:40 个", + "答案:3.5 千克", + "答案:18 人", + ] + + async def stub_call_agent(query: str) -> str: + return " | ".join(expected_answers) + + async def fake_judge_evaluate(self, actual_invocations, expected_invocations): + """Return a perfect EvaluationResult so llm_rubric_response is always + PASSED without touching a real judge model.""" + from trpc_agent_sdk.evaluation._eval_result import EvaluationResult + from trpc_agent_sdk.evaluation._eval_result import PerInvocationResult + + per_invocation_results = [ + PerInvocationResult( + actual_invocation=actual, + expected_invocation=expected, + score=1.0, + eval_status=EvalStatus.PASSED, + ) + for actual, expected in zip(actual_invocations, expected_invocations) + ] + return EvaluationResult( + overall_score=1.0, + overall_eval_status=EvalStatus.PASSED, + per_invocation_results=per_invocation_results, + ) + + monkeypatch.setattr(LLMJudge, "evaluate", fake_judge_evaluate) + + async def fake_call_gepa(self, **kwargs): + seed = kwargs["seed_candidate"] + improved = dict(seed) + for key in improved: + improved[key] = improved[key] + "\n\nIMPROVED" + return _FakeGEPAResult( + candidates=[seed, improved], + val_scores=[0.0, 1.0], + ) + + monkeypatch.setattr( + GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa + ) + + try: + target = ( + TargetPrompt() + .add_path("system_prompt", str(agent_mod.SYSTEM_PROMPT_PATH)) + .add_path("skill", str(agent_mod.SKILL_PATH)) + ) + result = await AgentOptimizer.optimize( + config_path=str(_QUICKSTART_DIR / "optimizer.json"), + call_agent=stub_call_agent, + target_prompt=target, + train_dataset_path=str(_QUICKSTART_DIR / "train.evalset.json"), + validation_dataset_path=str(_QUICKSTART_DIR / "val.evalset.json"), + output_dir=str(tmp_path / "quickstart_runs"), + verbose=0, + ) + + assert result.status == "SUCCEEDED" + assert result.algorithm == "gepa_reflective" + assert result.best_pass_rate == pytest.approx(1.0) + # 
default update_source=False keeps both sources untouched + assert ( + agent_mod.SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") == original_system + ) + assert ( + agent_mod.SKILL_PATH.read_text(encoding="utf-8") == original_skill + ) + # Both registered prompts are present in best_prompts and were rewritten. + assert set(result.best_prompts.keys()) == {"system_prompt", "skill"} + assert "IMPROVED" in result.best_prompts["system_prompt"] + assert "IMPROVED" in result.best_prompts["skill"] + # Artifacts include both best_prompts files (multi-prompt scenario). + best_dir = tmp_path / "quickstart_runs" / "best_prompts" + assert (best_dir / "system_prompt.md").is_file() + assert (best_dir / "skill.md").is_file() + finally: + agent_mod.SYSTEM_PROMPT_PATH.write_text(original_system, encoding="utf-8") + agent_mod.SKILL_PATH.write_text(original_skill, encoding="utf-8") + + +# --------------------------------------------------------------------------- +# CONC-2 fix: real gepa main loop drives adapter.evaluate multiple times, +# verifying the long-lived event loop is shared across rounds and that +# module-level async resources held by call_agent stay valid. +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_quickstart_real_gepa_loop_reuses_single_event_loop_across_rounds( + tmp_path: Path, + fake_env: None, + monkeypatch: pytest.MonkeyPatch, +): + """Real gepa.optimize drives adapter.evaluate multiple times. 
The + adapter's long-lived event loop must be reused across every evaluate + so call_agent can hold module-level async resources safely.""" + import asyncio + import json + + from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt + from trpc_agent_sdk.evaluation._eval_metrics import EvalStatus + from trpc_agent_sdk.evaluation._llm_judge import LLMJudge + from trpc_agent_sdk.evaluation._optimize_model_callable import ( + _OptimizeModelCallable, + ) + + agent_mod = _load_quickstart_agent() + original_system = agent_mod.SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + original_skill = agent_mod.SKILL_PATH.read_text(encoding="utf-8") + + # Track the running loop id every time call_agent fires; must stay + # constant across all evaluate() invocations. + seen_loop_ids: list[int] = [] + + expected_answers = [ + "答案:11 个", "答案:150 公里", "答案:160 元", + "答案:40 个", "答案:3.5 千克", "答案:18 人", + ] + + async def stub_call_agent(query: str) -> str: + seen_loop_ids.append(id(asyncio.get_running_loop())) + return " | ".join(expected_answers) + + # Make the LLM judge always pass. + async def fake_judge_evaluate(self, actual_invocations, expected_invocations): + from trpc_agent_sdk.evaluation._eval_result import ( + EvaluationResult, + PerInvocationResult, + ) + return EvaluationResult( + overall_score=1.0, + overall_eval_status=EvalStatus.PASSED, + per_invocation_results=[ + PerInvocationResult( + actual_invocation=a, + expected_invocation=e, + score=1.0, + eval_status=EvalStatus.PASSED, + ) + for a, e in zip(actual_invocations, expected_invocations) + ], + ) + + monkeypatch.setattr(LLMJudge, "evaluate", fake_judge_evaluate) + + # Stub reflection LM so gepa main loop doesn't hit a real backend. + # Returns the candidate's instruction with a marker appended each time. 
+ rewrite_count = {"n": 0} + + def fake_reflection_call(self, prompt): + rewrite_count["n"] += 1 + self.total_calls += 1 + return f"REWRITE_v{rewrite_count['n']}" + + monkeypatch.setattr(_OptimizeModelCallable, "__call__", fake_reflection_call) + + # Use a tiny budget so the run finishes quickly but still exercises + # at least baseline + 1 round of adapter.evaluate (=2 evaluate calls + # minimum, in practice baseline + minibatch_eval + valset_eval per + # round = 3+ evaluate calls). + config_path = tmp_path / "optimizer.json" + config_payload = json.loads( + (_QUICKSTART_DIR / "optimizer.json").read_text(encoding="utf-8") + ) + config_payload["optimize"]["algorithm"]["max_metric_calls"] = 6 + config_payload["optimize"]["algorithm"]["reflection_minibatch_size"] = 1 + config_payload["optimize"]["algorithm"]["max_iterations_without_improvement"] = 1 + config_path.write_text(json.dumps(config_payload), encoding="utf-8") + + try: + target = ( + TargetPrompt() + .add_path("system_prompt", str(agent_mod.SYSTEM_PROMPT_PATH)) + .add_path("skill", str(agent_mod.SKILL_PATH)) + ) + result = await AgentOptimizer.optimize( + config_path=str(config_path), + call_agent=stub_call_agent, + target_prompt=target, + train_dataset_path=str(_QUICKSTART_DIR / "train.evalset.json"), + validation_dataset_path=str(_QUICKSTART_DIR / "val.evalset.json"), + output_dir=str(tmp_path / "real_gepa_runs"), + verbose=0, + ) + finally: + agent_mod.SYSTEM_PROMPT_PATH.write_text(original_system, encoding="utf-8") + agent_mod.SKILL_PATH.write_text(original_skill, encoding="utf-8") + + # Real gepa drove adapter.evaluate at least twice (baseline + round 1). + assert len(seen_loop_ids) >= 2, ( + f"Expected real gepa main loop to call call_agent more than once; " + f"saw {len(seen_loop_ids)} call(s)." + ) + + # All call_agent invocations across the entire optimize() ran on the + # same long-lived event loop (CONC-2 fix). 
+ assert len(set(seen_loop_ids)) == 1, ( + f"call_agent ran on multiple distinct loops across rounds: " + f"{set(seen_loop_ids)}. Module-level async resources would break." + ) + + # OptimizeResult is well-formed. + assert result.status in {"SUCCEEDED", "FAILED"} + assert result.algorithm == "gepa_reflective" diff --git a/tests/evaluation/test_optimize_registry.py b/tests/evaluation/test_optimize_registry.py new file mode 100644 index 00000000..ee1938c0 --- /dev/null +++ b/tests/evaluation/test_optimize_registry.py @@ -0,0 +1,109 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for OptimizerRegistry.""" + +from __future__ import annotations + +import pytest + +from trpc_agent_sdk.evaluation._base_optimizer import BaseOptimizer +from trpc_agent_sdk.evaluation._optimize_registry import OPTIMIZER_REGISTRY +from trpc_agent_sdk.evaluation._optimize_registry import OptimizerRegistry +from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult + + +def _dummy_result() -> OptimizeResult: + return OptimizeResult( + algorithm="fake", + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.0, + best_pass_rate=0.0, + pass_rate_improvement=0.0, + total_rounds=0, + total_reflection_lm_calls=0, + total_judge_model_calls=0, + duration_seconds=0.0, + started_at="1970-01-01T00:00:00Z", + finished_at="1970-01-01T00:00:00Z", + ) + + +class _FakeOptimizerA(BaseOptimizer): + async def run(self) -> OptimizeResult: + return _dummy_result() + + +class _FakeOptimizerB(BaseOptimizer): + async def run(self) -> OptimizeResult: + return _dummy_result() + + +def test_empty_registry_lists_nothing(): + registry = OptimizerRegistry() + assert registry.list_registered() == [] + + +def test_register_and_get_returns_class(): + registry = OptimizerRegistry() + registry.register("fake_a", _FakeOptimizerA) + 
assert registry.get("fake_a") is _FakeOptimizerA + + +def test_list_registered_is_sorted(): + registry = OptimizerRegistry() + registry.register("zzz", _FakeOptimizerA) + registry.register("aaa", _FakeOptimizerB) + assert registry.list_registered() == ["aaa", "zzz"] + + +def test_register_overwrites_existing_name(): + registry = OptimizerRegistry() + registry.register("dup", _FakeOptimizerA) + registry.register("dup", _FakeOptimizerB) + assert registry.get("dup") is _FakeOptimizerB + + +def test_get_unknown_algorithm_raises_valueerror_with_available_list(): + registry = OptimizerRegistry() + registry.register("fake_a", _FakeOptimizerA) + with pytest.raises(ValueError) as exc_info: + registry.get("unknown_algo") + msg = str(exc_info.value) + assert "unknown_algo" in msg + assert "fake_a" in msg + + +def test_get_on_empty_registry_lists_empty_available(): + registry = OptimizerRegistry() + with pytest.raises(ValueError) as exc_info: + registry.get("anything") + assert "anything" in str(exc_info.value) + + +def test_register_rejects_non_basoptimizer_subclass(): + registry = OptimizerRegistry() + + class _NotAnOptimizer: + pass + + with pytest.raises(TypeError): + registry.register("bad", _NotAnOptimizer) + + +def test_module_level_singleton_is_optimizer_registry_instance(): + assert isinstance(OPTIMIZER_REGISTRY, OptimizerRegistry) + + +def test_module_level_singleton_contains_registered_algorithms(): + """Importing the evaluation package registers all available algorithms. + + The exact list grows over time, but ``gepa_reflective`` is the v1 baseline + contract: any algorithm whose optional dependencies are installed and whose + module is registered in ``_optimize_registrations`` must be reachable via + ``OPTIMIZER_REGISTRY.get(name)``. 
+ """ + assert "gepa_reflective" in OPTIMIZER_REGISTRY.list_registered() diff --git a/tests/evaluation/test_optimize_reporter.py b/tests/evaluation/test_optimize_reporter.py new file mode 100644 index 00000000..00b1d2b1 --- /dev/null +++ b/tests/evaluation/test_optimize_reporter.py @@ -0,0 +1,611 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for OptimizeReporter progress sinks (Null / Rich / ASCII).""" + +from __future__ import annotations + +import io +import logging +from typing import Any + +import pytest + +from trpc_agent_sdk.evaluation._optimize_reporter import ( + RoundView, + RunHeader, + create_reporter, + _AsciiReporter, + _NullReporter, + _SilentGepaLogger, +) + + +def _header(**overrides: Any) -> RunHeader: + defaults = dict( + algorithm="gepa_reflective", + target_fields=[("instruction", "agent/prompts/system.md")], + train_size=5, + val_size=3, + metric_names=["final_response_avg_score"], + output_dir="runs/2026-05-17T16-30-00", + ) + defaults.update(overrides) + return RunHeader(**defaults) + + +def _round_view(**overrides: Any) -> RoundView: + defaults = dict( + round=1, + kind="reflective", + train_minibatch_size=2, + train_size=5, + train_subsample_parent_score=0.0, + train_subsample_candidate_score=1.0, + val_pass_rate=1.0, + accepted=True, + skip_reason=None, + error_message=None, + duration_seconds=28.4, + budget_used=12, + budget_total=None, # "auto" + ) + defaults.update(overrides) + return RoundView(**defaults) + + +class TestFactory: + def test_verbose_zero_returns_null_reporter(self): + reporter = create_reporter(verbose=0) + assert isinstance(reporter, _NullReporter) + + def test_null_reporter_emits_nothing(self, capsys): + reporter = create_reporter(verbose=0) + reporter.run_started(_header()) + reporter.baseline_evaluated(0.0, {}) + 
reporter.round_completed(_round_view()) + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" + + def test_verbose_one_picks_a_real_reporter(self): + reporter = create_reporter(verbose=1, stream=io.StringIO()) + assert not isinstance(reporter, _NullReporter) + + def test_falls_back_to_ascii_reporter_when_rich_is_unavailable( + self, + monkeypatch: pytest.MonkeyPatch, + ): + """``rich`` is an optional extra of the ``optimize`` install group; + when missing, the factory must degrade gracefully to the ASCII + backend so AgentOptimizer still produces a readable timeline.""" + import builtins + real_import = builtins.__import__ + + def fake_import(name, *args, **kwargs): + if name == "rich" or name.startswith("rich."): + raise ImportError("simulated missing rich dependency") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", fake_import) + reporter = create_reporter(verbose=1, stream=io.StringIO()) + assert isinstance(reporter, _AsciiReporter) + + +class TestAsciiReporterRoundRendering: + """Use the ASCII backend directly so assertions don't depend on rich's + rendering quirks. Rich backend is exercised separately in TestRichReporter. + """ + + def _new(self) -> tuple[_AsciiReporter, io.StringIO]: + buf = io.StringIO() + return _AsciiReporter(stream=buf, verbose=1), buf + + def test_round_accepted_renders_one_line_with_semantic_segments(self): + reporter, buf = self._new() + reporter.round_completed(_round_view()) + output = buf.getvalue() + assert "round 1" in output + assert "accepted" in output + assert "train sample 2/5" in output + assert "sample score 0.00" in output + assert "1.00" in output + assert "valset pass_rate 1.0000" in output + assert "evaluations 12/auto" in output + # Single line per round. 
+ assert output.count("round 1") == 1 + + def test_round_skipped_subsample_perfect_uses_skip_marker(self): + reporter, buf = self._new() + reporter.round_completed( + _round_view( + round=2, + train_subsample_parent_score=1.0, + train_subsample_candidate_score=None, + val_pass_rate=None, + accepted=False, + skip_reason="minibatch already perfect (skip_perfect_score on)", + duration_seconds=3.1, + ) + ) + output = buf.getvalue() + assert "round 2" in output + assert "skipped" in output + assert "minibatch already perfect" in output + # No valset segment when skipped pre-val. + assert "valset pass_rate" not in output + + def test_round_skipped_no_proposal_omits_train_segment_when_no_minibatch(self): + reporter, buf = self._new() + reporter.round_completed( + _round_view( + round=4, + train_minibatch_size=0, + train_subsample_parent_score=None, + train_subsample_candidate_score=None, + val_pass_rate=None, + accepted=False, + skip_reason="reflect-LM produced no usable new prompt", + duration_seconds=1.2, + ) + ) + output = buf.getvalue() + assert "round 4" in output + assert "skipped" in output + assert "reflect-LM produced no usable new prompt" in output + assert "train sample" not in output + + def test_round_error_uses_error_marker(self): + reporter, buf = self._new() + reporter.round_completed( + _round_view( + round=3, + train_subsample_candidate_score=None, + val_pass_rate=None, + accepted=False, + skip_reason=None, + error_message="evaluator timeout", + duration_seconds=15.0, + ) + ) + output = buf.getvalue() + assert "round 3" in output + assert "error" in output.lower() + assert "message: evaluator timeout" in output + + def test_round_explored_when_evaluated_but_not_accepted(self): + reporter, buf = self._new() + reporter.round_completed( + _round_view( + round=6, + accepted=False, + val_pass_rate=0.42, + train_subsample_parent_score=0.3, + train_subsample_candidate_score=0.4, + ) + ) + output = buf.getvalue() + assert "round 6" in output + assert "explored" 
in output + assert "valset pass_rate 0.4200" in output + + def test_merge_round_renders_with_merge_marker(self): + reporter, buf = self._new() + reporter.round_completed(_round_view(round=7, kind="merge")) + output = buf.getvalue() + assert "round 7" in output + assert "merged" in output.lower() or "merge" in output.lower() + + +class TestAsciiReporterHeaderAndBaseline: + def _new(self) -> tuple[_AsciiReporter, io.StringIO]: + buf = io.StringIO() + return _AsciiReporter(stream=buf, verbose=1), buf + + def test_header_single_target_field_shows_basename_only(self): + reporter, buf = self._new() + reporter.run_started(_header()) + out = buf.getvalue() + assert "gepa_reflective" in out + assert "instruction" in out + # Header collapses file-backed sources to basename so deep paths + # don't dominate the panel; full paths remain in config.snapshot.json. + assert "system.md" in out + assert "agent/prompts/system.md" not in out + assert "train/val" in out or "train" in out.lower() + assert "5" in out and "3" in out + assert "runs/2026-05-17T16-30-00" in out + # Legend is printed once after the header so users can decode subsequent + # per-round lines without scrolling back to documentation. + assert "Round line legend" in out + assert "valset pass_rate" in out + assert "evaluations used/total" in out + + def test_header_multiple_target_fields_keeps_callback_sentinel(self): + reporter, buf = self._new() + reporter.run_started( + _header( + target_fields=[ + ("system_prompt", "prompts/system.md"), + ("user_template", "prompts/user.md"), + ("rubric", ""), + ], + ) + ) + out = buf.getvalue() + assert "system_prompt" in out + assert "user_template" in out + assert "rubric" in out + # File-backed sources collapse to basenames in the panel. + assert "system.md" in out + assert "user.md" in out + # Callback sources keep the explicit sentinel. 
+ assert "" in out + + def test_header_multiple_metrics_shows_count(self): + reporter, buf = self._new() + reporter.run_started( + _header( + metric_names=["final_response_avg_score", "llm_rubric_response"] + ) + ) + out = buf.getvalue() + # B2: metric count visible + assert "2" in out + assert "final_response_avg_score" in out + assert "llm_rubric_response" in out + + def test_header_long_field_name_is_truncated(self): + reporter, buf = self._new() + long_name = "this_is_a_very_long_field_name_that_must_be_truncated_for_display" + reporter.run_started( + _header(target_fields=[(long_name, "prompts/x.md")]) + ) + out = buf.getvalue() + # A4: never echo a line longer than the truncation cap + for line in out.splitlines(): + assert len(line) <= 200 # generous cap on header line width + + def test_baseline_renders_pass_rate_and_breakdown(self): + reporter, buf = self._new() + reporter.baseline_evaluated( + 0.42, {"final_response_avg_score": 0.42} + ) + out = buf.getvalue() + assert "baseline" in out.lower() + assert "0.4200" in out + + def test_baseline_shows_thresholds_and_pass_fail_status(self): + reporter, buf = self._new() + reporter.baseline_evaluated( + 0.5, + {"final_response_avg_score": 0.42, "response_match_score": 0.80}, + metric_thresholds={ + "final_response_avg_score": 0.5, + "response_match_score": 0.3, + }, + ) + out = buf.getvalue() + # Threshold column present. + assert "threshold 0.5000" in out + assert "threshold 0.3000" in out + # PASS / FAIL status reflects evaluator semantics (score >= threshold). 
+ assert "FAIL" in out # 0.42 < 0.5 + assert "PASS" in out # 0.80 >= 0.3 + + +class TestAsciiReporterRunFinished: + def _new(self) -> tuple[_AsciiReporter, io.StringIO]: + buf = io.StringIO() + return _AsciiReporter(stream=buf, verbose=1), buf + + def _make_result(self, **overrides: Any) -> Any: + from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult, RoundRecord + defaults = dict( + algorithm="gepa_reflective", + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.0, + best_pass_rate=1.0, + pass_rate_improvement=1.0, + baseline_metric_breakdown={}, + best_metric_breakdown={}, + metric_thresholds={}, + baseline_prompts={"instruction": "old"}, + best_prompts={"instruction": "new"}, + total_rounds=2, + rounds=[], + total_reflection_lm_calls=2, + total_judge_model_calls=0, + total_llm_cost=0.0, + total_token_usage={"prompt": 0, "completion": 0, "total": 0}, + duration_seconds=142.86, + started_at="2026-05-17T16:30:00+00:00", + finished_at="2026-05-17T16:32:22+00:00", + extras={}, + ) + defaults.update(overrides) + return OptimizeResult(**defaults) + + def test_summary_panel_shows_improvement_arrow(self): + reporter, buf = self._new() + result = self._make_result() + reporter.run_finished( + result, output_dir="runs/2026-05-17T16-30-00", update_source=False, + ) + out = buf.getvalue() + assert "SUCCEEDED" in out + assert "0.0000" in out and "1.0000" in out + assert "+1.0000" in out or "+1.00" in out + assert "improved" in out + assert "142.86" in out + assert "runs/2026-05-17T16-30-00" in out + + def test_summary_panel_shows_no_improvement_when_flat(self): + reporter, buf = self._new() + result = self._make_result( + best_pass_rate=0.5, + baseline_pass_rate=0.5, + pass_rate_improvement=0.0, + finish_reason="no_improvement", + ) + reporter.run_finished( + result, output_dir="runs/x", update_source=False, + ) + out = buf.getvalue() + assert "no improvement" in out.lower() or "no_improvement" in out + + def 
test_summary_panel_marks_failed_status(self): + reporter, buf = self._new() + result = self._make_result( + status="FAILED", finish_reason="error", + error_message="dataset load failed: missing file", + ) + reporter.run_finished( + result, output_dir="runs/x", update_source=False, + ) + out = buf.getvalue() + assert "FAILED" in out + assert "dataset load failed" in out + + def test_summary_shows_update_source_when_true(self): + reporter, buf = self._new() + result = self._make_result() + reporter.run_finished( + result, output_dir="runs/x", update_source=True, + ) + out = buf.getvalue() + # G1: update_source visible + assert "update_source" in out + # Mentions the source was written back. + assert "written" in out.lower() or "true" in out.lower() + + def test_summary_shows_stopped_by_required_metrics_passing(self): + reporter, buf = self._new() + result = self._make_result(stop_reason="required_metrics_passing") + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" in out + assert "required metrics met thresholds" in out + + def test_summary_shows_stopped_by_budget_exhausted(self): + reporter, buf = self._new() + result = self._make_result(stop_reason="budget_exhausted") + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" in out + assert "budget exhausted" in out + # Disambiguates from the legacy catch-all label so users can tell the + # MaxMetricCallsStopper triggered specifically. 
+ assert "max_metric_calls" in out + + def test_summary_shows_stopped_by_no_improvement(self): + reporter, buf = self._new() + result = self._make_result(stop_reason="no_improvement") + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" in out + assert "no improvement" in out + + def test_summary_shows_stopped_by_timeout(self): + reporter, buf = self._new() + result = self._make_result(stop_reason="timeout") + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" in out + assert "timeout" in out + + def test_summary_shows_stopped_by_score_threshold(self): + reporter, buf = self._new() + result = self._make_result(stop_reason="score_threshold") + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" in out + assert "score threshold" in out + + def test_summary_shows_stopped_by_completed_when_no_stopper_fired(self): + reporter, buf = self._new() + result = self._make_result(stop_reason="completed") + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" in out + assert "completed" in out + assert "no stopper triggered" in out + + def test_summary_shows_stopped_by_user_requested_stop(self) -> None: + from trpc_agent_sdk.evaluation._optimize_reporter import _format_stop_reason_text + + assert _format_stop_reason_text("user_requested_stop") == ( + "user requested stop (optimize.stop touched)" + ) + + def test_summary_omits_stopped_by_when_stop_reason_none(self): + reporter, buf = self._new() + result = self._make_result(stop_reason=None) + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" not in out + + def test_summary_per_metric_table_includes_threshold_and_status(self): + reporter, buf = self._new() + result = self._make_result( + 
baseline_metric_breakdown={ + "final_response_avg_score": 0.42, + "response_match_score": 0.10, + }, + best_metric_breakdown={ + "final_response_avg_score": 1.0, + "response_match_score": 0.20, + }, + metric_thresholds={ + "final_response_avg_score": 0.5, + "response_match_score": 0.3, + }, + ) + reporter.run_finished( + result, output_dir="runs/x", update_source=False, + ) + out = buf.getvalue() + assert "threshold | baseline -> best" in out + assert "threshold 0.5000" in out + assert "threshold 0.3000" in out + # final_response_avg_score 1.0 >= 0.5 → PASS + # response_match_score 0.2 < 0.3 → FAIL + assert "PASS" in out + assert "FAIL" in out + + +class _CapturingHandler(logging.Handler): + """Test helper: collects every record emitted on the attached logger. + + Attached directly to the target logger (rather than relying on root / + caplog) because the ``trpc_agent_sdk`` parent logger sets + ``propagate=False`` once initialised, which would prevent caplog from + seeing child events. + """ + + def __init__(self) -> None: + super().__init__(level=logging.DEBUG) + self.records: list[logging.LogRecord] = [] + + def emit(self, record: logging.LogRecord) -> None: + self.records.append(record) + + +@pytest.fixture +def gepa_log_capture() -> tuple[logging.Logger, list[logging.LogRecord]]: + target = logging.getLogger("trpc_agent_sdk.optimizer.gepa") + handler = _CapturingHandler() + target.addHandler(handler) + previous_level = target.level + target.setLevel(logging.INFO) + try: + yield target, handler.records + finally: + target.removeHandler(handler) + target.setLevel(previous_level) + + +class TestSilentGepaLogger: + """`_SilentGepaLogger` replaces gepa's default StdOutLogger. + + verbose=1: drop every message (no stdout pollution). + verbose=2: forward to logging.getLogger("trpc_agent_sdk.optimizer.gepa") + at INFO level so users can route via logging config. 
+ """ + + def test_verbose_one_drops_message(self, capsys, gepa_log_capture): + _, records = gepa_log_capture + logger = _SilentGepaLogger(verbose=1) + logger.log("Iteration 3: Best valset aggregate score so far: 1.0") + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" + assert records == [] + + def test_verbose_two_forwards_to_logging(self, gepa_log_capture): + _, records = gepa_log_capture + logger = _SilentGepaLogger(verbose=2) + logger.log("Iteration 3: Best valset aggregate score so far: 1.0") + assert any( + "Best valset aggregate" in rec.getMessage() + for rec in records + if rec.name == "trpc_agent_sdk.optimizer.gepa" + ) + + +class TestRichBackendFallback: + """When rich is unavailable, factory must fall back to ASCII silently.""" + + def test_create_reporter_falls_back_when_rich_missing(self, monkeypatch): + import builtins + real_import = builtins.__import__ + + def fake_import(name: str, *args: Any, **kwargs: Any) -> Any: + if name == "rich" or name.startswith("rich."): + raise ImportError(f"forced: {name}") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", fake_import) + reporter = create_reporter(verbose=1, stream=io.StringIO()) + assert isinstance(reporter, _AsciiReporter) + + +class TestRichProgressNotAutoRefreshFlood: + """Regression: the Rich Progress region must not flood the timeline with + duplicate "progress ..." lines when stdout does not honour rich's + cursor-up escapes (typical for embedded IDE terminals). + + The bug appeared as dozens of stacked ``progress ...`` rows during a + single GEPA baseline evaluation because rich's default ``auto_refresh`` + fires at 10 Hz; without functioning cursor-up the previous frame was + never erased and every refresh tick became a fresh log line. + + The fix is to build ``Progress`` with ``auto_refresh=False`` and refresh + explicitly on each round event. 
This test asserts a hard upper bound on + the number of progress lines emitted across a realistic run. + """ + + def test_progress_line_count_is_bounded_by_round_count(self): + from trpc_agent_sdk.evaluation._optimize_reporter import _RichReporter + + buf = io.StringIO() + reporter = _RichReporter(stream=buf, verbose=1) + reporter.run_started(_header(budget_total=60)) + reporter.baseline_evaluated( + 0.0, + {"final_response_avg_score": 0.0, "llm_rubric_response": 1.0}, + metric_thresholds={ + "final_response_avg_score": 1.0, + "llm_rubric_response": 0.66, + }, + ) + for round_no in range(1, 7): + reporter.round_completed( + _round_view( + round=round_no, + accepted=(round_no == 1), + skip_reason=None if round_no == 1 else "all_scores_perfect", + train_subsample_parent_score=1.0, + train_subsample_candidate_score=None, + val_pass_rate=0.6667 if round_no == 1 else None, + budget_used=10 + (round_no - 1) * 2, + budget_total=60, + ) + ) + reporter._stop_progress() + progress_lines = [ + line + for line in buf.getvalue().splitlines() + if line.lstrip().startswith("progress") + ] + # A well-behaved Live region produces at most one progress line per + # discrete event (start + 6 rounds = 7). A regression that re-enables + # auto_refresh at 10 Hz over a multi-minute baseline trivially exceeds + # this bound by an order of magnitude (we saw 30+ in the wild). + assert len(progress_lines) <= 8, ( + f"too many progress lines: {len(progress_lines)} — " + f"auto_refresh may have been re-enabled" + ) diff --git a/tests/evaluation/test_optimize_result.py b/tests/evaluation/test_optimize_result.py new file mode 100644 index 00000000..2bbe93ea --- /dev/null +++ b/tests/evaluation/test_optimize_result.py @@ -0,0 +1,456 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Tests for OptimizeResult / RoundRecord / dump_to / from_file.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult +from trpc_agent_sdk.evaluation._optimize_result import RoundRecord + + +def _round_record(round_idx: int = 1, accepted: bool = True) -> RoundRecord: + return RoundRecord( + round=round_idx, + optimized_field_names=["system_prompt"], + candidate_prompts={"system_prompt": f"v{round_idx}"}, + train_pass_rate=0.5 + 0.1 * round_idx, + validation_pass_rate=0.4 + 0.1 * round_idx, + metric_breakdown={"final_response_avg_score": 0.6}, + accepted=accepted, + acceptance_reason=("validation_pass_rate gain 0.10 >= min_score_gain 0.0" + if accepted else "validation_pass_rate gain -0.02 < min_score_gain 0.0"), + failed_case_ids=["c1", "c2"], + failed_cases_truncated=0, + per_field_diagnosis={"system_prompt": "model said: be more careful"}, + reflection_lm_calls=1, + round_llm_cost=0.012, + round_token_usage={"prompt": 100, "completion": 50, "total": 150}, + started_at="2026-05-14T19:30:00Z", + duration_seconds=2.5, + ) + + +def _optimize_result(rounds: list[RoundRecord] | None = None) -> OptimizeResult: + rounds = rounds or [_round_record(1, accepted=True)] + return OptimizeResult( + algorithm="gepa_reflective", + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.4, + best_pass_rate=0.6, + pass_rate_improvement=0.2, + baseline_metric_breakdown={"final_response_avg_score": 0.5}, + best_metric_breakdown={"final_response_avg_score": 0.7}, + baseline_prompts={"system_prompt": "v0"}, + best_prompts={"system_prompt": "v1"}, + total_rounds=len(rounds), + rounds=rounds, + total_reflection_lm_calls=1, + total_judge_model_calls=8, + total_llm_cost=0.05, + total_token_usage={"prompt": 200, "completion": 100, "total": 300}, + duration_seconds=5.0, + started_at="2026-05-14T19:30:00Z", + finished_at="2026-05-14T19:30:05Z", + ) + + +def 
test_optimize_result_algorithm_field_required(): + """algorithm must be a top-level required field per spec §3.6 / acceptance #20.""" + import pydantic + + with pytest.raises(pydantic.ValidationError) as exc: + OptimizeResult( + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.0, + best_pass_rate=0.0, + pass_rate_improvement=0.0, + total_rounds=0, + total_reflection_lm_calls=0, + total_judge_model_calls=0, + duration_seconds=0.0, + started_at="t0", + finished_at="t1", + ) + assert any("algorithm" in str(e["loc"]) for e in exc.value.errors()) + + +def test_optimize_result_algorithm_field_round_trips(tmp_path: Path): + result = _optimize_result() + assert result.algorithm == "gepa_reflective" + target = tmp_path / "r.json" + result.dump_to(str(target)) + loaded = OptimizeResult.from_file(str(target)) + assert loaded.algorithm == "gepa_reflective" + + +def test_optimize_result_metric_thresholds_defaults_to_empty_dict(): + result = _optimize_result() + assert result.metric_thresholds == {} + + +def test_optimize_result_metric_thresholds_round_trip(tmp_path: Path): + result = _optimize_result().model_copy( + update={ + "metric_thresholds": { + "final_response_avg_score": 0.5, + "response_match_score": 0.3, + } + } + ) + path = tmp_path / "with_thresholds.json" + result.dump_to(str(path)) + loaded = OptimizeResult.from_file(str(path)) + assert loaded.metric_thresholds == { + "final_response_avg_score": 0.5, + "response_match_score": 0.3, + } + + +def test_optimize_result_format_summary_includes_thresholds_when_provided(): + result = _optimize_result().model_copy( + update={ + "metric_thresholds": {"final_response_avg_score": 0.5}, + "baseline_metric_breakdown": {"final_response_avg_score": 0.4}, + "best_metric_breakdown": {"final_response_avg_score": 0.9}, + } + ) + summary = result.format_summary(output_dir="/tmp/runs/x", update_source=False) + assert "threshold | baseline -> best" in summary + assert "threshold 0.5000" in summary + assert "0.4000 
-> 0.9000" in summary + + +def test_round_record_minimal_construction(): + record = _round_record() + assert record.round == 1 + assert record.accepted is True + assert record.round_llm_cost == 0.012 + assert record.round_token_usage == {"prompt": 100, "completion": 50, "total": 150} + + +def test_round_record_extras_defaults_to_empty_dict(): + record = _round_record() + assert record.extras == {} + + +def test_round_record_extras_accepts_arbitrary_payload(): + record = RoundRecord( + round=1, + optimized_field_names=["a"], + candidate_prompts={"a": "x"}, + train_pass_rate=0.5, + validation_pass_rate=0.5, + metric_breakdown={}, + accepted=False, + acceptance_reason="", + failed_case_ids=[], + failed_cases_truncated=0, + per_field_diagnosis={}, + reflection_lm_calls=0, + round_llm_cost=0.0, + round_token_usage={"prompt": 0, "completion": 0, "total": 0}, + started_at="2026-05-14T19:30:00Z", + duration_seconds=1.0, + extras={"judge_subscores": [0.5, 0.6], "wandb_step": 7}, + ) + assert record.extras["judge_subscores"] == [0.5, 0.6] + assert record.extras["wandb_step"] == 7 + + +def test_optimize_result_minimal_construction(): + result = _optimize_result() + assert result.schema_version == "v1" + assert result.status == "SUCCEEDED" + assert result.finish_reason == "completed" + assert result.baseline_pass_rate == 0.4 + assert result.best_pass_rate == 0.6 + assert result.pass_rate_improvement == 0.2 + assert result.total_rounds == 1 + assert len(result.rounds) == 1 + assert result.extras == {} + + +def test_optimize_result_default_token_usage_is_zero(): + result = OptimizeResult( + algorithm="gepa_reflective", + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.0, + best_pass_rate=0.0, + pass_rate_improvement=0.0, + baseline_metric_breakdown={}, + best_metric_breakdown={}, + baseline_prompts={}, + best_prompts={}, + total_rounds=0, + rounds=[], + total_reflection_lm_calls=0, + total_judge_model_calls=0, + duration_seconds=0.0, + 
started_at="2026-05-14T19:30:00Z", + finished_at="2026-05-14T19:30:00Z", + ) + assert result.total_llm_cost == 0.0 + assert result.total_token_usage == {"prompt": 0, "completion": 0, "total": 0} + assert result.extras == {} + assert result.error_message == "" + + +@pytest.mark.parametrize("status", ["SUCCEEDED", "FAILED", "CANCELED"]) +def test_optimize_result_run_status_accepts_all_legal_values(status): + result = _optimize_result() + new = result.model_copy(update={"status": status}) + assert new.status == status + + +def test_optimize_result_rejects_illegal_run_status(): + with pytest.raises(Exception): + OptimizeResult.model_validate({**_optimize_result().model_dump(), "status": "unknown"}) + + +@pytest.mark.parametrize("reason", [ + "completed", + "perfect_pass_rate", + "no_improvement", + "error", +]) +def test_optimize_result_finish_reason_accepts_all_legal_values(reason): + result = _optimize_result() + new = result.model_copy(update={"finish_reason": reason}) + assert new.finish_reason == reason + + +def test_optimize_result_rejects_illegal_finish_reason(): + with pytest.raises(Exception): + OptimizeResult.model_validate({**_optimize_result().model_dump(), "finish_reason": "weird"}) + + +def test_optimize_result_rejects_removed_cancelled_finish_reason(): + """DOC-4: 'cancelled' was removed from FinishReason because no SDK code path + ever produces it; user cancellation surfaces as stop_reason='user_requested_stop' + + status='CANCELED'. 
Schema must reject it to keep the literal set honest.""" + with pytest.raises(Exception): + OptimizeResult.model_validate( + {**_optimize_result().model_dump(), "finish_reason": "cancelled"} + ) + + +def test_optimize_result_model_dump_json_round_trip(): + original = _optimize_result() + payload = original.model_dump_json() + restored = OptimizeResult.model_validate_json(payload) + assert restored == original + + +def test_optimize_result_dump_to_creates_indented_json_file(tmp_path: Path): + path = tmp_path / "result.json" + result = _optimize_result() + result.dump_to(str(path)) + assert path.exists() + text = path.read_text(encoding="utf-8") + payload = json.loads(text) + assert payload["status"] == "SUCCEEDED" + assert payload["finishReason"] == "completed" + assert "\n" in text + + +def test_optimize_result_from_file_round_trip(tmp_path: Path): + path = tmp_path / "result.json" + original = _optimize_result() + original.dump_to(str(path)) + restored = OptimizeResult.from_file(str(path)) + assert restored == original + + +def test_round_record_new_reporter_fields_default_to_none_or_zero(): + """New fields reporter and artifact persistence consume must default + safely so existing callers keep working unchanged.""" + record = _round_record() + assert record.kind == "reflective" + assert record.train_minibatch_size == 0 + assert record.train_subsample_parent_score is None + assert record.train_subsample_candidate_score is None + assert record.skip_reason is None + assert record.error_message is None + assert record.budget_used is None + assert record.budget_total is None + + +def test_round_record_new_reporter_fields_round_trip(): + record = RoundRecord( + round=2, + optimized_field_names=[], + candidate_prompts={"a": "x"}, + train_pass_rate=0.5, + validation_pass_rate=0.0, + metric_breakdown={}, + accepted=False, + acceptance_reason="", + failed_case_ids=[], + failed_cases_truncated=0, + per_field_diagnosis={}, + reflection_lm_calls=0, + round_llm_cost=0.0, + 
round_token_usage={"prompt": 0, "completion": 0, "total": 0}, + started_at="2026-05-17T16:30:00Z", + duration_seconds=2.1, + kind="merge", + train_minibatch_size=2, + train_subsample_parent_score=0.6, + train_subsample_candidate_score=0.4, + skip_reason=None, + error_message=None, + budget_used=42, + budget_total=200, + ) + payload = record.model_dump_json() + restored = RoundRecord.model_validate_json(payload) + assert restored == record + assert restored.kind == "merge" + assert restored.train_minibatch_size == 2 + assert restored.budget_used == 42 + assert restored.budget_total == 200 + + +def test_optimize_result_format_summary_succeeded_contains_key_fields(): + """format_summary renders the human-readable summary.txt artifact and + must surface algorithm, status, baseline/best pass rates, delta, + rounds and best_prompts inventory.""" + result = _optimize_result() + summary = result.format_summary( + output_dir="/tmp/runs/2026-05-17T16-30-00", + update_source=False, + ) + assert "gepa_reflective" in summary + assert "SUCCEEDED" in summary + assert "0.4000" in summary and "0.6000" in summary + assert "+0.2000" in summary or "+0.20" in summary + assert "improved" in summary + assert "system_prompt" in summary + assert "/tmp/runs/2026-05-17T16-30-00" in summary + + +def test_optimize_result_format_summary_failed_includes_error_message(): + result = _optimize_result().model_copy(update={ + "status": "FAILED", + "finish_reason": "error", + "error_message": "dataset load failed: missing file", + }) + summary = result.format_summary( + output_dir="/tmp/runs/x", update_source=True, + ) + assert "FAILED" in summary + assert "dataset load failed" in summary + + +def test_optimize_result_from_file_missing_path_raises(tmp_path: Path): + with pytest.raises(FileNotFoundError): + OptimizeResult.from_file(str(tmp_path / "nope.json")) + + +def test_optimize_result_camel_alias_export(): + result = _optimize_result() + dumped = result.model_dump(by_alias=True) + assert 
"schemaVersion" in dumped + assert "finishReason" in dumped + assert "baselinePassRate" in dumped + assert "totalTokenUsage" in dumped + + +def test_optimize_result_camel_case_input_accepted(): + payload = _optimize_result().model_dump(by_alias=True) + restored = OptimizeResult.model_validate(payload) + assert restored == _optimize_result() + + +def test_optimize_result_extras_round_trip_through_file(tmp_path: Path): + result = _optimize_result().model_copy( + update={"extras": {"wandb_run_id": "abc-123", "git_sha": "deadbeef"}} + ) + path = tmp_path / "result.json" + result.dump_to(str(path)) + restored = OptimizeResult.from_file(str(path)) + assert restored.extras == {"wandb_run_id": "abc-123", "git_sha": "deadbeef"} + + +def test_optimize_result_dump_to_overwrites_existing_file(tmp_path: Path): + path = tmp_path / "result.json" + path.write_text("stale content", encoding="utf-8") + result = _optimize_result() + result.dump_to(str(path)) + payload = json.loads(path.read_text(encoding="utf-8")) + assert payload["status"] == "SUCCEEDED" + + +def test_optimize_result_with_multiple_rounds(): + rounds = [ + _round_record(round_idx=1, accepted=True), + _round_record(round_idx=2, accepted=False), + _round_record(round_idx=3, accepted=True), + ] + result = _optimize_result(rounds=rounds) + assert result.total_rounds == 3 + assert result.rounds[1].accepted is False + payload = result.model_dump_json() + restored = OptimizeResult.model_validate_json(payload) + assert [r.accepted for r in restored.rounds] == [True, False, True] + + +# --------------------------------------------------------------------------- +# stop_reason +# --------------------------------------------------------------------------- + + +def test_optimize_result_stop_reason_defaults_to_none(): + result = _optimize_result() + assert result.stop_reason is None + + +@pytest.mark.parametrize( + "reason", ["required_metrics_passing", "budget_exhausted"], +) +def 
test_optimize_result_stop_reason_accepts_legal_values(reason): + result = _optimize_result().model_copy(update={"stop_reason": reason}) + assert result.stop_reason == reason + + +def test_optimize_result_stop_reason_rejects_illegal_value(): + with pytest.raises(Exception): + OptimizeResult.model_validate( + {**_optimize_result().model_dump(), "stop_reason": "weird"} + ) + + +def test_optimize_result_stop_reason_round_trip(tmp_path: Path): + result = _optimize_result().model_copy( + update={"stop_reason": "required_metrics_passing"} + ) + target = tmp_path / "r.json" + result.dump_to(str(target)) + loaded = OptimizeResult.from_file(str(target)) + assert loaded.stop_reason == "required_metrics_passing" + + +def test_optimize_result_format_summary_includes_stop_reason_when_set(): + result = _optimize_result().model_copy( + update={"stop_reason": "required_metrics_passing"} + ) + summary = result.format_summary(output_dir="/tmp/x", update_source=False) + assert "stop_reason" in summary + assert "required_metrics_passing" in summary + + +def test_optimize_result_format_summary_omits_stop_reason_when_none(): + result = _optimize_result() + summary = result.format_summary(output_dir="/tmp/x", update_source=False) + assert "stop_reason" not in summary diff --git a/tests/evaluation/test_remote_eval_service.py b/tests/evaluation/test_remote_eval_service.py index 77d03187..3f16fd10 100644 --- a/tests/evaluation/test_remote_eval_service.py +++ b/tests/evaluation/test_remote_eval_service.py @@ -128,6 +128,36 @@ async def call_agent(query: str) -> str: _ = [r async for r in service.evaluate(evaluate_req)] +@pytest.mark.asyncio +async def test_reject_llm_rubric_knowledge_recall_metric_raises_value_error(): + """F-4: ``llm_rubric_knowledge_recall`` requires tool responses from + ``intermediate_data`` which RemoteEvalService never captures (always + ``None``); the judge would silently fall back to "No knowledge search + results were found." 
Treat it the same as ``tool_trajectory_avg_score``: + fail-fast at evaluate() entry instead of letting users score every + case as 0 and chase a phantom prompt regression. + """ + case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[case]) + mgr = _make_manager(eval_set) + + async def call_agent(query: str) -> str: + return "world" + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=1)) + inference_results = [r async for r in service.perform_inference(req)] + evaluate_req = EvaluateRequest( + inference_results=inference_results, + evaluate_config=EvaluateConfig( + eval_metrics=[EvalMetric(metric_name="llm_rubric_knowledge_recall", threshold=1.0)], + ), + ) + + with pytest.raises(ValueError, match="llm_rubric_knowledge_recall"): + _ = [r async for r in service.evaluate(evaluate_req)] + + @pytest.mark.asyncio async def test_case_fail_soft_when_call_agent_raises(): case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) diff --git a/tests/evaluation/test_target_prompt.py b/tests/evaluation/test_target_prompt.py new file mode 100644 index 00000000..8a2c7b48 --- /dev/null +++ b/tests/evaluation/test_target_prompt.py @@ -0,0 +1,539 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Tests for TargetPrompt.""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import Callable +from unittest import mock + +import pytest + +from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt + + +def _write(path: Path, text: str) -> Path: + path.write_text(text, encoding="utf-8") + return path + + +def test_add_path_returns_self_for_chaining(tmp_path: Path): + p1 = _write(tmp_path / "a.md", "A") + p2 = _write(tmp_path / "b.md", "B") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + assert target.names() == ["a", "b"] + + +def test_add_callback_returns_self_for_chaining(): + async def _read() -> str: + return "x" + + async def _write_fn(value: str) -> None: + return None + + target = ( + TargetPrompt() + .add_callback("c1", read=_read, write=_write_fn) + .add_callback("c2", read=_read, write=_write_fn) + ) + assert target.names() == ["c1", "c2"] + + +def test_names_in_registration_order(tmp_path: Path): + p = _write(tmp_path / "x.md", "x") + + async def _read() -> str: + return "" + + async def _write_fn(value: str) -> None: + return None + + target = ( + TargetPrompt() + .add_path("first", str(p)) + .add_callback("second", read=_read, write=_write_fn) + .add_path("third", str(p)) + ) + assert target.names() == ["first", "second", "third"] + + +def test_add_path_duplicate_name_raises_value_error(tmp_path: Path): + p = _write(tmp_path / "a.md", "A") + target = TargetPrompt().add_path("a", str(p)) + with pytest.raises(ValueError, match="already registered"): + target.add_path("a", str(p)) + + +def test_add_callback_duplicate_name_raises_value_error(): + async def _read() -> str: + return "" + + async def _write_fn(value: str) -> None: + return None + + target = TargetPrompt().add_callback("c", read=_read, write=_write_fn) + with pytest.raises(ValueError, match="already registered"): + target.add_callback("c", read=_read, write=_write_fn) + + +def 
test_add_path_and_callback_same_name_raises(tmp_path: Path): + p = _write(tmp_path / "a.md", "A") + + async def _read() -> str: + return "" + + async def _write_fn(value: str) -> None: + return None + + target = TargetPrompt().add_path("a", str(p)) + with pytest.raises(ValueError, match="already registered"): + target.add_callback("a", read=_read, write=_write_fn) + + +def test_empty_target_prompt_names_is_empty(): + assert TargetPrompt().names() == [] + + +def test_add_callback_requires_async_read_callable(): + def _sync_read() -> str: + return "" + + async def _write_fn(value: str) -> None: + return None + + with pytest.raises(TypeError, match="async"): + TargetPrompt().add_callback("c", read=_sync_read, write=_write_fn) + + +def test_add_callback_requires_async_write_callable(): + async def _read() -> str: + return "" + + def _sync_write(value: str) -> None: + return None + + with pytest.raises(TypeError, match="async"): + TargetPrompt().add_callback("c", read=_read, write=_sync_write) + + +@pytest.mark.asyncio +async def test_read_all_with_paths(tmp_path: Path): + p1 = _write(tmp_path / "a.md", "alpha") + p2 = _write(tmp_path / "b.md", "beta") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + assert await target.read_all() == {"a": "alpha", "b": "beta"} + + +@pytest.mark.asyncio +async def test_read_all_path_not_exist_raises_file_not_found(tmp_path: Path): + target = TargetPrompt().add_path("missing", str(tmp_path / "ghost.md")) + with pytest.raises(FileNotFoundError): + await target.read_all() + + +@pytest.mark.asyncio +async def test_read_all_with_async_callback(): + async def _read() -> str: + return "callback-value" + + async def _write_fn(value: str) -> None: + return None + + target = TargetPrompt().add_callback("k", read=_read, write=_write_fn) + assert await target.read_all() == {"k": "callback-value"} + + +@pytest.mark.asyncio +async def test_read_all_callback_raises_propagates(): + async def _read() -> str: + raise 
RuntimeError("remote down") + + async def _write_fn(value: str) -> None: + return None + + target = TargetPrompt().add_callback("k", read=_read, write=_write_fn) + with pytest.raises(RuntimeError, match="remote down"): + await target.read_all() + + +@pytest.mark.asyncio +async def test_read_all_mixed_path_and_callback(tmp_path: Path): + p = _write(tmp_path / "p.md", "from-file") + + async def _read() -> str: + return "from-callback" + + async def _write_fn(value: str) -> None: + return None + + target = ( + TargetPrompt() + .add_path("a", str(p)) + .add_callback("b", read=_read, write=_write_fn) + ) + assert await target.read_all() == {"a": "from-file", "b": "from-callback"} + + +@pytest.mark.asyncio +async def test_read_single_field(tmp_path: Path): + p = _write(tmp_path / "a.md", "single") + target = TargetPrompt().add_path("a", str(p)) + assert await target.read("a") == "single" + + +@pytest.mark.asyncio +async def test_read_unknown_name_raises_key_error(): + target = TargetPrompt() + with pytest.raises(KeyError): + await target.read("nope") + + +@pytest.mark.asyncio +async def test_write_all_with_paths_updates_files(tmp_path: Path): + p1 = _write(tmp_path / "a.md", "old-a") + p2 = _write(tmp_path / "b.md", "old-b") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + await target.write_all({"a": "new-a", "b": "new-b"}) + assert p1.read_text(encoding="utf-8") == "new-a" + assert p2.read_text(encoding="utf-8") == "new-b" + + +@pytest.mark.asyncio +async def test_write_all_with_callback_invokes_write(): + received: dict[str, str] = {} + + async def _read() -> str: + return received.get("k", "") + + async def _write_fn(value: str) -> None: + received["k"] = value + + target = TargetPrompt().add_callback("k", read=_read, write=_write_fn) + await target.write_all({"k": "callback-payload"}) + assert received == {"k": "callback-payload"} + + +@pytest.mark.asyncio +async def test_write_all_keys_mismatch_raises(tmp_path: Path): + p = _write(tmp_path 
/ "a.md", "A") + target = TargetPrompt().add_path("a", str(p)) + + with pytest.raises(ValueError, match="mismatch"): + await target.write_all({}) + + with pytest.raises(ValueError, match="mismatch"): + await target.write_all({"a": "ok", "extra": "x"}) + + +@pytest.mark.asyncio +async def test_write_all_no_tmp_file_remains_on_success(tmp_path: Path): + p = _write(tmp_path / "a.md", "old") + target = TargetPrompt().add_path("a", str(p)) + await target.write_all({"a": "new"}) + assert p.read_text(encoding="utf-8") == "new" + leftover = [f for f in os.listdir(tmp_path) if f.endswith(".tmp")] + assert leftover == [] + + +@pytest.mark.asyncio +async def test_write_all_atomic_rollback_on_partial_failure(tmp_path: Path): + p1 = _write(tmp_path / "a.md", "old-a") + p2 = _write(tmp_path / "b.md", "old-b") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + + original_replace = os.replace + seen: dict[str, int] = {"count": 0} + + def _flaky_replace(src: str, dst: str) -> None: + seen["count"] += 1 + if seen["count"] == 2: + raise OSError("simulated disk failure on second rename") + return original_replace(src, dst) + + with mock.patch("os.replace", side_effect=_flaky_replace): + with pytest.raises(OSError, match="simulated"): + await target.write_all({"a": "new-a", "b": "new-b"}) + + # Atomicity contract: every source file is restored to its pre-call content. + assert p1.read_text(encoding="utf-8") == "old-a" + assert p2.read_text(encoding="utf-8") == "old-b" + leftover = [f for f in os.listdir(tmp_path) if f.endswith(".tmp")] + assert leftover == [] + + +# --------------------------------------------------------------------------- +# CONC-3 fix: rollback uses atomic primitives + best-effort failure aggregation. +# Test matrix: T1/T2/T7 already covered above; below adds T3-T8 + edge cases. 
+# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_write_all_callback_failure_rolls_back_paths(tmp_path: Path): + """T4: callback write fails after path writes succeed; path fields must + be restored to baseline. The callback failure is propagated.""" + p1 = _write(tmp_path / "a.md", "old-a") + p2 = _write(tmp_path / "b.md", "old-b") + + async def _read() -> str: + return "" + + async def _write_fn(value: str) -> None: + raise RuntimeError("simulated KV write failure") + + target = ( + TargetPrompt() + .add_path("a", str(p1)) + .add_path("b", str(p2)) + .add_callback("c", read=_read, write=_write_fn) + ) + + with pytest.raises(RuntimeError, match="simulated KV"): + await target.write_all({"a": "new-a", "b": "new-b", "c": "new-c"}) + + assert p1.read_text(encoding="utf-8") == "old-a" + assert p2.read_text(encoding="utf-8") == "old-b" + leftover = [f for f in os.listdir(tmp_path) if f.endswith(".tmp")] + assert leftover == [] + + +@pytest.mark.asyncio +async def test_write_all_rolls_back_to_unlink_when_baseline_absent(tmp_path: Path): + """T5: file did not exist before write_all (backup=None); rollback path + must unlink the file rather than restore content.""" + p1 = _write(tmp_path / "a.md", "old-a") + ghost = tmp_path / "ghost.md" + assert not ghost.exists() + + target = TargetPrompt().add_path("ghost", str(ghost)).add_path("a", str(p1)) + + original_replace = os.replace + seen = {"count": 0} + + def _flaky_replace(src: str, dst: str) -> None: + seen["count"] += 1 + # registration order: ghost first, a second + # call 1 = ghost.md.tmp -> ghost.md (succeeds, ghost newly created) + # call 2 = a.md.tmp -> a.md (fails -> rollback for [ghost]) + if seen["count"] == 2: + raise OSError("simulated failure on second rename") + return original_replace(src, dst) + + with mock.patch("os.replace", side_effect=_flaky_replace): + with pytest.raises(OSError, match="simulated"): + await 
target.write_all({"ghost": "new-ghost", "a": "new-a"}) + + assert not ghost.exists() + assert p1.read_text(encoding="utf-8") == "old-a" + leftover = [f for f in os.listdir(tmp_path) if f.endswith(".tmp")] + assert leftover == [] + + +@pytest.mark.asyncio +async def test_write_all_rollback_failure_aggregates_and_chains_root_cause(tmp_path: Path): + """T3+T6: both forward write and rollback restore fail. Aggregate + _RollbackError lists the failed field; root cause preserved on __cause__.""" + from trpc_agent_sdk.evaluation._target_prompt import _RollbackError + + p1 = _write(tmp_path / "a.md", "baseline-a") + p2 = _write(tmp_path / "b.md", "baseline-b") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + + original_replace = os.replace + call_count = {"n": 0} + + def _flaky_replace(src: str, dst: str) -> None: + call_count["n"] += 1 + # call 1: forward a.md (succeeds) + # call 2: forward b.md (fails -> rollback for [a]) + # call 3: rollback a-restore (fails too) + if call_count["n"] == 2: + raise OSError("primary write failure") + if call_count["n"] >= 3: + raise PermissionError("rollback restore denied") + return original_replace(src, dst) + + with mock.patch("os.replace", side_effect=_flaky_replace): + with pytest.raises(_RollbackError) as excinfo: + await target.write_all({"a": "new-a", "b": "new-b"}) + + err = excinfo.value + assert "a" in str(err) + assert "PermissionError" in str(err) + assert isinstance(err.__cause__, OSError) + assert "primary write failure" in str(err.__cause__) + assert len(err.failures) == 1 + assert err.failures[0][0] == "a" + assert isinstance(err.failures[0][1], PermissionError) + + +@pytest.mark.asyncio +async def test_write_all_rollback_unlink_failure_aggregated(tmp_path: Path): + """T6 variant: backup=None case; unlink fails -> _RollbackError carries it.""" + from trpc_agent_sdk.evaluation._target_prompt import _RollbackError + + p1 = _write(tmp_path / "a.md", "baseline-a") + ghost = tmp_path / "ghost.md" + 
target = TargetPrompt().add_path("ghost", str(ghost)).add_path("a", str(p1)) + + original_replace = os.replace + original_unlink = os.unlink + state = {"replace_count": 0} + + def _flaky_replace(src: str, dst: str) -> None: + state["replace_count"] += 1 + # call 1: ghost.md.tmp -> ghost.md (succeeds) + # call 2: a.md.tmp -> a.md (fails -> rollback for [ghost]) + if state["replace_count"] == 2: + raise OSError("primary failure on a.md") + return original_replace(src, dst) + + def _flaky_unlink(path: str) -> None: + if str(path) == str(ghost): + raise PermissionError("unlink denied") + return original_unlink(path) + + with mock.patch("os.replace", side_effect=_flaky_replace), \ + mock.patch("os.unlink", side_effect=_flaky_unlink): + with pytest.raises(_RollbackError) as excinfo: + await target.write_all({"ghost": "g", "a": "new-a"}) + + err = excinfo.value + assert "ghost" in str(err) + assert "PermissionError" in str(err) + assert isinstance(err.__cause__, OSError) + assert "primary failure on a.md" in str(err.__cause__) + + +@pytest.mark.asyncio +async def test_write_all_rollback_continues_after_partial_failure(tmp_path: Path): + """T3 best-effort: when field A's rollback fails, field B's rollback + still runs and succeeds.""" + from trpc_agent_sdk.evaluation._target_prompt import _RollbackError + + p1 = _write(tmp_path / "a.md", "baseline-a") + p2 = _write(tmp_path / "b.md", "baseline-b") + p3 = _write(tmp_path / "c.md", "baseline-c") + target = ( + TargetPrompt() + .add_path("a", str(p1)) + .add_path("b", str(p2)) + .add_path("c", str(p3)) + ) + + original_replace = os.replace + state = {"n": 0} + + def _flaky_replace(src: str, dst: str) -> None: + state["n"] += 1 + # forward: 1=a, 2=b, 3=c (fails -> rollback for [a, b]) + # rollback: 4=a-restore (fails), 5=b-restore (succeeds) + if state["n"] == 3: + raise OSError("primary failure on c") + if state["n"] == 4: + raise PermissionError("rollback a denied") + return original_replace(src, dst) + + with 
mock.patch("os.replace", side_effect=_flaky_replace): + with pytest.raises(_RollbackError) as excinfo: + await target.write_all({"a": "new-a", "b": "new-b", "c": "new-c"}) + + # Best-effort: b's rollback ran and succeeded. + assert p2.read_text(encoding="utf-8") == "baseline-b" + err = excinfo.value + assert len(err.failures) == 1 + assert err.failures[0][0] == "a" + + +@pytest.mark.asyncio +async def test_write_all_rollback_uses_atomic_primitive(tmp_path: Path, monkeypatch): + """T8: critical regression. Rollback restore path must go through + _atomic_write_path (tmp + os.replace), not raw Path.write_text.""" + p1 = _write(tmp_path / "a.md", "baseline-a") + p2 = _write(tmp_path / "b.md", "baseline-b") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + + rollback_calls: list[str] = [] + original_atomic = TargetPrompt._atomic_write_path + + def _spy_atomic(path: str, content: str) -> None: + rollback_calls.append(path) + return original_atomic(path, content) + + monkeypatch.setattr(TargetPrompt, "_atomic_write_path", staticmethod(_spy_atomic)) + + original_replace = os.replace + state = {"n": 0} + + def _flaky_replace(src: str, dst: str) -> None: + state["n"] += 1 + if state["n"] == 2: + raise OSError("simulated") + return original_replace(src, dst) + + with mock.patch("os.replace", side_effect=_flaky_replace): + with pytest.raises(OSError, match="simulated"): + await target.write_all({"a": "new-a", "b": "new-b"}) + + # forward writes for a + b (2 calls), then rollback restore for a (1 call) = 3. + # If rollback used raw write_text, the third call would not appear. 
+ assert len(rollback_calls) == 3 + assert str(p1) in rollback_calls + assert p1.read_text(encoding="utf-8") == "baseline-a" + + +@pytest.mark.asyncio +async def test_write_all_keyboard_interrupt_during_callback_still_rolls_back(tmp_path: Path): + """KeyboardInterrupt is BaseException; except BaseException ensures + rollback still runs for path fields when interrupted mid-callback.""" + p1 = _write(tmp_path / "a.md", "baseline-a") + + async def _read() -> str: + return "" + + async def _write_fn(value: str) -> None: + raise KeyboardInterrupt() + + target = ( + TargetPrompt() + .add_path("a", str(p1)) + .add_callback("c", read=_read, write=_write_fn) + ) + + with pytest.raises(KeyboardInterrupt): + await target.write_all({"a": "new-a", "c": "new-c"}) + + assert p1.read_text(encoding="utf-8") == "baseline-a" + leftover = [f for f in os.listdir(tmp_path) if f.endswith(".tmp")] + assert leftover == [] + + +@pytest.mark.asyncio +async def test_write_all_no_tmp_left_after_rollback(tmp_path: Path): + """T7 extension: after forward fail + rollback success, no .tmp residue + anywhere in the directory.""" + p1 = _write(tmp_path / "a.md", "baseline-a") + p2 = _write(tmp_path / "b.md", "baseline-b") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + + original_replace = os.replace + state = {"n": 0} + + def _flaky_replace(src: str, dst: str) -> None: + state["n"] += 1 + if state["n"] == 2: + raise OSError("simulated") + return original_replace(src, dst) + + with mock.patch("os.replace", side_effect=_flaky_replace): + with pytest.raises(OSError): + await target.write_all({"a": "new-a", "b": "new-b"}) + + leftover = sorted(f for f in os.listdir(tmp_path) if f.endswith(".tmp")) + assert leftover == [] + assert p1.read_text(encoding="utf-8") == "baseline-a" + assert p2.read_text(encoding="utf-8") == "baseline-b" diff --git a/trpc_agent_sdk/evaluation/__init__.py b/trpc_agent_sdk/evaluation/__init__.py index 4c87990a..8f614b4b 100644 --- 
a/trpc_agent_sdk/evaluation/__init__.py +++ b/trpc_agent_sdk/evaluation/__init__.py @@ -179,10 +179,72 @@ from ._user_simulator_base import Status from ._user_simulator_base import UserSimulator from ._user_simulator_provider import UserSimulatorProvider +from ._agent_optimizer import AgentOptimizer +from ._base_optimizer import BaseOptimizer +from ._optimize_config import FrameworkStopConfig +from ._optimize_config import GepaReflectiveAlgo +from ._optimize_config import OptimizeConfig +from ._optimize_config import OptimizeConfigFile +from ._optimize_config import load_optimize_config +from ._optimize_evaluator_call import EvaluationOutcome +from ._optimize_evaluator_call import run_evaluator +from ._optimize_evaluator_call import summarize_outcome +from ._optimize_gepa_reflective import GepaReflectiveOptimizer +from ._optimize_metric_info import build_metric_reference_doc +from ._optimize_metric_info import build_metric_section +from ._optimize_metric_info import build_reflection_prompt_template +from ._optimize_model_callable import DEFAULT_OPTIMIZE_MAX_TOKENS +from ._optimize_model_callable import DEFAULT_OPTIMIZE_TEMPERATURE +from ._optimize_model_options import OptimizeModelOptions +from ._optimize_registry import OPTIMIZER_REGISTRY +from ._optimize_registry import OptimizerRegistry +from ._optimize_reporter import OptimizeReporter +from ._optimize_reporter import RoundView +from ._optimize_reporter import RunHeader +from ._optimize_reporter import create_reporter +from ._optimize_result import FinishReason +from ._optimize_result import OptimizeResult +from ._optimize_result import RoundKind +from ._optimize_result import RoundRecord +from ._optimize_result import RunStatus +from ._optimize_result import StopReason +from ._target_prompt import TargetPrompt from ._utils import EvalResultHandler from ._utils import MetricRunRecord +from . 
class _EvaluationCasesFailed(AssertionError):
    """Raised by ``_EvalExecuter._run`` when at least one eval case fails.

    Why it derives from :class:`AssertionError`:
        Callers that invoke ``AgentEvaluator.evaluate`` directly (for
        example pytest-based CI gates such as
        ``examples/optimization/ci_integration``) need no changes.
        ``except AssertionError`` and ``isinstance(exc, AssertionError)``
        still match, and the exception message is still the JSON failure
        summary that the former ``assert False, combined`` produced.

    Why it is a dedicated subclass:
        Optimizer-internal wrappers
        (``_optimize_evaluator_call.run_evaluator``) catch this exact type,
        so an unrelated ``AssertionError`` (e.g. a numpy ``assert allclose``)
        is not swallowed along with it.

    Why it is raised instead of asserted:
        ``python -O`` strips ``assert`` statements; an explicit ``raise``
        keeps the failure signal alive in optimized mode.
    """
Await executer.evaluate() then executer.get_result() for result. @@ -377,6 +399,12 @@ def get_executer( evaluation config JSON (file) or directory containing a single config JSON. When provided, overrides the dataset-local ``test_config.json`` convention for ALL discovered datasets. + print_summary_report: When False, suppress the Execution Details and + Evaluation Result tables normally printed at the end of a run. + The result is still computed and returned by ``get_result()``. + Defaults to True for direct callers; tools that drive the + evaluator inside a larger workflow (e.g. ``AgentOptimizer``) + pass False to keep their own output unmixed. Returns: _EvalExecuter: Await .evaluate() to run, then .get_result() for EvaluateResult. @@ -394,6 +422,7 @@ def get_executer( case_eval_parallelism=case_eval_parallelism, callbacks=callbacks, eval_metrics_file_path_or_dir=eval_metrics_file_path_or_dir, + print_summary_report=print_summary_report, ) @staticmethod diff --git a/trpc_agent_sdk/evaluation/_agent_optimizer.py b/trpc_agent_sdk/evaluation/_agent_optimizer.py new file mode 100644 index 00000000..16510e75 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_agent_optimizer.py @@ -0,0 +1,614 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""AgentOptimizer: business-facing entry point for prompt optimization. + +Mirrors :class:`AgentEvaluator`: business code calls +``AgentOptimizer.optimize(...)`` and the facade dispatches to the +algorithm registered under ``config.optimize.algorithm.name`` (looked +up in :data:`OPTIMIZER_REGISTRY`). Switching algorithms is a +single-field config change. 
+""" + +from __future__ import annotations + +import inspect +import logging +import os +import signal +import sys +import threading +import warnings +from datetime import datetime +from datetime import timezone +from pathlib import Path +from typing import Any +from typing import Optional +from typing import Sequence + +from ._eval_callbacks import Callbacks +from ._optimize_config import OptimizeConfigFile +from ._optimize_config import load_optimize_config +from ._optimize_registry import OPTIMIZER_REGISTRY +from ._optimize_reporter import RunHeader +from ._optimize_reporter import create_reporter +from ._optimize_result import OptimizeResult +from ._remote_eval_service import CallAgent +from ._target_prompt import TargetPrompt + +# Metrics incompatible with call_agent (black-box) mode because their +# evaluators need data RemoteEvalService doesn't capture: +# - ``tool_trajectory_avg_score``: per-step tool call traces. +# - ``llm_rubric_knowledge_recall``: tool responses from +# ``Invocation.intermediate_data`` (RemoteEvalService leaves it None, +# so the judge would always see "No knowledge search results were +# found." for every case). +_DISALLOWED_METRICS_IN_CALL_AGENT_MODE = frozenset({ + "tool_trajectory_avg_score", + "llm_rubric_knowledge_recall", +}) + +_PROMPT_FILE_LOGGER = logging.getLogger("trpc_agent_sdk.optimizer") + + +def _atomic_write_text(path: str, content: str) -> None: + """Atomically replace ``path`` with ``content`` (UTF-8). + + Writes to a sibling ``.tmp`` then ``os.replace`` to swap into + place — POSIX guarantees rename is atomic, so a process kill or + power loss between the write and the rename leaves ``path`` either + pristine (pre-call content, or missing if it did not exist) or + fully updated, never half-written. Mirrors + :meth:`TargetPrompt._atomic_write_path` so artifact persistence + enjoys the same crash safety as source rollback. 
class _mask_sigint:
    """Context manager that ignores SIGINT for the duration of the block.

    Used by :meth:`AgentOptimizer._persist_artifacts` so a second Ctrl+C
    during teardown cannot interrupt artifact writes between ``os.replace``
    boundaries. The previous handler is restored on exit, including when
    the block raises.

    The context degrades to a no-op (instead of crashing) when masking is
    not possible or not safely reversible:

    - outside the main thread, where ``signal.signal`` is unavailable;
    - on platforms where installing the handler raises ``ValueError`` or
      ``OSError``;
    - when the current SIGINT handler was not installed from Python
      (``signal.getsignal`` returns ``None``): such a handler cannot be put
      back through ``signal.signal``, so it is left untouched.

    In those cases only the second-Ctrl+C protection is lost; the
    underlying atomic writes are unaffected.
    """

    def __init__(self) -> None:
        self._previous = None
        self._installed = False

    def __enter__(self) -> "_mask_sigint":
        # signal.signal() only works in the main thread of the main interpreter.
        if threading.current_thread() is not threading.main_thread():
            return self
        try:
            if signal.getsignal(signal.SIGINT) is None:
                # Foreign (non-Python) handler: signal.signal() would hand us
                # ``None`` back, and ``None`` is not a valid handler to
                # restore. Skip masking rather than clobber it for good.
                return self
            self._previous = signal.signal(signal.SIGINT, signal.SIG_IGN)
            self._installed = True
        except (ValueError, OSError):  # pragma: no cover - platform fallback
            # ValueError: not main thread on some platforms; OSError: signal
            # not supported (rare embedded interpreters). Either way, leave
            # SIGINT as-is; persistence is still best-effort.
            self._installed = False
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        if not self._installed:
            return
        # Reset state first so the instance is reusable and a failed restore
        # is never retried with stale data.
        previous = self._previous
        self._previous = None
        self._installed = False
        try:
            signal.signal(signal.SIGINT, previous)
        except (ValueError, OSError, TypeError):  # pragma: no cover - platform fallback
            # TypeError covers a ``None`` previous handler slipping through;
            # teardown must never raise on top of the block's own outcome.
            pass
@classmethod
async def optimize(
    cls,
    *,
    config_path: str,
    call_agent: CallAgent,
    target_prompt: TargetPrompt,
    train_dataset_path: str,
    validation_dataset_path: str,
    output_dir: str,
    callbacks: Optional[Callbacks] = None,
    update_source: bool = False,
    verbose: int = 1,
    extra_stop_callbacks: Optional[Sequence[Any]] = None,
    extra_gepa_callbacks: Optional[Sequence[Any]] = None,
) -> OptimizeResult:
    """Load the config file at ``config_path`` and run the selected algorithm.

    Args:
        config_path: Path to the optimizer JSON config file.
        call_agent: Async callable mapping a user query to an agent response.
        target_prompt: Registry of prompt fields to optimize.
        train_dataset_path: Path to the training eval set file.
        validation_dataset_path: Path to the validation eval set file (must
            differ from ``train_dataset_path``).
        output_dir: Required artifact directory. The facade creates it when
            missing and persists ``result.json``, ``summary.txt``,
            ``rounds/`` records, ``baseline_prompts/`` and ``best_prompts/``
            directories, a ``config.snapshot.json`` copy of the input
            config, and a ``run.log`` summary line.
        callbacks: Optional evaluator lifecycle callbacks.
        update_source: When True, persist the best candidate back to
            every registered TargetPrompt field after a SUCCEEDED
            run; when False (default), source files keep their
            baseline content. ``OptimizeResult.best_prompts`` always
            carries the best text regardless, so callers can review
            the proposal before deciding to write back.
        verbose: Reporter verbosity. ``0`` suppresses terminal
            output (artifact persistence still happens). ``1``
            (default): Rich panel header + per-round line + closing
            summary, falling back to ASCII when ``rich`` is missing.
            ``2`` adds gepa-internal log forwarding on the
            ``trpc_agent_sdk.optimizer.gepa`` logger.
        extra_stop_callbacks: Runtime-only stoppers appended after
            gepa-native stoppers. Useful for SLO monitors / kill
            switches. Plain callables surface as
            ``stop_reason="completed"``; wrap in
            ``_LabeledStopper`` (or expose a ``.label`` attribute
            matching :data:`StopReason`) for a stable classification.
        extra_gepa_callbacks: Runtime-only gepa event callbacks
            appended after the framework's built-in
            ``_AgentGEPACallback`` (e.g. forwarding events to a
            dashboard). Each entry should implement the
            ``gepa.core.callback.GEPACallback`` protocol; gepa
            silently ignores callbacks missing a method it invokes.

    Returns:
        The ``OptimizeResult`` produced by the selected algorithm.

    Raises:
        FileNotFoundError: if ``config_path`` does not exist.
        pydantic.ValidationError: if the config violates schema constraints.
        ValueError: if ``optimize`` section is missing; if the requested
            ``algorithm.name`` is not registered (message lists every
            algorithm currently in ``OPTIMIZER_REGISTRY.list_registered()``);
            if ``target_prompt`` has no registered fields; if a metric
            requiring session traces is configured under call_agent mode; or
            if ``train_dataset_path`` and ``validation_dataset_path`` resolve
            to the same file (train-test leakage guard).
        TypeError: if ``call_agent`` is not an ``async`` callable.
    """
    cls._precheck_algorithm_name(config_path)
    config = load_optimize_config(config_path)
    cls._validate_inputs(
        config=config,
        call_agent=call_agent,
        target_prompt=target_prompt,
        train_dataset_path=train_dataset_path,
        validation_dataset_path=validation_dataset_path,
        output_dir=output_dir,
    )
    os.makedirs(output_dir, exist_ok=True)

    algorithm_name = config.optimize.algorithm.name
    algorithm_cls = OPTIMIZER_REGISTRY.get(algorithm_name)
    optimizer = algorithm_cls(
        config=config,
        call_agent=call_agent,
        target_prompt=target_prompt,
        train_dataset_path=train_dataset_path,
        validation_dataset_path=validation_dataset_path,
        callbacks=callbacks,
        output_dir=output_dir,
        extra_stop_callbacks=extra_stop_callbacks,
        extra_gepa_callbacks=extra_gepa_callbacks,
    )

    reporter = create_reporter(verbose=verbose, stream=sys.stdout)
    baseline_snapshot = await target_prompt.read_all()
    header = cls._build_run_header(
        algorithm=algorithm_name,
        target_prompt=target_prompt,
        config=config,
        train_dataset_path=train_dataset_path,
        validation_dataset_path=validation_dataset_path,
        output_dir=output_dir,
    )
    cls._safe_reporter_call(reporter.run_started, header)

    result: Optional[OptimizeResult] = None
    # ``cleanup_done`` gates whether the ``finally`` block must restore
    # baseline. It flips to True after EITHER (a) write_all(best) succeeded
    # (so sources already hold the desired content and no restore is
    # needed) OR (b) the ``except`` branch successfully wrote baseline back
    # as part of its rollback. A *successful* baseline write is therefore
    # never repeated -- important for callback-backed fields whose write_fn
    # may be non-idempotent (version counters, audit log entries). Only a
    # failed rollback is retried once by ``finally``.
    cleanup_done = False
    run_error: Optional[BaseException] = None
    try:
        try:
            result = await optimizer.run(reporter=reporter)
        except BaseException as ex:
            # Remember the failure for artifacts / reporter, then propagate.
            run_error = ex
            raise

        if update_source and result.status == "SUCCEEDED":
            # write_all is atomic for path-backed sources (tmp +
            # os.replace, rollback on partial failure). If it raises,
            # sources may sit at an intermediate candidate from the
            # last in-run evaluation -- restore baseline explicitly
            # then re-raise so the caller sees the write failure.
            try:
                await target_prompt.write_all(result.best_prompts)
                cleanup_done = True
            except Exception:
                try:
                    await target_prompt.write_all(baseline_snapshot)
                    cleanup_done = True
                except Exception:  # pragma: no cover - defensive guard
                    # Keep the original write failure as the raised error,
                    # but leave a trace that the rollback also failed.
                    _PROMPT_FILE_LOGGER.warning(
                        "failed to restore baseline prompts after write-back failure",
                        exc_info=True,
                    )
                raise
    finally:
        if not cleanup_done:
            # Best-effort restore: never mask the underlying run/write error.
            try:
                await target_prompt.write_all(baseline_snapshot)
            except Exception:  # pragma: no cover - defensive guard
                _PROMPT_FILE_LOGGER.warning(
                    "failed to restore baseline prompts; sources may hold a candidate prompt",
                    exc_info=True,
                )

        # Artifact persistence is best-effort as well: an error raised here
        # would replace the in-flight run/write exception, so log instead.
        try:
            cls._persist_artifacts(
                result=result,
                baseline_snapshot=baseline_snapshot,
                output_dir=output_dir,
                config_path=config_path,
                run_error=run_error,
                update_source=update_source,
            )
        except Exception:  # pragma: no cover - defensive guard
            _PROMPT_FILE_LOGGER.warning("failed to persist run artifacts", exc_info=True)
        cls._emit_reporter_finish(
            reporter=reporter,
            result=result,
            baseline_snapshot=baseline_snapshot,
            output_dir=output_dir,
            update_source=update_source,
            run_error=run_error,
        )
    return result
@classmethod
def _emit_reporter_finish(
    cls,
    *,
    reporter,
    result: Optional[OptimizeResult],
    baseline_snapshot: dict[str, str],
    output_dir: str,
    update_source: bool,
    run_error: Optional[BaseException],
) -> None:
    """Send the closing event to the reporter.

    Calls ``reporter.run_finished`` when a result exists, otherwise
    ``reporter.run_failed`` with a copy of the baseline prompts and an
    error message. Both calls go through ``_safe_reporter_call``.

    Args:
        reporter: Object exposing ``run_finished`` and ``run_failed``.
        result: Outcome of the run, or ``None`` when no result was produced.
        baseline_snapshot: Pre-run prompt text per field name.
        output_dir: Artifact directory forwarded to the reporter.
        update_source: Forwarded to ``run_finished``.
        run_error: Exception that aborted the run, if any.
    """
    if result is not None:
        cls._safe_reporter_call(
            reporter.run_finished,
            result,
            output_dir=output_dir,
            update_source=update_source,
        )
        return

    if run_error is None:
        message = "optimization failed"
    else:
        # Exceptions such as KeyboardInterrupt() or CancelledError() carry
        # no message; fall back to the type name so the report is not blank.
        message = str(run_error) or type(run_error).__name__
    cls._safe_reporter_call(
        reporter.run_failed,
        baseline_prompts=dict(baseline_snapshot),
        output_dir=output_dir,
        error_message=message,
    )
artifacts under ``output_dir``. + + Layout: + - ``result.json`` Full OptimizeResult JSON. + - ``summary.txt`` Human-readable summary. + - ``rounds/round_.json`` One file per RoundRecord. + - ``baseline_prompts/.md`` Pre-run snapshot of every + TargetPrompt field + (regardless of update_source). + - ``best_prompts/.md`` Best candidate per field + (only when a result was produced). + - ``config.snapshot.json`` Copy of the input config. + - ``run.log`` One-line status footer. + + SIGINT (Ctrl+C) is masked for the duration of this method so a + second Ctrl+C during persistence cannot leave half-written + artifacts. All files are written atomically (tmp + os.replace), + so even if SIGKILL or a power loss interrupts the process the + output_dir never contains a partially-written file (only a + ``.tmp`` sibling that the next run can ignore). Missing pieces + (e.g. ``best_prompts`` on early failure) are silently omitted. + """ + with _mask_sigint(): + cls._write_baseline_prompts(baseline_snapshot, output_dir) + cls._copy_config_snapshot(config_path, output_dir) + + if result is None: + cls._write_run_log( + output_dir=output_dir, + line=cls._render_failure_log_line(run_error), + ) + return + + try: + _atomic_write_text( + os.path.join(output_dir, "result.json"), + result.model_dump_json(indent=2, by_alias=True), + ) + except Exception: # pragma: no cover - defensive guard for write errors + _PROMPT_FILE_LOGGER.warning("failed to write result.json", exc_info=True) + + try: + summary_text = result.format_summary(output_dir=output_dir, update_source=update_source) + _atomic_write_text(os.path.join(output_dir, "summary.txt"), summary_text) + except Exception: # pragma: no cover + _PROMPT_FILE_LOGGER.warning("failed to write summary.txt", exc_info=True) + + cls._write_rounds_directory(result, output_dir) + cls._write_best_prompts(result, output_dir) + cls._write_run_log( + output_dir=output_dir, + line=cls._render_success_log_line(result), + ) + + @staticmethod + def 
@staticmethod
def _write_best_prompts(result: OptimizeResult, output_dir: str) -> None:
    """Write each best prompt to ``<output_dir>/best_prompts/<name>.md``.

    Best-effort: a failure to create the directory or to write a single
    file is logged as a warning and never raised, matching the handling of
    the other artifact writers. This matters because the caller runs during
    run teardown, where an escaping error would hide the real outcome.

    Args:
        result: Run result whose ``best_prompts`` mapping (field name ->
            prompt text) is persisted.
        output_dir: Artifact directory of the run.
    """
    best_dir = os.path.join(output_dir, "best_prompts")
    try:
        os.makedirs(best_dir, exist_ok=True)
    except OSError:  # pragma: no cover
        # Without the directory no file can be written; report once and stop.
        _PROMPT_FILE_LOGGER.warning("failed to create directory %s", best_dir, exc_info=True)
        return
    for name, content in result.best_prompts.items():
        path = os.path.join(best_dir, f"{name}.md")
        try:
            _atomic_write_text(path, content)
        except Exception:  # pragma: no cover
            _PROMPT_FILE_LOGGER.warning("failed to write best prompt %s", name, exc_info=True)
@staticmethod
def _render_failure_log_line(run_error: Optional[BaseException]) -> str:
    """Build the one-line ``run.log`` entry for a run without a result.

    Format: ``<UTC ISO-8601 timestamp> status=FAILED error=<repr of message>``.

    Args:
        run_error: Exception that aborted the run, or ``None`` when the
            failure left no exception behind.

    Returns:
        The log line, without a trailing newline.
    """
    if run_error is None:
        msg = "optimization failed before result"
    else:
        # Message-less exceptions (KeyboardInterrupt(), CancelledError())
        # stringify to ""; use the type name so the log says what happened.
        msg = str(run_error) or type(run_error).__name__
    return (f"{datetime.now(timezone.utc).isoformat()} status=FAILED "
            f"error={msg!r}")
@staticmethod
def _validate_inputs(
    *,
    config,
    call_agent: CallAgent,
    target_prompt: TargetPrompt,
    train_dataset_path: str,
    validation_dataset_path: str,
    output_dir: str,
) -> None:
    """Startup-time fail-fast checks.

    Reports actionable error messages so misconfigurations surface before
    any LLM call is made.

    Args:
        config: Loaded optimizer config; ``config.evaluate.get_eval_metrics()``
            and ``config.optimize.algorithm`` are read.
        call_agent: Agent entry point; must be an async callable.
        target_prompt: Prompt registry; must expose at least one name.
        train_dataset_path: Path to the training eval set.
        validation_dataset_path: Path to the validation eval set.
        output_dir: Artifact directory; must be a non-empty string.

    Raises:
        ValueError: empty ``output_dir``; no registered prompt fields; train
            and validation paths resolving to the same file; or a metric
            that call_agent mode cannot support.
        TypeError: ``call_agent`` is not an async callable.

    Warns:
        UserWarning: ``use_merge`` is enabled for ``gepa_reflective`` while
            fewer than two prompt fields are registered.
    """
    if not output_dir or not isinstance(output_dir, str):
        raise ValueError("output_dir is required and must be a non-empty path; "
                         "pass output_dir='runs/' or similar.")

    if not target_prompt.names():
        raise ValueError("TargetPrompt has no registered fields; "
                         "call .add_path(...) or .add_callback(...) before optimize().")

    # Accept async functions (inspect unwraps functools.partial itself),
    # decorated callables exposing ``__wrapped__``, and callable objects
    # whose class defines ``async def __call__``.
    is_async = inspect.iscoroutinefunction(call_agent)
    if not is_async:
        wrapped = getattr(call_agent, "__wrapped__", None)
        is_async = wrapped is not None and inspect.iscoroutinefunction(wrapped)
    if not is_async:
        # Look on the type: for plain functions this is a C slot wrapper
        # (not a coroutine function), for async-callable objects it is the
        # user's ``async def __call__``.
        call_method = getattr(type(call_agent), "__call__", None)
        is_async = call_method is not None and inspect.iscoroutinefunction(call_method)
    if not is_async:
        raise TypeError("call_agent must be an async callable (async def or "
                        "Callable returning Awaitable[str]); "
                        f"got {type(call_agent).__name__}.")

    # Resolve to canonical absolute paths so trivially-different strings
    # ('./x', 'x') and symlinks to the same file still collide
    # (train-validation leakage guard).
    train_norm = os.path.realpath(train_dataset_path)
    val_norm = os.path.realpath(validation_dataset_path)
    if train_norm == val_norm:
        raise ValueError("train_dataset_path and validation_dataset_path resolve to the "
                         f"same file ({train_norm}); use distinct datasets to avoid "
                         "train-validation leakage.")

    # call_agent (black-box) mode can't supply session traces or
    # tool intermediate_data. ``get_eval_metrics()`` normalizes both
    # 'metrics' and 'criteria' encodings so this check is uniform.
    for metric in config.evaluate.get_eval_metrics():
        if metric.metric_name in _DISALLOWED_METRICS_IN_CALL_AGENT_MODE:
            raise ValueError(f"Metric '{metric.metric_name}' requires session "
                             "traces or tool intermediate data, which call_agent "
                             "(black-box) mode does not capture; remove it from "
                             "evaluate.metrics or switch to a response-based metric "
                             "(e.g. final_response_avg_score, llm_rubric_response, "
                             "llm_final_response).")

    # gepa merge degenerates to "pick one of two parents" with a single
    # component, never producing new candidates. Warn instead of error
    # so existing benign configs keep running; user gets a clear hint
    # that merge_rounds_total will be 0.
    algo = config.optimize.algorithm
    if (getattr(algo, "name", None) == "gepa_reflective" and getattr(algo, "use_merge", False)
            and len(target_prompt.names()) < 2):
        warnings.warn(
            "use_merge=true requires TargetPrompt to register at least 2 "
            "fields. With a single field, gepa merge degenerates to "
            "picking one of the two parents and never creates new "
            "candidates (merge_rounds_total stays 0). Set use_merge=false "
            "or register more prompt fields. See "
            "examples/optimization/advanced_strategies/README.md §6.1.",
            UserWarning,
            stacklevel=2,
        )
@staticmethod
def resolve_required_thresholds(
    stop_config: FrameworkStopConfig,
    metric_thresholds: dict[str, float],
) -> dict[str, float]:
    """Select the thresholds that the framework stop policy must enforce.

    How ``stop_config.required_metrics`` is interpreted:
        - ``None`` or an empty list: nothing is required, ``{}`` is returned.
        - a non-empty list: only entries of ``metric_thresholds`` whose name
          appears in the list are kept; listed names that have no threshold
          are dropped without error.
        - anything else (the ``"all"`` setting): a copy of every threshold.

    Args:
        stop_config: Stop policy carrying ``required_metrics``.
        metric_thresholds: Threshold per metric name.

    Returns:
        A new dict; ``metric_thresholds`` itself is never returned or mutated.
        Intended to be passed on to :meth:`metrics_meet_thresholds`.
    """
    wanted = stop_config.required_metrics
    if wanted is None:
        return {}
    if not isinstance(wanted, list):
        return dict(metric_thresholds)

    # An empty list yields an empty selection through the same loop.
    names = set(wanted)
    selected: dict[str, float] = {}
    for metric, threshold in metric_thresholds.items():
        if metric in names:
            selected[metric] = threshold
    return selected
class GepaReflectiveAlgo(EvalBaseModel):
    """Configuration block for the ``gepa_reflective`` algorithm.

    Field names mirror ``gepa.optimize`` parameters and gepa
    ``StopperProtocol`` constructor arguments so config maps to gepa
    docs directly.

    The last six fields (``max_metric_calls`` through
    ``max_tracked_candidates``) are the stop conditions; validation fails
    unless at least one of them is set.
    """

    name: Literal["gepa_reflective"] = Field(description="Algorithm discriminator tag.", )

    seed: int = Field(
        default=42,
        description="Random seed forwarded to gepa.optimize(seed=...).",
    )
    reflection_lm: OptimizeModelOptions = Field(
        description=("LLM gepa uses to reflect on failed cases and propose new prompts. "
                     "Forwarded to gepa.optimize(reflection_lm=...)."), )

    candidate_selection_strategy: Literal[
        "pareto",
        "current_best",
        "epsilon_greedy",
        "top_k_pareto",
    ] = Field(
        default="pareto",
        description="Strategy gepa uses to pick the parent candidate each round.",
    )
    module_selector: str = Field(
        default="round_robin",
        description="Component selector passed to gepa (e.g. 'round_robin', 'all').",
    )
    frontier_type: Literal["instance", "objective", "hybrid", "cartesian"] = Field(
        default="instance",
        description="Pareto frontier tracking granularity forwarded to gepa.",
    )
    reflection_minibatch_size: Optional[int] = Field(
        default=None,
        description="Per-round minibatch size for the reflective dataset; None lets gepa decide.",
    )
    reflection_history_top_k: int = Field(
        default=2,
        ge=0,
        le=5,
        description=("How many historical best traces per case to expose to the "
                     "reflection LM as the ``history_top_k`` record field. 0 "
                     "disables the feature. Capped at 5 to bound prompt-token "
                     "growth — for K=2 a typical multi-turn case grows ~30%."),
    )
    perfect_score: float = Field(
        default=1.0,
        description="Score considered 'perfect' for skip_perfect_score decisions.",
    )
    skip_perfect_score: bool = Field(
        default=True,
        description="Whether gepa skips optimizing instances that already score perfect.",
    )

    use_merge: bool = Field(
        default=False,
        description="Whether to enable gepa merge-based candidate proposals.",
    )
    max_merge_invocations: int = Field(
        default=5,
        description="Maximum merge invocations when use_merge is true.",
    )
    merge_val_overlap_floor: int = Field(
        default=5,
        description="Minimum shared validation ids required before attempting a merge subsample.",
    )

    cache_evaluation: bool = Field(
        default=False,
        description="Cache (candidate, case) scores so repeated evaluations skip the metric call.",
    )
    track_best_outputs: bool = Field(
        default=False,
        description="Track per-case best outputs alongside the best candidate.",
    )

    # --- Stop conditions: six fields, at least one must be non-None. ---
    max_metric_calls: Optional[int] = Field(
        default=None,
        description=("Stop after this many metric calls (one metric call = one case-level "
                     "evaluation). Mapped to gepa MaxMetricCallsStopper. At least one of the "
                     "six stop conditions on this object must be set."),
    )
    max_iterations_without_improvement: Optional[int] = Field(
        default=None,
        description=("Stop after this many consecutive iterations whose best valset score "
                     "did not improve. Mapped to gepa NoImprovementStopper."),
    )
    timeout_seconds: Optional[float] = Field(
        default=None,
        description=("Stop after this many wall-clock seconds. Mapped to gepa "
                     "TimeoutStopCondition."),
    )
    score_threshold: Optional[float] = Field(
        default=None,
        description=("Stop once the best valset score reaches this threshold. Mapped to "
                     "gepa ScoreThresholdStopper."),
    )
    max_candidate_proposals: Optional[int] = Field(
        default=None,
        description=("Stop after this many candidate proposals. Mapped to gepa "
                     "MaxCandidateProposalsStopper."),
    )
    max_tracked_candidates: Optional[int] = Field(
        default=None,
        description=("Stop once the candidate pool reaches this size. Mapped to gepa "
                     "MaxTrackedCandidatesStopper."),
    )

    @model_validator(mode="after")
    def _require_at_least_one_stop_condition(self) -> "GepaReflectiveAlgo":
        """Reject a configuration in which every stop condition is unset.

        Raises:
            ValueError: when all six stop-condition fields are ``None``.
        """
        stop_conditions = (
            self.max_metric_calls,
            self.max_iterations_without_improvement,
            self.timeout_seconds,
            self.score_threshold,
            self.max_candidate_proposals,
            self.max_tracked_candidates,
        )
        # ``is None`` rather than truthiness: an explicit 0 counts as "set".
        if all(value is None for value in stop_conditions):
            raise ValueError("gepa_reflective requires at least one stop condition: set one of "
                             "max_metric_calls / max_iterations_without_improvement / "
                             "timeout_seconds / score_threshold / max_candidate_proposals / "
                             "max_tracked_candidates.")
        return self
Threshold values come from + ``evaluate.metrics[].threshold``; this section only decides which + metrics participate. + + Pass-rate-based stopping is not exposed here because every supported + engine has an equivalent native field (e.g. ``algorithm.score_threshold`` + for gepa_reflective). + + Field values for ``required_metrics``: + - ``"all"`` (default): every metric in ``evaluate.metrics[]`` + must meet its threshold. + - ``list[str]``: only the listed metrics must meet thresholds. + Each name must match an entry in + ``evaluate.metrics[].metric_name`` (validated by + :class:`OptimizeConfigFile`). Empty list disables the policy. + - ``None``: disable the policy entirely; the run finishes only + via algorithm-native stop conditions. + """ + + required_metrics: Optional[Union[Literal["all"], list[str]]] = Field( + default="all", + description=("Metrics whose thresholds must be met on the validation set " + "before the framework asks the algorithm to stop. 'all' means " + "every metric in evaluate.metrics[]; a list narrows the set; " + "None or [] disables the policy."), + ) + + +class OptimizeConfig(EvalBaseModel): + """Algorithm-agnostic optimizer section. + + Holds switches the framework itself consumes; algorithm-specific + knobs live under :attr:`algorithm` so different algorithms can + expose entirely different field sets without polluting one another. + + To add a second algorithm: + 1. Define ``MyAlgo(EvalBaseModel)`` with ``name: Literal["my_algo"]``. + 2. Replace :attr:`algorithm` type with:: + + algorithm: Annotated[ + Union[GepaReflectiveAlgo, MyAlgo], + Field(discriminator="name"), + ] + + pydantic v2 then routes validation by the ``name`` tag and + rejects unknown algorithm names with a clear error. 
class OptimizeConfigFile(EvalBaseModel):
    """Root model of an optimizer JSON config: ``evaluate`` plus ``optimize``.

    After field validation, a cross-field check ensures that an explicit
    ``optimize.stop.required_metrics`` list only names metrics returned
    by ``evaluate.get_eval_metrics()``.
    """

    evaluate: EvalConfig = Field(
        description="Evaluator section: same schema as evaluator's EvalConfig.",
    )
    optimize: OptimizeConfig = Field(
        description="Optimizer section: framework switches plus the algorithm block.",
    )

    @model_validator(mode="after")
    def _validate_required_metrics_against_evaluate(self) -> "OptimizeConfigFile":
        """Fail when ``required_metrics`` lists a metric the evaluator lacks.

        Only a non-empty list is checked; ``None``, ``"all"`` and ``[]``
        pass through untouched.

        Raises:
            ValueError: naming the unknown metrics and the available ones.
        """
        requested = self.optimize.stop.required_metrics
        if isinstance(requested, list) and requested:
            known_names = set()
            for eval_metric in self.evaluate.get_eval_metrics():
                known_names.add(eval_metric.metric_name)

            missing = [metric_name for metric_name in requested if metric_name not in known_names]
            if missing:
                raise ValueError("stop.required_metrics references unknown metric(s) "
                                 f"{missing}; available metrics from evaluate.metrics[]: "
                                 f"{sorted(known_names)}")
        return self
+ """ + with open(path, "r", encoding="utf-8") as f: + content = f.read() + return OptimizeConfigFile.model_validate_json(content) diff --git a/trpc_agent_sdk/evaluation/_optimize_evaluator_call.py b/trpc_agent_sdk/evaluation/_optimize_evaluator_call.py new file mode 100644 index 00000000..73b46148 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_optimize_evaluator_call.py @@ -0,0 +1,136 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Optimizer-facing wrapper around AgentEvaluator.""" + +from __future__ import annotations + +from dataclasses import dataclass +from dataclasses import field +from statistics import mean +from typing import Optional + +from ._agent_evaluator import AgentEvaluator +from ._agent_evaluator import _EvaluationCasesFailed +from ._eval_callbacks import Callbacks +from ._eval_metrics import EvalStatus +from ._eval_result import EvaluateResult +from ._remote_eval_service import CallAgent + + +@dataclass(frozen=True) +class EvaluationOutcome: + """Summary metrics extracted from an EvaluateResult for the optimizer. + + Attributes: + pass_rate: Fraction of cases whose final_eval_status is PASSED. + tiebreaker: Mean of all per-case metric scores; used when pass_rate ties. + metric_breakdown: Mean score per metric name across all cases. + failed_case_ids: Eval ids of cases that did not pass; duplicated across runs. + judge_model_calls: Currently always 0; the evaluator does not surface per-judge invocation counts. + raw_result: The original EvaluateResult for downstream inspection. 
def summarize_outcome(result: EvaluateResult) -> EvaluationOutcome:
    """Collapse an EvaluateResult into the aggregate numbers the optimizer reads.

    Every run of every case in every eval set is visited once. A run
    counts as passed only when its ``final_eval_status`` equals
    ``EvalStatus.PASSED``; otherwise its eval id is appended to
    ``failed_case_ids`` (once per non-passing run, so ids may repeat).
    Metric results whose ``score`` is ``None`` are ignored.

    Returns:
        EvaluationOutcome with ``pass_rate`` (0.0 when there are no runs),
        ``tiebreaker`` (mean over all collected scores, 0.0 when none),
        ``metric_breakdown`` (mean per metric name),
        ``judge_model_calls`` fixed at 0, and ``raw_result`` set to ``result``.
    """
    run_count = 0
    pass_count = 0
    not_passed: list[str] = []
    per_metric: dict[str, list[float]] = {}

    for eval_set in result.results_by_eval_set_id.values():
        for case_id, case_runs in eval_set.eval_results_by_eval_id.items():
            for case_run in case_runs:
                run_count += 1
                if case_run.final_eval_status == EvalStatus.PASSED:
                    pass_count += 1
                else:
                    not_passed.append(case_id)

                for metric_result in case_run.overall_eval_metric_results:
                    if metric_result.score is not None:
                        if metric_result.metric_name not in per_metric:
                            per_metric[metric_result.metric_name] = []
                        per_metric[metric_result.metric_name].append(metric_result.score)

    # Build the per-metric means and the flat score pool in a single pass,
    # keeping the pool in metric-insertion order.
    breakdown: dict[str, float] = {}
    pooled: list[float] = []
    for metric_name, values in per_metric.items():
        breakdown[metric_name] = mean(values)
        pooled.extend(values)

    return EvaluationOutcome(
        pass_rate=(pass_count / run_count) if run_count > 0 else 0.0,
        tiebreaker=mean(pooled) if pooled else 0.0,
        metric_breakdown=breakdown,
        failed_case_ids=not_passed,
        judge_model_calls=0,
        raw_result=result,
    )
+ eval_metrics_path: Path to a shared metrics config file; None falls back to dataset-local config. + call_agent: Async function that maps a user query to an agent response. + callbacks: Optional lifecycle callbacks passed through to the evaluator. + num_runs: Number of runs per eval set. + case_parallelism: Max concurrent cases for inference; None lets the + evaluator use its default. Plumbs ``optimize.eval_case_parallelism`` + through to :meth:`AgentEvaluator.get_executer`. + + Returns: + EvaluationOutcome with extracted pass_rate / tiebreaker / metric_breakdown / failed_case_ids. + """ + executer = AgentEvaluator.get_executer( + eval_dataset_path, + call_agent=call_agent, + callbacks=callbacks, + num_runs=num_runs, + print_detailed_results=False, + print_summary_report=False, + eval_result_output_dir=None, + eval_metrics_file_path_or_dir=eval_metrics_path, + case_parallelism=case_parallelism, + ) + # _EvaluationCasesFailed signals "some cases failed" — the evaluator has + # already populated ``executer.get_result()`` before raising, so we swallow + # this specific subclass and let the optimizer keep iterating. Any other + # exception (FileNotFoundError, network error, third-party AssertionError, + # ...) is a real failure and must propagate: silently substituting an empty + # EvaluateResult would make the optimizer see a 0.0 pass_rate and continue + # optimizing against phantom data. + try: + await executer.evaluate() + except _EvaluationCasesFailed: + pass + result = executer.get_result() + if result is None: + # _run raised before populating self._result. This only happens on a + # real upstream error (which would have re-raised above) or a logic + # bug. Return an empty outcome rather than crash, but the path is + # defensive — not a normal control-flow branch. 
+ result = EvaluateResult() + return summarize_outcome(result) diff --git a/trpc_agent_sdk/evaluation/_optimize_gepa_adapter.py b/trpc_agent_sdk/evaluation/_optimize_gepa_adapter.py new file mode 100644 index 00000000..34fc4dd2 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_optimize_gepa_adapter.py @@ -0,0 +1,794 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""GEPA protocol adapter and reflective-dataset builder. + +Implements ``gepa.core.adapter.GEPAAdapter`` so gepa's main loop can +drive evaluation through the framework's ``AgentEvaluator``. The +adapter stays decoupled from any specific gepa algorithm class so +gepa-family algorithms can reuse it without duplicating evaluator I/O. + +:meth:`_AgentGEPAAdapter.make_reflective_dataset` renders each failed +case into a turn-sliced markdown record +(``{case_id, score, "Case Body", "Other Active Components"?}``) tuned +for the reflection LM in multi-component / multi-turn / multi-run / +tool-using scenarios. + +``gepa`` is an optional dependency: ``EvaluationBatch`` is imported +lazily inside :meth:`_AgentGEPAAdapter.evaluate`, so importing this +module without ``gepa`` installed succeeds but ``evaluate`` then fails +fast. 
+""" + +from __future__ import annotations + +import asyncio +import tempfile +import uuid +from pathlib import Path +from typing import Any +from typing import Mapping +from typing import Optional +from typing import Sequence + +from ._eval_callbacks import Callbacks +from ._eval_case import EvalCase +from ._eval_case import Invocation +from ._eval_case import get_all_tool_calls +from ._eval_case import get_all_tool_responses +from ._eval_config import EvalConfig +from ._eval_metrics import EvalStatus +from ._eval_metrics import PrebuiltMetrics +from ._eval_result import EvalCaseResult +from ._eval_result import EvalMetricResult +from ._eval_result import EvaluateResult +from ._eval_set import EvalSet +from ._optimize_evaluator_call import run_evaluator +from ._remote_eval_service import CallAgent +from ._target_prompt import TargetPrompt + + +def _extract_case_output(case_result: EvalCaseResult) -> str: + """Return the agent's final response text from the first per-invocation entry. + + Used to populate ``EvaluationBatch.outputs`` — GEPA reads that field + directly to decide whether a candidate's behaviour improved on a case + even before consulting the trajectory or score. 
+ """ + per_inv = case_result.eval_metric_result_per_invocation or [] + if not per_inv: + return "" + actual = per_inv[0].actual_invocation + if not actual or not actual.final_response or not actual.final_response.parts: + return "" + return "\n".join((p.text or "") for p in actual.final_response.parts).strip() + + +def _invocation_text(invocation: Optional[Invocation], *, user: bool) -> str: + """Concatenate a single invocation's user_content or final_response text.""" + if invocation is None: + return "" + content = invocation.user_content if user else invocation.final_response + if content is None or not content.parts: + return "" + return "\n".join((p.text or "") for p in content.parts).strip() + + +def _render_metric_lines(metrics: Sequence[EvalMetricResult]) -> list[str]: + """Render one block of per-metric verdict lines for a turn or aggregate. + + Drives both per-invocation blocks (``### Turn N``) inside + :func:`_build_turn_block` and the case-level aggregate block + (``### Overall``) inside :func:`_build_overall_block`. Each metric + occupies one ``[PASS|FAIL] name: score=..., threshold=...`` line; + optional ``reason`` and rubric sub-score lines are nested below it. + """ + lines: list[str] = [] + for metric in metrics: + status = _format_status(metric.eval_status) + score_str = f"{metric.score:.4f}" if metric.score is not None else "n/a" + lines.append(f"[{status}] {metric.metric_name}: " + f"score={score_str}, threshold={metric.threshold:.4f}") + + # ``details.reason`` is only populated by LLM-judged evaluators. + # For deterministic matchers, synthesize a one-line explanation + # from the criterion config so the reflection LM sees WHY the + # check failed. 
def _synthesize_failure_reason(metric: EvalMetricResult) -> Optional[str]:
    """Synthesize a short failure explanation for the final-response metric.

    Deterministic evaluators only emit ``score`` + ``eval_status``; this
    translates the metric's criterion config into a one-line reason so
    the reader does not have to diff the agent output against the
    reference. ``<case>`` below is ``case-insensitive`` or
    ``case-sensitive`` depending on the text criterion:

    - "agent output not byte-equal to expected (<case>)"            (exact)
    - "expected substring not contained in agent output (<case>)"   (contains)
    - "agent output did not match expected regex (<case>)"          (regex)
    - "text match (mode=<mode>) failed (<case>)"                    (other modes)
    - "JSON structural comparison failed"                           (json)

    When both a text and a JSON check are configured (and not ignored),
    the two notes are joined with ``" AND "``.

    Criterion keys are accepted in snake_case and camelCase; the match
    mode is read from ``match``, ``match_strategy`` or ``matchStrategy``
    and defaults to ``exact``.

    Returns:
        The reason string, or ``None`` when the metric is not the
        final-response metric, has no score, scores >= 1.0, or carries a
        missing/malformed criterion config.
    """
    if metric.metric_name != PrebuiltMetrics.FINAL_RESPONSE_AVG_SCORE.value:
        return None
    if metric.score is None or float(metric.score) >= 1.0:
        return None
    criterion = metric.criterion or {}
    if not isinstance(criterion, dict):
        return None
    fr = criterion.get("final_response") or criterion.get("finalResponse")
    if not isinstance(fr, dict):
        return None

    notes: list[str] = []
    text = fr.get("text") or fr.get("text_strategy") or fr.get("textStrategy")
    if isinstance(text, dict) and not text.get("ignore"):
        # Also accept the camelCase spelling, as every sibling key does;
        # without it a camelCase "contains"/"regex" config is reported as
        # an exact-match failure.
        raw_mode = (text.get("match") or text.get("match_strategy") or text.get("matchStrategy") or "exact")
        match = str(raw_mode).strip().lower()
        case_ins = bool(text.get("case_insensitive") or text.get("caseInsensitive"))
        case_tag = "case-insensitive" if case_ins else "case-sensitive"
        if match == "exact":
            notes.append(f"agent output not byte-equal to expected ({case_tag})")
        elif match == "contains":
            notes.append(f"expected substring not contained in agent output ({case_tag})")
        elif match == "regex":
            notes.append(f"agent output did not match expected regex ({case_tag})")
        else:
            notes.append(f"text match (mode={match}) failed ({case_tag})")

    json_cfg = fr.get("json") or fr.get("json_strategy") or fr.get("jsonStrategy")
    if isinstance(json_cfg, dict) and not json_cfg.get("ignore"):
        notes.append("JSON structural comparison failed")

    if not notes:
        return None
    return " AND ".join(notes)
+ GEPA uses this to maintain a per-objective Pareto frontier + independent of the aggregated case score — so a candidate that + dominates on one metric (e.g. rubric quality) survives even when + overall pass rates tie. Metrics with no signal across all runs are + skipped (they would taint the mean). + """ + sums: dict[str, float] = {} + counts: dict[str, int] = {} + for run in case_runs: + for metric in run.overall_eval_metric_results or []: + if metric.score is None: + continue + sums[metric.metric_name] = sums.get(metric.metric_name, 0.0) + float(metric.score) + counts[metric.metric_name] = counts.get(metric.metric_name, 0) + 1 + return {name: sums[name] / counts[name] for name in sums} + + +def _continuous_case_score(case_runs: Sequence[EvalCaseResult]) -> float: + """Compute case_score as the mean of per-metric continuous scores. + + Per run: average all ``EvalMetricResult.score`` values (each in + ``[0, 1]``). Across runs (``num_runs > 1``): average the per-run + scores. Continuous scoring lets gepa distinguish candidates that + share PASS/FAIL labels but differ in metric quality (e.g. one keeps + a rubric at 1.0 while another regresses to 0.33 — both still FAIL + overall but only one is strictly better). + """ + run_scores: list[float] = [] + for run in case_runs: + metrics = run.overall_eval_metric_results or [] + metric_scores = [float(m.score) for m in metrics if m.score is not None] + if metric_scores: + run_scores.append(sum(metric_scores) / len(metric_scores)) + else: + # Fallback to the binary PASS/FAIL signal when no per-metric scores + # are emitted (e.g. error path or evaluator that omits details). + run_scores.append(1.0 if run.final_eval_status == EvalStatus.PASSED else 0.0) + if not run_scores: + return 0.0 + return sum(run_scores) / len(run_scores) + + +def _format_tool_args(args: Any) -> str: + """Render a tool-call ``args`` dict inline as ``k=v, k=v``. 
+ + Inline form keeps each tool call on one line; gepa's prompt_renderer + would otherwise expand each arg into its own ``###### key`` heading + and hit the H6 cap. + """ + if not isinstance(args, dict): + return repr(args) + parts: list[str] = [] + for key, value in args.items(): + if isinstance(value, str): + parts.append(f"{key}={value!r}") + elif isinstance(value, (int, float, bool)) or value is None: + parts.append(f"{key}={value}") + else: + parts.append(f"{key}={value!r}") + return ", ".join(parts) + + +def _format_tool_response(response: Any) -> str: + """Render a tool response inline; collapse single-key dicts to bare value.""" + if isinstance(response, dict): + if len(response) == 1: + value = next(iter(response.values())) + if isinstance(value, str): + return repr(value) + return str(value) + return "{" + _format_tool_args(response) + "}" + if isinstance(response, str): + return repr(response) + return str(response) + + +def _resolve_turn_metrics(run: EvalCaseResult, turn_idx: int, total_turns: int) -> list[EvalMetricResult]: + """Pick the verdict slice for one (run, turn). + + Multi-turn cases use ``eval_metric_result_per_invocation[i]. + eval_metric_results``. Single-turn cases sometimes leave that empty + and only populate ``overall_eval_metric_results`` — fall back so a + Turn 1 block still carries a verdict. + """ + per_inv = run.eval_metric_result_per_invocation or [] + if 0 <= turn_idx - 1 < len(per_inv): + pinv = per_inv[turn_idx - 1] + if pinv.eval_metric_results: + return list(pinv.eval_metric_results) + if total_turns == 1: + return list(run.overall_eval_metric_results or []) + return [] + + +def _build_turn_block( + case: EvalCase, + case_runs: Sequence[EvalCaseResult], + turn_idx: int, + total_turns: int, + is_multi_run: bool, +) -> str: + """Render one ``### Turn N`` block grouping user/expected/agent/tool/verdict. 
+ + Conversational truth (User/Expected) is shared across runs and printed + first; for each run the actual agent_response, function-call trace, and + per-turn verdict follow. Multi-run cases nest each run under + ``#### Run N`` so the LM can attribute output variance to a specific + roll-out. + """ + lines: list[str] = [f"### Turn {turn_idx}"] + + convo = case.conversation or case.actual_conversation or [] + if 0 <= turn_idx - 1 < len(convo): + inv = convo[turn_idx - 1] + user_text = _invocation_text(inv, user=True) + if user_text: + lines.append(f"**User**: {user_text}") + expected_text = _invocation_text(inv, user=False) + if expected_text: + lines.append(f"**Expected**: {expected_text}") + + for ordinal, run in enumerate(case_runs, start=1): + run_id = getattr(run, "run_id", None) or ordinal + per_inv = run.eval_metric_result_per_invocation or [] + actual_inv: Optional[Invocation] = None + if 0 <= turn_idx - 1 < len(per_inv): + actual_inv = per_inv[turn_idx - 1].actual_invocation + + if is_multi_run: + lines.append("") + lines.append(f"#### Run {run_id}") + + if actual_inv is not None: + response_text = _invocation_text(actual_inv, user=False) + if response_text: + lines.append(f"**Agent Response**: {response_text}") + + tool_calls = get_all_tool_calls(actual_inv.intermediate_data) + tool_responses = get_all_tool_responses(actual_inv.intermediate_data) + if tool_calls or tool_responses: + lines.append("**Tool Trace**:") + resp_by_id: dict[str, Any] = {tr.id: tr for tr in tool_responses if tr.id} + consumed_ids: set[str] = set() + for tc in tool_calls: + args_inline = _format_tool_args(tc.args) if tc.args else "" + suffix = "" + if tc.id and tc.id in resp_by_id: + tr = resp_by_id[tc.id] + consumed_ids.add(tc.id) + suffix = f" → {_format_tool_response(tr.response)}" + id_tag = f" [id={tc.id}]" if tc.id else "" + lines.append(f"- {tc.name or ''}({args_inline}){suffix}{id_tag}") + # Surface tool_responses arriving without a paired call so + # the reflection LM 
doesn't miss out-of-band observations. + for tr in tool_responses: + if tr.id and tr.id in consumed_ids: + continue + id_tag = f" [id={tr.id}]" if tr.id else "" + lines.append(f"- (orphan response) {tr.name or ''} → " + f"{_format_tool_response(tr.response)}{id_tag}") + + verdict_metrics = _resolve_turn_metrics(run, turn_idx, total_turns) + if verdict_metrics: + run_tag = f", Run {run_id}" if is_multi_run else "" + lines.append(f"**Verdict** (Turn {turn_idx}{run_tag}):") + for verdict_line in _render_metric_lines(verdict_metrics): + lines.append(f" {verdict_line}") + + return "\n".join(lines) + + +def _build_overall_block(case_runs: Sequence[EvalCaseResult], is_multi_run: bool) -> str: + """Render the case-level aggregate verdict block. + + Single-run: ``### Overall (case-level aggregate)`` from the run's + ``overall_eval_metric_results``. Multi-run: ``### Overall (per-run + aggregate)`` with one sub-block per run, so the LM can spot which + runs failed without averaging through to a single mean. + """ + if is_multi_run: + lines: list[str] = ["### Overall (per-run aggregate)"] + for ordinal, run in enumerate(case_runs, start=1): + run_id = getattr(run, "run_id", None) or ordinal + lines.append(f"**Run {run_id}**:") + for verdict_line in _render_metric_lines(run.overall_eval_metric_results or []): + lines.append(f" {verdict_line}") + return "\n".join(lines) + + lines = ["### Overall (case-level aggregate)"] + if case_runs: + lines.extend(_render_metric_lines(case_runs[0].overall_eval_metric_results or [])) + return "\n".join(lines) + + +def _build_case_body(case: EvalCase, case_runs: Sequence[EvalCaseResult]) -> str: + """Build the per-turn-sliced markdown body of a failed case. + + Each turn is one ``### Turn N`` block bundling user / expected / + agent_response / Tool Trace / Verdict so each failing metric is + visually anchored to the turn that produced it. Multi-run cases nest + each run under ``#### Run N``. 
Multi-turn or multi-run cases close + with an ``### Overall`` aggregate. + + Returns an empty string when no usable turn data is available, so + the caller can decide whether to drop the record. + """ + if not case_runs: + return "" + + n_runs = len(case_runs) + is_multi_run = n_runs > 1 + + convo = case.conversation or case.actual_conversation or [] + if convo: + n_turns = len(convo) + else: + n_turns = max( + (len(run.eval_metric_result_per_invocation or []) for run in case_runs), + default=0, + ) + + if n_turns == 0: + return "" + + blocks: list[str] = [] + for turn_idx in range(1, n_turns + 1): + blocks.append(_build_turn_block(case, case_runs, turn_idx, n_turns, is_multi_run)) + + # Single-turn single-run cases skip the Overall block — Turn 1 + # already carries the only verdict that exists. + if n_turns > 1 or is_multi_run: + blocks.append(_build_overall_block(case_runs, is_multi_run)) + + return "\n\n".join(blocks) + + +def _build_other_active_components(candidate: dict[str, str], components_to_update: Sequence[str]) -> str: + """Render the prompt content of every candidate component NOT being + refined this round. + + GEPA fills ```` with only the prompt being rewritten, + but the evaluator's verdict was produced by the agent running with + ALL prompts. Surfacing the others as static context stops the LM + from regressing requirements already enforced elsewhere or + duplicating instructions. + + Returns an empty string when there is only one component or when + the others contain no text. 
+ """ + targets = set(components_to_update) + others = {name: text for name, text in candidate.items() if name not in targets and text} + if not others: + return "" + lines: list[str] = [] + for name in sorted(others): + lines.append(f"### {name} (current)") + lines.append(others[name].rstrip()) + lines.append("") + return "\n".join(lines).rstrip() + + +def _build_trajectory_entry( + case: EvalCase, + score: float, + *, + case_runs: Sequence[EvalCaseResult] = (), + error_message: Optional[str] = None, +) -> dict[str, Any]: + """Bundle one case's evaluation artifacts for reflective dataset construction. + + ``score`` lets ``make_reflective_dataset`` filter to failed cases + without re-reading the runs. ``_case`` + ``_case_runs`` carry + everything the record builder needs to render the turn-sliced body. + On evaluator-error paths (no runs produced), ``error_message`` + surfaces a diagnostic in place of a Case Body. + """ + return { + "score": score, + "_case": case, + "_case_runs": list(case_runs), + "error_message": error_message, + } + + +def _make_return_type_checked_call_agent(call_agent: Any) -> Any: + """Wrap ``call_agent`` with a one-shot return-type check. + + Plain ``async def f(query): return 42`` passes + :func:`inspect.iscoroutinefunction`, so the broken return type is only + discovered when a metric tries to call ``.lower()`` / ``.strip()`` on the + int and produces an opaque ``AttributeError`` deep inside the metric path. + + The wrapper validates ``isinstance(result, str)`` on the first call only, + raising a clear ``TypeError`` that names the actual returned type. After + the first successful call subsequent invocations bypass the check, so the + overhead is a single boolean check on the first case and zero thereafter. 
+ """ + checked = {"done": False} + + async def _checked(query: str) -> str: + result = await call_agent(query) + if not checked["done"]: + if not isinstance(result, str): + raise TypeError(f"call_agent must return str; got " + f"{type(result).__name__} (value={result!r}). " + f"This is checked once on the first invocation.") + checked["done"] = True + return result + + return _checked + + +class _AgentGEPAAdapter: + """GEPA protocol adapter bridging gepa.optimize() to the framework evaluator. + + Per ``evaluate`` call: + 1. Apply the proposed ``candidate`` to all registered ``TargetPrompt`` fields. + 2. Serialize ``batch`` and ``eval_config`` to a temp directory. + 3. Run ``run_evaluator`` (asyncio.run) and collect per-case pass + status + final response. + 4. Build an ``EvaluationBatch`` carrying scores, outputs, and + (optionally) trajectories used by reflective dataset construction. + + ``make_reflective_dataset`` then renders failed trajectories as + ``{component: [{case_id, score, "Case Body", "Other Active Components"?}, + ...]}`` for gepa's reflection prompt template. + """ + + # gepa's reflective proposer reads ``adapter.propose_new_texts`` + # directly; ``None`` signals "use gepa's default reflection LM path". + propose_new_texts = None + + def __init__( + self, + *, + target_prompt: TargetPrompt, + eval_config: EvalConfig, + call_agent: CallAgent, + callbacks: Optional[Callbacks] = None, + num_runs: int = 1, + case_parallelism: Optional[int] = None, + top_k_per_case: int = 2, + ) -> None: + self.target_prompt = target_prompt + self.eval_config = eval_config + # Wrap call_agent so the first call validates the return type and + # surfaces a clear TypeError on misuse (e.g. ``async def f(): return 42`` + # passes static signature checks but only blows up inside metrics). + # The check fires once; later calls bypass the wrapper. 
def _get_or_create_loop(self) -> asyncio.AbstractEventLoop:
    """Return the adapter-owned event loop, allocating it when needed.

    A loop is allocated when none has been stored yet or when the stored
    one has already been closed; otherwise the stored loop is reused so
    repeated calls hand back the same object.

    Returns:
        The open event loop stored on ``self._loop``.
    """
    existing = self._loop
    if existing is not None and not existing.is_closed():
        return existing
    # Nothing usable is stored: create a fresh loop and remember it.
    replacement = asyncio.new_event_loop()
    self._loop = replacement
    return replacement
def _record_history(
    self,
    *,
    case_id: str,
    score: float,
    best_response: str,
) -> None:
    """Remember one scored response for ``case_id``, keeping the best few.

    The per-case bucket in ``self._best_history`` is kept ordered by
    descending score and truncated to ``self._top_k`` entries. Nothing is
    recorded when ``self._top_k`` is zero or negative.

    Args:
        case_id: Key of the bucket inside ``self._best_history``.
        score: Score of this attempt; stored as a ``float``.
        best_response: Response text stored alongside the score.
    """
    limit = self._top_k
    if limit <= 0:
        return
    entries = self._best_history.setdefault(case_id, [])
    entries.append({"score": float(score), "best_response": best_response})
    # sorted() is stable, so equal scores keep their insertion order; the
    # slice assignment mutates the same list object held by the history map.
    entries[:] = sorted(entries, key=lambda item: item["score"], reverse=True)[:limit]
def make_reflective_dataset(
    self,
    candidate: dict[str, str],
    eval_batch: Any,
    components_to_update: list[str],
) -> Mapping[str, Sequence[Mapping[str, Any]]]:
    """Turn the failed trajectories of ``eval_batch`` into reflection records.

    One record list is produced per component in ``components_to_update``.
    Every record is a dict with these keys:

    - ``case_id``: the case's ``eval_id``.
    - ``score``: the trajectory score as a ``float``.
    - ``Case Body``: markdown from :func:`_build_case_body`, or the
      trajectory's ``error_message`` (or a fixed placeholder) when no
      runs were captured or the body came back empty.
    - ``history_top_k`` *(optional)*: up to ``self._top_k`` best
      historical entries recorded for the case, when any exist.
    - ``Other Active Components`` *(optional)*: the other prompts of the
      candidate, from :func:`_build_other_active_components`.

    Trajectories scoring ``>= 1.0`` and trajectories whose ``_case`` is
    not an ``EvalCase`` are skipped.

    Args:
        candidate: Component name to current prompt text.
        eval_batch: Object whose ``trajectories`` attribute (if present)
            holds the per-case trajectory dicts.
        components_to_update: Components being rewritten this round.

    Returns:
        ``{}`` when no component is requested; ``{component: []}`` for
        every component when there are no trajectories; otherwise the
        per-component record lists.
    """
    if not components_to_update:
        return {}

    trajectories = getattr(eval_batch, "trajectories", None)
    if not trajectories:
        return dict((name, []) for name in components_to_update)

    def _render(entry: Mapping[str, Any], siblings_md: str) -> Optional[dict[str, Any]]:
        """Build the record for one trajectory, or ``None`` when it is skipped."""
        raw_score = entry.get("score", 0.0)
        if raw_score >= 1.0:
            return None

        case = entry.get("_case")
        runs = entry.get("_case_runs") or []
        if not isinstance(case, EvalCase):
            return None

        body = _build_case_body(case, runs) if runs else ""
        if not body:
            # Evaluator-error path: show the captured diagnostic instead.
            body = entry.get("error_message") or "(no trajectory data captured)"

        rendered: dict[str, Any] = {
            "case_id": case.eval_id,
            "score": float(raw_score),
            "Case Body": body,
        }
        best_so_far = self._best_history.get(case.eval_id, [])[:self._top_k]
        if best_so_far:
            rendered["history_top_k"] = best_so_far
        if siblings_md:
            rendered["Other Active Components"] = siblings_md
        return rendered

    dataset: dict[str, list[Mapping[str, Any]]] = {}
    for name in components_to_update:
        # The sibling-prompt section depends on which component is being
        # rewritten, so it is rebuilt for every component.
        siblings_md = _build_other_active_components(candidate, [name])
        rendered_all = (_render(entry, siblings_md) for entry in trajectories)
        dataset[name] = [record for record in rendered_all if record is not None]
    return dataset
Dropped to ``None`` after the loop + # if no metric data was collected, so gepa's per-objective + # frontier stays inactive when the evaluator emits none. + objective_scores: list[dict[str, float]] = [] + + if result is None or not result.results_by_eval_set_id: + for case in batch: + scores.append(0.0) + outputs.append("") + objective_scores.append({}) + if trajectories is not None: + trajectories.append(_build_trajectory_entry(case, 0.0, error_message="no result returned")) + return evaluation_batch_cls( + outputs=outputs, + scores=scores, + trajectories=trajectories, + objective_scores=None, + ) + + set_result = next(iter(result.results_by_eval_set_id.values())) + + for case in batch: + case_runs = set_result.eval_results_by_eval_id.get(case.eval_id, []) + if not case_runs: + scores.append(0.0) + outputs.append("") + objective_scores.append({}) + if trajectories is not None: + trajectories.append( + _build_trajectory_entry( + case, + 0.0, + error_message="case missing from evaluator result", + )) + continue + + case_score = _continuous_case_score(case_runs) + scores.append(case_score) + objective_scores.append(_per_metric_objective_scores(case_runs)) + + first_run = case_runs[0] + outputs.append(_extract_case_output(first_run)) + + self._record_history( + case_id=case.eval_id, + score=case_score, + best_response=_extract_case_output(first_run), + ) + + if trajectories is not None: + trajectories.append(_build_trajectory_entry(case, case_score, case_runs=case_runs)) + + # Keep the field active when ANY case produced a non-empty metric map; + # GEPA treats ``None`` as "no per-objective data". 
+ has_objective_data = any(scores_map for scores_map in objective_scores) + return evaluation_batch_cls( + outputs=outputs, + scores=scores, + trajectories=trajectories, + objective_scores=objective_scores if has_objective_data else None, + ) diff --git a/trpc_agent_sdk/evaluation/_optimize_gepa_callback.py b/trpc_agent_sdk/evaluation/_optimize_gepa_callback.py new file mode 100644 index 00000000..a7588ef8 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_optimize_gepa_callback.py @@ -0,0 +1,381 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""GEPACallback adapter buffering real-time iteration events as RoundRecords. + +Implements ``gepa.core.callbacks.GEPACallback`` so the framework captures the +full reflective lifecycle for each iteration: + + * ``on_iteration_start`` — reset per-iteration buffer; snapshot the + reflection-LM counters so per-round deltas + are correct. + * ``on_minibatch_sampled`` — record train minibatch size for the round. + * ``on_proposal_end`` — capture which components the reflection LM + actually rewrote this round (gepa's + component selector, e.g. RoundRobin, may + mutate only a subset of the candidate's + components per round). + * ``on_evaluation_end`` — capture parent / candidate subsample scores + (the first two non-seed evaluations of an + iteration are parent + candidate on the + sampled minibatch). + * ``on_evaluation_skipped`` — capture the skip reason that prevented a + full validation evaluation (e.g. subsample + gate did not pass). + * ``on_valset_evaluated`` — capture the full validation pass rate, + metric breakdown and failed case ids; the + ``iteration == 0`` event is recorded as the + baseline instead of a round. + * ``on_merge_attempted`` — tag the current round as a ``"merge"`` round. 
# Translate gepa's skip reason literals into user-facing wording.
# Source: reference/gepa reflective_mutation.py:299, :320.
_GEPA_SKIP_REASON_MAP: dict[str, str] = {
    "no_trajectories": "no trajectories captured this round",
    "all_scores_perfect": "minibatch already perfect (skip_perfect_score on)",
}

# Used when a round produced no candidate without emitting evaluation_skipped.
_NO_PROPOSAL_FALLBACK: str = "reflect-LM produced no usable new prompt"


def _translate_skip_reason(raw: Optional[str]) -> Optional[str]:
    """Map a gepa skip reason onto user-facing wording.

    The reason is matched against ``_GEPA_SKIP_REASON_MAP`` first as-is
    (after trimming), then in a normalised form (lower-cased, spaces and
    hyphens turned into underscores).

    Args:
        raw: Reason reported by gepa; non-string values are stringified.

    Returns:
        ``None`` for a missing or blank reason, the mapped wording for a
        known reason, and ``"gepa-internal: <reason>"`` for anything else.
    """
    cleaned = "" if raw is None else str(raw).strip()
    if not cleaned:
        return None
    canonical = cleaned.lower().replace(" ", "_").replace("-", "_")
    # Exact spelling wins over the normalised spelling.
    for key in (cleaned, canonical):
        wording = _GEPA_SKIP_REASON_MAP.get(key)
        if wording is not None:
            return wording
    return f"gepa-internal: {cleaned}"
+ baseline_failed_case_ids: failed case ids for the seed candidate. + baseline_pass_rate: average validation score for the seed candidate. + """ + + def __init__( + self, + *, + adapter: Any = None, + reflection_lm: Any = None, + reporter: Optional["OptimizeReporter"] = None, + train_size: int = 0, + budget_total: Optional[int] = None, + metric_thresholds: Optional[Mapping[str, float]] = None, + on_valset_breakdown: Optional[Callable[[dict[str, float]], None]] = None, + ) -> None: + self.rounds: list[RoundRecord] = [] + self.baseline_metric_breakdown: dict[str, float] = {} + self.baseline_failed_case_ids: list[str] = [] + self.baseline_pass_rate: float = 0.0 + self._adapter = adapter + self._reflection_lm = reflection_lm + self._reporter = reporter + self._train_size = int(train_size) + self._budget_total = budget_total + self._metric_thresholds = dict(metric_thresholds or {}) + self._on_valset_breakdown = on_valset_breakdown + self._budget_used: int = 0 + self._reset_iter_buffer() + self._calls_at_iter_start: int = 0 + self._cost_at_iter_start: float = 0.0 + self._tokens_at_iter_start: dict[str, int] = { + "prompt": 0, + "completion": 0, + "total": 0, + } + + def _reset_iter_buffer(self) -> None: + self._iter_started_at: Optional[datetime] = None + self._iter_iteration: int = 0 + self._iter_candidate: Optional[dict[str, str]] = None + self._iter_val_score: Optional[float] = None + self._iter_is_best: bool = False + self._iter_metric_breakdown: dict[str, float] = {} + self._iter_failed_case_ids: list[str] = [] + self._iter_train_minibatch_size: int = 0 + self._iter_train_size: int = self._train_size + self._iter_train_parent_score: Optional[float] = None + self._iter_train_candidate_score: Optional[float] = None + self._iter_skip_reason: Optional[str] = None + self._iter_error_message: Optional[str] = None + self._iter_kind: str = "reflective" + # Components rewritten this round (set by on_proposal_end). None + # means no proposal event observed for the iteration. 
def on_evaluation_end(self, event: Mapping[str, Any]) -> None:
    """Store the mean minibatch score of a parent or new-candidate eval.

    The event's ``candidate_idx`` decides the destination: ``None`` marks
    the evaluation of a freshly proposed (mutated or merged) candidate,
    any other value marks the evaluation of the parent / current program.
    Routing on that field, rather than on event order, keeps the two
    scores in the right slots even when the parent is the seed program.

    Events without scores are ignored. The first scored event of an
    iteration also supplies the minibatch size when no earlier callback
    recorded one.

    Args:
        event: gepa event payload; reads ``scores`` and ``candidate_idx``.
    """
    scores = event.get("scores") or []
    if not scores:
        return

    mean_score = sum(map(float, scores)) / max(1, len(scores))
    if event.get("candidate_idx") is not None:
        # Parent / current-program evaluation.
        self._iter_train_parent_score = mean_score
    else:
        # New candidate evaluation (post-mutation or post-merge).
        self._iter_train_candidate_score = mean_score

    if not self._iter_train_minibatch_size:
        self._iter_train_minibatch_size = len(scores)
def on_iteration_end(self, event: Mapping[str, Any]) -> None:
    """Flush a RoundRecord for the iteration regardless of acceptance.

    Iterations rejected at the subsample gate (``_iter_candidate`` stays
    None) are still recorded so the reporter timeline matches gepa's
    actual progression and round indices stay contiguous.

    The record is assembled from the per-iteration buffer filled by the
    other callbacks, appended to ``self.rounds``, and then forwarded to
    the reporter (when one is attached) via ``_emit_round_completed``.

    Args:
        event: gepa event payload. Reads ``iteration`` (falling back to
            the value buffered at iteration start) and
            ``proposal_accepted`` (treated as ``False`` when absent).
    """
    iteration = int(event.get("iteration", self._iter_iteration))
    # If no start time was buffered, the current time is used, which
    # makes the computed duration (near) zero.
    started_at = self._iter_started_at or datetime.now(timezone.utc)
    finished_at = datetime.now(timezone.utc)
    duration = max(0.0, (finished_at - started_at).total_seconds())
    proposal_accepted = bool(event.get("proposal_accepted", False))
    candidate_seen = self._iter_candidate is not None
    # A round only counts as accepted when gepa reports acceptance AND a
    # validation-set candidate was actually buffered for this iteration.
    accepted = proposal_accepted and candidate_seen

    # Human-readable reason; precedence is error > skip > candidate > none.
    if self._iter_error_message:
        reason = f"error: {self._iter_error_message}"
    elif self._iter_skip_reason:
        reason = f"skipped: {self._iter_skip_reason}"
    elif candidate_seen:
        score = self._iter_val_score or 0.0
        reason = (f"GEPA accepted proposal (val_score={score:.4f})"
                  if accepted else f"Explored by GEPA (val_score={score:.4f})")
    else:
        reason = "no candidate produced this round"

    # Per-round reflection-LM usage: current counters minus the snapshot
    # held in the ``*_at_iter_start`` attributes, clamped at zero.
    reflection_calls_delta = 0
    round_llm_cost = 0.0
    round_token_usage = {"prompt": 0, "completion": 0, "total": 0}
    if self._reflection_lm is not None:
        reflection_calls_delta = max(
            0,
            int(getattr(self._reflection_lm, "total_calls", 0)) - self._calls_at_iter_start,
        )
        round_llm_cost = max(
            0.0,
            float(getattr(self._reflection_lm, "total_cost", 0.0)) - self._cost_at_iter_start,
        )
        cur = getattr(self._reflection_lm, "total_token_usage", None) or {}
        for key in ("prompt", "completion", "total"):
            round_token_usage[key] = max(
                0,
                int(cur.get(key, 0)) - self._tokens_at_iter_start.get(key, 0),
            )

    # Rounds without a validation pass report 0.0 and an empty prompt map.
    validation_pass_rate = (self._iter_val_score if self._iter_val_score is not None else 0.0)
    candidate_prompts = (dict(self._iter_candidate) if candidate_seen else {})
    # Authoritative source: components captured from on_proposal_end.
    # Fallback to full candidate keys for rounds without a proposal
    # event (e.g. merge rounds — "rewrite" doesn't apply, listing all
    # keys is the least misleading default).
    if self._iter_changed_components is not None:
        optimized_field_names = list(self._iter_changed_components)
    elif candidate_seen:
        optimized_field_names = list(self._iter_candidate.keys())
    else:
        optimized_field_names = []

    # A round with no candidate, no explicit skip reason and no error
    # still gets an explanation via the generic fallback wording.
    skip_reason = self._iter_skip_reason
    if (not candidate_seen and skip_reason is None and self._iter_error_message is None):
        skip_reason = _NO_PROPOSAL_FALLBACK

    record = RoundRecord(
        round=iteration,
        optimized_field_names=optimized_field_names,
        candidate_prompts=candidate_prompts,
        # This callback always records 0.0 here; the subsample scores
        # below carry the train-side signal instead.
        train_pass_rate=0.0,
        validation_pass_rate=validation_pass_rate,
        metric_breakdown=dict(self._iter_metric_breakdown),
        accepted=accepted,
        acceptance_reason=reason,
        failed_case_ids=list(self._iter_failed_case_ids),
        reflection_lm_calls=reflection_calls_delta,
        round_llm_cost=round_llm_cost,
        round_token_usage=round_token_usage,
        started_at=started_at.isoformat(),
        duration_seconds=duration,
        # Any unexpected kind value is reported as "reflective".
        kind=self._iter_kind if self._iter_kind in ("reflective", "merge") else "reflective",
        train_minibatch_size=self._iter_train_minibatch_size,
        train_subsample_parent_score=self._iter_train_parent_score,
        train_subsample_candidate_score=self._iter_train_candidate_score,
        skip_reason=skip_reason,
        error_message=self._iter_error_message,
        # A zero counter is reported as None rather than as 0.
        budget_used=self._budget_used if self._budget_used else None,
        budget_total=self._budget_total,
    )
    self.rounds.append(record)

    if self._reporter is not None:
        try:
            self._emit_round_completed(record)
        except Exception:  # pragma: no cover - never break loop on reporter error
            pass
def _load_evalset_cases(path: str) -> list[EvalCase]:
    """Load the eval cases stored in an EvalSet JSON file.

    Args:
        path: Location of the EvalSet JSON file, read as UTF-8.

    Returns:
        A new list holding the file's ``eval_cases``.

    Raises:
        FileNotFoundError: if path does not exist.
        pydantic.ValidationError: on schema violations.
    """
    raw_json = Path(path).read_text(encoding="utf-8")
    parsed = EvalSet.model_validate_json(raw_json)
    # Copy into a fresh list so callers never alias the model's own field.
    return [*parsed.eval_cases]
class _LabeledStopper:
    """Pair a gepa stopper with a fixed :data:`StopReason` label.

    Calls are forwarded unchanged to the wrapped stopper. The first time
    the wrapped stopper reports ``True`` the ``last_triggered`` flag is
    raised and it stays raised afterwards, so the label of the stopper
    that fired can be looked up once the run is over.

    Attributes:
        label: The stop reason associated with the wrapped stopper.
        last_triggered: Sticky flag; ``True`` once the wrapped stopper
            has returned a truthy value.
    """

    def __init__(self, inner: Any, label: "StopReason") -> None:
        self._inner = inner
        self.label: StopReason = label
        self.last_triggered: bool = False

    def __call__(self, *args: Any, **kwargs: Any) -> bool:
        fired = bool(self._inner(*args, **kwargs))
        # Sticky: once raised, the flag is never lowered again.
        self.last_triggered = self.last_triggered or fired
        return fired
def _build_optimize_result(
    *,
    gepa_result: Any,
    baseline_prompts: dict[str, str],
    best_candidate: dict[str, str],
    reflection_lm_cost: float,
    started_at: datetime,
    finished_at: datetime,
    algo_name: str,
    finish_reason: str = "completed",
    callback_rounds: Optional[list[RoundRecord]] = None,
    baseline_metric_breakdown: Optional[dict[str, float]] = None,
    metric_thresholds: Optional[dict[str, float]] = None,
    stop_reason: Optional[StopReason] = None,
    total_reflection_lm_calls: int = 0,
    total_judge_model_calls: int = 0,
    total_judge_cost: float = 0.0,
    total_token_usage: Optional[dict[str, int]] = None,
) -> OptimizeResult:
    """Map a successful GEPAResult into the framework's OptimizeResult schema.

    Round source priority:
      1. ``callback_rounds`` — real-time RoundRecord buffer from
         :class:`_AgentGEPACallback` (used in production whenever gepa
         emits iteration events).
      2. Post-hoc reconstruction from ``gepa_result.candidates`` /
         ``val_aggregate_scores`` — fallback for callers that don't
         install the callback (e.g. mock-driven unit tests, older gepa
         versions).

    Args:
        gepa_result: Object returned by ``gepa.optimize``. This function
            reads ``val_aggregate_scores``, ``best_idx`` and ``candidates``
            and, when present, ``total_metric_calls`` and
            ``per_objective_best_candidates``.
        baseline_prompts: Prompts snapshotted before optimization; copied
            into ``baseline_prompts`` of the result.
        best_candidate: Winning candidate; copied into ``best_prompts``.
        reflection_lm_cost: Cost charged to the reflection LM; summed with
            ``total_judge_cost`` into ``total_llm_cost``.
        started_at: Run start time; serialised with ``isoformat()``.
        finished_at: Run end time; serialised with ``isoformat()``.
        algo_name: Value stored in the result's ``algorithm`` field.
        finish_reason: Value stored in the result's ``finish_reason`` field.
        callback_rounds: Pre-built round records; used verbatim when
            non-empty, otherwise rounds are reconstructed (see above).
        baseline_metric_breakdown: Per-metric mean for the baseline
            candidate, captured by callback at iteration 0.
        metric_thresholds: Per-metric thresholds copied into the result.
        stop_reason: Value stored in the result's ``stop_reason`` field.
        total_reflection_lm_calls: Reflection LM invocation count.
        total_judge_model_calls: Evaluator-internal judge LM count.
        total_judge_cost: USD cost charged to the judge LM (added to
            reflection-LM cost).
        total_token_usage: ``{"prompt", "completion", "total"}`` for the
            reflection LM, optionally merged with judge token usage.
            Defaults to all-zero counters when empty or ``None``.

    Returns:
        An :class:`OptimizeResult` with ``status="SUCCEEDED"``.
    """
    val_scores = list(gepa_result.val_aggregate_scores)
    # Index 0 is treated as the baseline (seed) candidate's score.
    baseline_pass_rate = float(val_scores[0]) if val_scores else 0.0
    best_idx = int(gepa_result.best_idx)
    best_pass_rate = float(val_scores[best_idx]) if val_scores else 0.0

    started_iso = started_at.isoformat()
    if callback_rounds:
        rounds = list(callback_rounds)
    else:
        # Fallback path: no callback event stream available. gepa_result
        # alone doesn't carry per-round mutation metadata, so fields
        # below use the most-conservative approximation:
        #   * optimized_field_names: all candidate keys (no signal for
        #     which subset the reflection LM actually rewrote — the
        #     callback path narrows this via on_proposal_end).
        #   * accepted: equated with is_best, since GEPAResult only
        #     reports the final winner, not per-round acceptance.
        candidates = list(gepa_result.candidates)
        rounds = []
        # Candidate 0 is skipped: it is the baseline, not an optimization round.
        for i in range(1, len(candidates)):
            candidate = dict(candidates[i])
            score = float(val_scores[i]) if i < len(val_scores) else 0.0
            is_best = i == best_idx
            rounds.append(
                RoundRecord(
                    round=i,
                    optimized_field_names=list(candidate.keys()),
                    candidate_prompts=candidate,
                    train_pass_rate=0.0,
                    validation_pass_rate=score,
                    accepted=is_best,
                    acceptance_reason=(f"Selected as best by GEPA (val_score={score:.4f})"
                                       if is_best else f"Explored by GEPA (val_score={score:.4f})"),
                    started_at=started_iso,
                    duration_seconds=0.0,
                ))

    # Take the breakdown of the first round whose prompts equal the winner
    # and that actually carries a non-empty breakdown.
    best_metric_breakdown: dict[str, float] = {}
    for record in rounds:
        if record.candidate_prompts == best_candidate and record.metric_breakdown:
            best_metric_breakdown = dict(record.metric_breakdown)
            break

    # When gepa finds no improvement (best_idx == 0), best_candidate equals
    # the seed prompts and the loop above never matches — iteration 0 is
    # captured as ``baseline_metric_breakdown`` rather than a RoundRecord.
    # Mirror baseline data into ``best`` so summary.txt shows
    # ``baseline -> baseline`` (no improvement) instead of
    # ``baseline -> nan`` (looks like data loss).
    if (not best_metric_breakdown and best_candidate == baseline_prompts and baseline_metric_breakdown):
        best_metric_breakdown = dict(baseline_metric_breakdown)

    # ``total_metric_calls`` is optional on the gepa result; only surface it
    # in ``extras`` when the attribute exists and is not None.
    extras: dict[str, Any] = {}
    total_metric_calls = getattr(gepa_result, "total_metric_calls", None)
    if total_metric_calls is not None:
        extras["total_metric_calls"] = int(total_metric_calls)

    # Clamp to zero so a finish time earlier than the start never yields a
    # negative duration.
    duration_seconds = max(0.0, (finished_at - started_at).total_seconds())
    token_usage = dict(total_token_usage) if total_token_usage else {
        "prompt": 0,
        "completion": 0,
        "total": 0,
    }

    # GEPA's per_objective_best_candidates is dict[str, set[int]] | None;
    # convert to dict[str, list[int]] (sorted) for stable JSON output.
    raw_per_metric_best = getattr(gepa_result, "per_objective_best_candidates", None)
    per_metric_best: dict[str, list[int]] = {}
    if isinstance(raw_per_metric_best, dict):
        for metric_name, indices in raw_per_metric_best.items():
            try:
                per_metric_best[str(metric_name)] = sorted(int(i) for i in indices)
            except (TypeError, ValueError):
                # Best-effort: an entry that is not an iterable of ints is dropped.
                continue

    return OptimizeResult(
        algorithm=algo_name,
        status="SUCCEEDED",
        finish_reason=finish_reason,
        stop_reason=stop_reason,
        baseline_pass_rate=baseline_pass_rate,
        best_pass_rate=best_pass_rate,
        pass_rate_improvement=best_pass_rate - baseline_pass_rate,
        baseline_metric_breakdown=dict(baseline_metric_breakdown or {}),
        best_metric_breakdown=best_metric_breakdown,
        metric_thresholds=dict(metric_thresholds or {}),
        per_metric_best_candidates=per_metric_best,
        baseline_prompts=dict(baseline_prompts),
        best_prompts=dict(best_candidate),
        total_rounds=len(rounds),
        rounds=rounds,
        total_reflection_lm_calls=int(total_reflection_lm_calls),
        total_judge_model_calls=int(total_judge_model_calls),
        total_llm_cost=float(reflection_lm_cost) + float(total_judge_cost),
        total_token_usage=token_usage,
        duration_seconds=duration_seconds,
        started_at=started_iso,
        finished_at=finished_at.isoformat(),
        extras=extras,
    )
def _build_stop_callbacks(
    algo: GepaReflectiveAlgo,
    stop_config: FrameworkStopConfig,
    metric_thresholds: dict[str, float],
    *,
    output_dir: Optional[str] = None,
) -> tuple[list[Any], Optional[_RequiredMetricsAboveThresholdStopper]]:
    """Turn the configured stop fields into gepa stopper callables.

    Every ``algo`` stop field that is not ``None`` produces one gepa-native
    stopper wrapped in :class:`_LabeledStopper`, in this fixed order:
    ``max_metric_calls``, ``max_iterations_without_improvement``,
    ``timeout_seconds``, ``score_threshold``, ``max_candidate_proposals``,
    ``max_tracked_candidates``.

    On top of that, when
    ``BaseOptimizer.resolve_required_thresholds(stop_config, metric_thresholds)``
    yields a non-empty mapping, a
    :class:`_RequiredMetricsAboveThresholdStopper` is appended and also
    returned, so the caller can inspect its ``last_triggered`` flag when
    classifying the stop reason.

    When ``output_dir`` is given, a :class:`gepa.utils.FileStopper` is
    appended last, pointed at ``<output_dir>/optimize.stop`` and labelled
    ``"user_requested_stop"`` (e.g. ``touch $OUTPUT_DIR/optimize.stop``).

    Args:
        algo: Algorithm config carrying the gepa-native stop fields.
        stop_config: Framework-level stop policy.
        metric_thresholds: Per-metric thresholds the policy may select from.
        output_dir: Directory watched for the ``optimize.stop`` sentinel file.

    Returns:
        ``(stop_callbacks, framework_stopper)`` — ``framework_stopper`` is
        ``None`` when no required-metric thresholds were resolved.

    Raises:
        ImportError: When the gepa stopper classes cannot be imported.
    """
    from gepa.utils.stop_condition import MaxCandidateProposalsStopper
    from gepa.utils.stop_condition import MaxMetricCallsStopper
    from gepa.utils.stop_condition import MaxTrackedCandidatesStopper
    from gepa.utils.stop_condition import NoImprovementStopper
    from gepa.utils.stop_condition import ScoreThresholdStopper
    from gepa.utils.stop_condition import TimeoutStopCondition

    # (configured value, gepa stopper class, coercion, stop_reason label);
    # row order is the insertion order used to break ties between stoppers.
    native_specs = (
        (algo.max_metric_calls, MaxMetricCallsStopper, int, "budget_exhausted"),
        (algo.max_iterations_without_improvement, NoImprovementStopper, int, "no_improvement"),
        (algo.timeout_seconds, TimeoutStopCondition, float, "timeout"),
        (algo.score_threshold, ScoreThresholdStopper, float, "score_threshold"),
        (algo.max_candidate_proposals, MaxCandidateProposalsStopper, int, "max_candidate_proposals"),
        (algo.max_tracked_candidates, MaxTrackedCandidatesStopper, int, "max_tracked_candidates"),
    )
    callbacks: list[Any] = [
        _LabeledStopper(stopper_cls(coerce(configured)), reason)
        for configured, stopper_cls, coerce, reason in native_specs
        if configured is not None
    ]

    required = BaseOptimizer.resolve_required_thresholds(stop_config, metric_thresholds)
    framework_stopper: Optional[_RequiredMetricsAboveThresholdStopper] = (
        _RequiredMetricsAboveThresholdStopper(required) if required else None)
    if framework_stopper is not None:
        callbacks.append(framework_stopper)

    if output_dir is None:
        return callbacks, framework_stopper

    import os as _os
    from gepa.utils import FileStopper

    sentinel_path = _os.path.join(output_dir, "optimize.stop")
    callbacks.append(_LabeledStopper(FileStopper(sentinel_path), "user_requested_stop"))
    return callbacks, framework_stopper
class GepaReflectiveOptimizer(BaseOptimizer):
    """BaseOptimizer driving ``gepa.optimize()`` with the framework adapter.

    Flow inside :meth:`run`:
      1. Snapshot baseline prompts via ``TargetPrompt.read_all``.
      2. Load training / validation eval cases.
      3. Build :class:`_OptimizeModelCallable` (gepa-compatible reflection
         LM) and :class:`_AgentGEPAAdapter`.
      4. Run ``gepa.optimize`` in a worker thread (``asyncio.to_thread``)
         so its sync main loop does not block the surrounding event loop.
      5. On success, return a populated :class:`OptimizeResult`; on
         failure, return a FAILED result preserving the baseline prompts.

    The facade (``AgentOptimizer.optimize``) decides whether to persist
    the winning candidate based on the ``update_source`` flag.
    """

    async def _call_gepa_optimize(self, **kwargs: Any) -> Any:
        """Run gepa.optimize in a thread; isolated for tests to monkeypatch."""
        from gepa import optimize as gepa_optimize  # lazy import; gepa is optional

        return await asyncio.to_thread(gepa_optimize, **kwargs)

    async def run(
        self,
        *,
        reporter: Optional[OptimizeReporter] = None,
    ) -> OptimizeResult:
        """Execute one optimization run and return its result.

        Args:
            reporter: Optional progress reporter forwarded to the gepa
                callback; when set, gepa's own logger is silenced.

        Returns:
            A SUCCEEDED :class:`OptimizeResult`, or a FAILED one (baseline
            prompts preserved) when dataset loading, stopper construction
            or ``gepa.optimize`` itself fails.
        """
        algo: GepaReflectiveAlgo = self.config.optimize.algorithm
        algo_name = algo.name
        metric_thresholds = _collect_metric_thresholds(self.config.evaluate)

        started_at = datetime.now(timezone.utc)
        baseline_prompts = await self.target_prompt.read_all()
        seed_candidate = dict(baseline_prompts)

        try:
            trainset = _load_evalset_cases(self.train_dataset_path)
            valset = _load_evalset_cases(self.validation_dataset_path)
        except Exception as ex:
            return _build_failed_result(
                baseline_prompts=baseline_prompts,
                started_at=started_at,
                finished_at=datetime.now(timezone.utc),
                error_message=f"dataset load failed: {ex}",
                algo_name=algo_name,
                metric_thresholds=metric_thresholds,
            )

        # Build the reflection LM *before* the adapter: if its constructor
        # raises, no adapter exists yet, so nothing is left un-closed. Once
        # the adapter exists, the try/finally below always closes it.
        reflection_lm = _OptimizeModelCallable(algo.reflection_lm)
        adapter = _AgentGEPAAdapter(
            target_prompt=self.target_prompt,
            eval_config=self.config.evaluate,
            call_agent=self.call_agent,
            callbacks=self.callbacks,
            num_runs=self.config.evaluate.num_runs,
            case_parallelism=self.config.optimize.eval_case_parallelism,
            top_k_per_case=int(algo.reflection_history_top_k),
        )

        try:
            return await self._run_with_adapter(
                adapter=adapter,
                reflection_lm=reflection_lm,
                algo=algo,
                algo_name=algo_name,
                baseline_prompts=baseline_prompts,
                seed_candidate=seed_candidate,
                trainset=trainset,
                valset=valset,
                metric_thresholds=metric_thresholds,
                started_at=started_at,
                reporter=reporter,
            )
        finally:
            adapter.close()

    async def _run_with_adapter(
        self,
        *,
        adapter: _AgentGEPAAdapter,
        reflection_lm: _OptimizeModelCallable,
        algo: GepaReflectiveAlgo,
        algo_name: str,
        baseline_prompts: dict[str, str],
        seed_candidate: dict[str, str],
        trainset: list,
        valset: list,
        metric_thresholds: dict[str, float],
        started_at: datetime,
        reporter: Optional[OptimizeReporter],
    ) -> OptimizeResult:
        """Wire stoppers, callback and kwargs, call gepa, and map the result.

        The caller owns ``adapter`` and is responsible for closing it.

        Returns:
            A FAILED result when the gepa stoppers cannot be imported or
            ``gepa.optimize`` raises; otherwise a SUCCEEDED result built by
            :func:`_build_optimize_result`.
        """
        try:
            stop_callbacks, framework_stopper = _build_stop_callbacks(
                algo,
                self.config.optimize.stop,
                metric_thresholds,
                output_dir=self.output_dir,
            )
        except ImportError as ex:
            return _build_failed_result(
                baseline_prompts=baseline_prompts,
                started_at=started_at,
                finished_at=datetime.now(timezone.utc),
                error_message=f"gepa stop_callbacks unavailable: {ex}",
                algo_name=algo_name,
                metric_thresholds=metric_thresholds,
            )

        gepa_callback = _AgentGEPACallback(
            adapter=adapter,
            reflection_lm=reflection_lm,
            reporter=reporter,
            train_size=len(trainset),
            budget_total=algo.max_metric_calls,
            metric_thresholds=metric_thresholds,
            # Feed each valset breakdown into the required-metrics stopper,
            # when that policy is active.
            on_valset_breakdown=(framework_stopper.update if framework_stopper is not None else None),
        )

        # Embed a metric reference doc in the reflection prompt template so
        # the reflection LM understands each feedback row. Empty doc still
        # yields a GEPA-valid template.
        reflection_prompt_template = build_reflection_prompt_template(build_metric_reference_doc(self.config.evaluate))

        gepa_kwargs: dict[str, Any] = dict(
            seed_candidate=seed_candidate,
            trainset=trainset,
            valset=valset,
            adapter=adapter,
            reflection_lm=reflection_lm,
            reflection_prompt_template=reflection_prompt_template,
            callbacks=[gepa_callback, *self.extra_gepa_callbacks],
            candidate_selection_strategy=algo.candidate_selection_strategy,
            module_selector=algo.module_selector,
            reflection_minibatch_size=algo.reflection_minibatch_size,
            skip_perfect_score=algo.skip_perfect_score,
            perfect_score=algo.perfect_score,
            use_merge=algo.use_merge,
            max_merge_invocations=algo.max_merge_invocations,
            merge_val_overlap_floor=algo.merge_val_overlap_floor,
            frontier_type=algo.frontier_type,
            cache_evaluation=algo.cache_evaluation,
            track_best_outputs=algo.track_best_outputs,
            raise_on_exception=True,
            seed=algo.seed,
            display_progress_bar=False,
            stop_callbacks=[*stop_callbacks, *self.extra_stop_callbacks],
        )
        # ``max_metric_calls`` is also a direct kwarg for backwards
        # compatibility with gepa builds lacking ``MaxMetricCallsStopper``.
        if algo.max_metric_calls is not None:
            gepa_kwargs["max_metric_calls"] = int(algo.max_metric_calls)

        # Silence gepa's stdout logger when a reporter is attached so its
        # internal messages don't collide with the reporter timeline.
        if reporter is not None:
            gepa_kwargs["logger"] = _SilentGepaLogger(verbose=1)
        try:
            gepa_result = await self._call_gepa_optimize(**gepa_kwargs)
        except Exception as ex:
            return _build_failed_result(
                baseline_prompts=baseline_prompts,
                started_at=started_at,
                finished_at=datetime.now(timezone.utc),
                error_message=str(ex),
                algo_name=algo_name,
                metric_thresholds=metric_thresholds,
            )

        best_idx = int(gepa_result.best_idx)
        best_candidate = dict(gepa_result.candidates[best_idx])

        val_scores = list(gepa_result.val_aggregate_scores)
        baseline_pass_rate = float(val_scores[0]) if val_scores else 0.0
        best_pass_rate = float(val_scores[best_idx]) if val_scores else 0.0
        # "perfect_pass_rate" is reported only when the baseline was already
        # perfect; any other non-improving outcome is "no_improvement".
        if best_pass_rate >= 1.0 and baseline_pass_rate >= 1.0:
            finish_reason = "perfect_pass_rate"
        elif best_pass_rate <= baseline_pass_rate:
            finish_reason = "no_improvement"
        else:
            finish_reason = "completed"

        stop_reason: StopReason = _classify_stop_reason(
            stop_callbacks=stop_callbacks,
            framework_stopper=framework_stopper,
        )

        return _build_optimize_result(
            gepa_result=gepa_result,
            baseline_prompts=baseline_prompts,
            best_candidate=best_candidate,
            reflection_lm_cost=reflection_lm.total_cost,
            callback_rounds=gepa_callback.rounds,
            started_at=started_at,
            finished_at=datetime.now(timezone.utc),
            algo_name=algo_name,
            finish_reason=finish_reason,
            baseline_metric_breakdown=dict(gepa_callback.baseline_metric_breakdown),
            metric_thresholds=metric_thresholds,
            stop_reason=stop_reason,
            total_reflection_lm_calls=int(reflection_lm.total_calls),
            total_judge_model_calls=0,
            total_judge_cost=0.0,
            total_token_usage=dict(reflection_lm.total_token_usage),
        )
community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Metric reference doc builder for the optimize module. + +Renders a structured markdown "syllabus" describing how each +user-configured metric is computed, for injection into the reflection +LM's prompt template alongside the per-case feedback. + +The corpus is owned here (not delegated to each evaluator's +``get_metric_info()``) so wording can be tuned for what a rewriting LM +needs. + +Coverage: +- Excludes tool/algorithm-fixed metrics (``tool_trajectory_avg_score``, + ``response_match_score``, ``response_evaluation_score``). +- FinalResponseCriterion: text match modes / case sensitivity / ignore / + JSON tree / numeric tolerance / AND combination / custom compare. +- LLMJudgeCriterion: single/multi judge / six built-in aggregators / + parallel / rubrics / knowledge_tool_names / generation_config / think + mode / weights. +""" + +from __future__ import annotations + +import math +from typing import Any +from typing import Optional + +from ._eval_config import EvalConfig +from ._eval_metrics import EvalMetric +from ._eval_metrics import PrebuiltMetrics +from ._evaluator_registry import EVALUATOR_REGISTRY + +_SKIPPED_METRICS: frozenset[str] = frozenset({ + PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value, + PrebuiltMetrics.RESPONSE_MATCH_SCORE.value, + PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value, +}) + +_METRIC_DESCRIPTIONS: dict[str, str] = { + PrebuiltMetrics.FINAL_RESPONSE_AVG_SCORE.value: ("Deterministic match between the agent's final response and the " + "reference answer. 
Each invocation scores 1.0 (match) or 0.0 (no " + "match); the case score is the mean across invocations."), + PrebuiltMetrics.LLM_FINAL_RESPONSE.value: ("An LLM judge inspects the agent's final response and returns a " + "holistic valid/invalid verdict (1.0 or 0.0) together with a " + "natural-language reason."), + PrebuiltMetrics.LLM_RUBRIC_RESPONSE.value: ("An LLM judge scores the agent's final response against a list of " + "rubric items. Each rubric is judged independently (0 or 1); the " + "overall score is the mean of sub-scores. The judge returns a per-" + "rubric reason explaining its verdict."), + PrebuiltMetrics.LLM_RUBRIC_KNOWLEDGE_RECALL.value: + ("An LLM judge inspects the knowledge content the agent retrieved via " + "tool calls and scores it against a list of rubric items. Each " + "rubric is judged independently (0 or 1); the overall score is the " + "mean of sub-scores."), +} + +_AGGREGATOR_EXPLANATIONS: dict[str, str] = { + "all_pass": "all judges must PASS for the metric to PASS (strictest).", + "any_pass": "any single judge passing is enough for the metric to PASS (most lenient).", + "majority_pass": "more than half of the judges must PASS.", + "avg": "arithmetic mean of judges' scores (uniform weighting).", + "weighted_avg": "weighted mean of judges' scores using each model's ``weight``.", + "weighted_majority": "weighted majority vote: passes when the weighted PASS vote exceeds the FAIL vote.", +} + +_HEADER = ("## Metrics Reference\n\n" + "The assistant's outputs are graded by the metrics below. UNDERSTAND THESE " + "BEFORE PROPOSING CHANGES — they determine whether your new instruction " + "improves or regresses the candidate.") + +_FOOTER_GUIDELINES = ("## Rewriting Guidelines\n\n" + "1. **Preserve passing metrics.** A metric currently above its threshold " + "must not be sacrificed to fix a failing one.\n" + "2. 
def build_metric_section(metric: EvalMetric) -> str:
    """Render a single metric's section.

    Public to keep tests focused: the section is also unit-testable
    independently of the surrounding header/footer.

    The section lists, in order: metric type and description, the scoring
    algorithm derived from the metric's criterion, the score range and PASS
    condition, the fields present in per-case feedback, and rewriting hints
    for the reflection LM.

    Args:
        metric: The configured metric; ``metric_name``, ``threshold`` and
            ``criterion`` are read.

    Returns:
        The section as newline-joined markdown.
    """
    name = metric.metric_name
    threshold = float(metric.threshold)
    criterion = metric.criterion or {}

    lines: list[str] = []
    lines.append(f"### Metric: `{name}`")
    lines.append("")
    lines.append(f"**Type**: {_metric_type(name)}")
    description = _METRIC_DESCRIPTIONS.get(name)
    if description:
        lines.append(f"**Description**: {description}")
    lines.append("")

    lines.append("**Scoring algorithm**:")
    if name == PrebuiltMetrics.FINAL_RESPONSE_AVG_SCORE.value:
        lines.extend(_render_final_response_criterion(criterion, metric_name=name))
    elif name in {
            PrebuiltMetrics.LLM_FINAL_RESPONSE.value,
            PrebuiltMetrics.LLM_RUBRIC_RESPONSE.value,
            PrebuiltMetrics.LLM_RUBRIC_KNOWLEDGE_RECALL.value,
    }:
        lines.extend(_render_llm_judge_criterion(criterion, metric_name=name))
    lines.append("")

    lines.append("**Score range**: 0.0 ~ 1.0")
    lines.append(f"**PASS condition**: score >= {threshold:.4f}")
    if name in {
            PrebuiltMetrics.LLM_RUBRIC_RESPONSE.value,
            PrebuiltMetrics.LLM_RUBRIC_KNOWLEDGE_RECALL.value,
    }:
        n_rubrics = _count_rubrics(criterion)
        if n_rubrics > 0:
            # Derive the minimum from the PASS comparison itself rather than
            # ``ceil(threshold * n)``: the float product can land a hair above
            # an integer (0.28 * 25 == 7.000000000000001) and over-report the
            # requirement by one, even though 7 / 25 >= 0.28 already passes.
            # The ceil form is kept only as the fallback for thresholds that
            # no rubric count can satisfy (threshold > 1.0).
            min_pass = next(
                (passed for passed in range(n_rubrics + 1) if passed / n_rubrics >= threshold),
                math.ceil(threshold * n_rubrics),
            )
            lines.append(f" - With {n_rubrics} rubric item(s), at least **{min_pass}** must pass.")
    lines.append("")

    lines.append("**Per-case feedback contains**:")
    lines.extend(_render_feedback_fields(name))
    lines.append("")

    lines.append("**What reflection LM should know**:")
    lines.extend(_render_reflection_hints(name, criterion))

    return "\n".join(lines)
"LLM-judged rubric scoring over knowledge-retrieval tool outputs" + return "custom" + + +def _render_final_response_criterion(criterion: dict, *, metric_name: str) -> list[str]: + out: list[str] = [] + + if _has_custom_compare(metric_name): + out.append("- **Custom compare function**: registered via " + "``EVALUATOR_REGISTRY.set_criterion_compare``. This callable " + "**overrides** all built-in text/JSON strategies below — the " + "agent's output is judged purely by user code.") + return out + + fr = _pick(criterion, "final_response", "finalResponse") + if not isinstance(fr, dict) or not fr: + out.append("- _No ``final_response`` config provided; the metric will return 0.0 (FAIL)._") + return out + + text = _pick(fr, "text", "text_strategy", "textStrategy") + json_cfg = _pick(fr, "json", "json_strategy", "jsonStrategy") + + if isinstance(text, dict): + out.extend(_render_text_strategy(text)) + if isinstance(json_cfg, dict): + out.extend(_render_json_strategy(json_cfg)) + + if isinstance(text, dict) and isinstance(json_cfg, dict): + out.append("- **Combined**: both text and JSON checks must pass (AND logic). 
" + "A single failing check fails the case.") + + if not isinstance(text, dict) and not isinstance(json_cfg, dict): + out.append("- _Neither text nor JSON strategy configured; the metric will FAIL by default._") + + return out + + +def _render_text_strategy(text: dict) -> list[str]: + match = str(text.get("match") or text.get("match_strategy") or "exact").strip().lower() + case_insensitive = bool(text.get("case_insensitive") or text.get("caseInsensitive")) + ignored = bool(text.get("ignore")) + + if ignored: + return ["- **Text comparison**: ``ignore=True`` — text check is skipped (always passes)"] + + mode_desc = { + "exact": "actual output must be **byte-equal** to expected", + "contains": "actual output must **contain** expected as a substring", + "regex": "expected is treated as a **regular expression**; matched via ``re.search``", + }.get(match, f"``{match}``") + case_note = "case-insensitive" if case_insensitive else "case-sensitive" + + return [f"- **Text comparison** (``match=\"{match}\"``, {case_note}): {mode_desc}"] + + +def _render_json_strategy(json_cfg: dict) -> list[str]: + if bool(json_cfg.get("ignore")): + return ["- **JSON comparison**: ``ignore=True`` — JSON check is skipped"] + + out = ["- **JSON comparison**: actual and expected are parsed as JSON, then compared structurally"] + ignore_tree = _pick(json_cfg, "ignore_tree", "ignoreTree") + tolerance = _pick(json_cfg, "number_tolerance", "numberTolerance") + if isinstance(ignore_tree, dict) and ignore_tree: + out.append(f" - Keys ignored before compare (``ignore_tree``): ``{ignore_tree}``") + if tolerance is not None: + out.append(f" - Numeric tolerance: {tolerance}") + else: + out.append(" - Numeric tolerance: 1e-6 (default)") + return out + + +def _render_llm_judge_criterion(criterion: dict, *, metric_name: str) -> list[str]: + out: list[str] = [] + + llm = _pick(criterion, "llm_judge", "llmJudge") + if not isinstance(llm, dict) or not llm: + out.append("- _No ``llm_judge`` config provided; the 
metric will fail to evaluate._") + return out + + single = _pick(llm, "judge_model", "judgeModel") + multi = _pick(llm, "judge_models", "judgeModels") + + if isinstance(multi, list) and multi: + out.append(f"- **Judge models** ({len(multi)} judges, each scores independently):") + for jm in multi: + if isinstance(jm, dict): + out.append(" - " + _format_judge_model(jm)) + agg = str(_pick(llm, "models_aggregator", "modelsAggregator") or "all_pass") + agg_expl = _AGGREGATOR_EXPLANATIONS.get(agg, "custom aggregator (registered separately).") + out.append(f"- **Cross-model aggregator** (``{agg}``): {agg_expl}") + parallel = llm.get("parallel", True) + par_text = ("yes (judges run concurrently)" if parallel else "no (judges run sequentially)") + out.append(f"- **Parallel execution**: {par_text}") + elif isinstance(single, dict): + out.append(f"- **Judge model**: {_format_judge_model(single)}") + else: + out.append("- _No judge model configured._") + + rubrics = llm.get("rubrics") or [] + if isinstance(rubrics, list) and rubrics: + out.append(f"- **Rubric items** ({len(rubrics)} items judged independently, each scored 0 or 1; " + "overall score = mean of sub-scores):") + for i, rubric in enumerate(rubrics, 1): + if not isinstance(rubric, dict): + continue + rid = rubric.get("id", f"rubric_{i}") + desc = rubric.get("description", "") + content = rubric.get("content") or {} + body = content.get("text", "") if isinstance(content, dict) else "" + head = f" {i}. 
**``{rid}``**" + if desc: + head += f" — {desc}" + out.append(head) + if body: + out.append(f" > {body}") + + if metric_name == PrebuiltMetrics.LLM_RUBRIC_KNOWLEDGE_RECALL.value: + knowledge_tools = _pick(llm, "knowledge_tool_names", "knowledgeToolNames") + if isinstance(knowledge_tools, list) and knowledge_tools: + out.append("- **Knowledge tools** (judge inspects results from these tool calls): " + f"``{', '.join(knowledge_tools)}``") + else: + out.append("- **Knowledge tools**: default knowledge tool set is used (no override).") + + return out + + +def _format_judge_model(jm: dict) -> str: + model = jm.get("model_name") or jm.get("modelName") or "" + extras: list[str] = [] + + num_samples = jm.get("num_samples") or jm.get("numSamples") + if isinstance(num_samples, int) and num_samples > 1: + extras.append(f"num_samples={num_samples}") + + gen = jm.get("generation_config") or jm.get("generationConfig") or {} + if isinstance(gen, dict): + if "temperature" in gen: + extras.append(f"temperature={gen['temperature']}") + mt = gen.get("max_tokens") or gen.get("maxTokens") + if mt is not None: + extras.append(f"max_tokens={mt}") + + weight = jm.get("weight") + if isinstance(weight, (int, float)) and float(weight) != 1.0: + extras.append(f"weight={weight}") + + think = jm.get("think") + if think is True: + extras.append("think=True") + elif think is False: + extras.append("think=False") + + base = f"``{model}``" + if extras: + return f"{base} ({', '.join(extras)})" + return base + + +def _render_feedback_fields(metric_name: str) -> list[str]: + out = ["- ``metric_name``, ``status`` (PASSED/FAILED), ``score``, ``threshold`` — always present"] + if metric_name == PrebuiltMetrics.FINAL_RESPONSE_AVG_SCORE.value: + out.append("- ``reason`` — short string (deterministic comparator; synthesized " + "from the criterion config when the matcher leaves it empty)") + return out + + out.append("- ``reason`` — natural-language explanation written by the LLM judge") + if metric_name in 
def _render_reflection_hints(metric_name: str, criterion: dict) -> list[str]:
    """Return rewriting hints for the reflection LM, tailored to the metric.

    For the deterministic final-response metric the hints follow the same
    config resolution as the scoring-algorithm section: the text strategy is
    looked up under ``text`` / ``text_strategy`` / ``textStrategy``, the
    match mode under ``match`` / ``match_strategy`` / ``matchStrategy``
    (stripped and lower-cased), and a strategy flagged ``ignore`` produces
    no hint because that check is skipped.

    Args:
        metric_name: Name of the metric the hints are for.
        criterion: The metric's criterion mapping.

    Returns:
        Markdown bullet lines; empty for metrics without dedicated hints.
    """
    out: list[str] = []

    if metric_name == PrebuiltMetrics.FINAL_RESPONSE_AVG_SCORE.value:
        if _has_custom_compare(metric_name):
            out.append("- Matching is delegated to user-provided Python code; format "
                       "requirements depend entirely on that comparator.")
            return out
        fr = _pick(criterion, "final_response", "finalResponse") or {}
        # Resolve the text strategy with the same aliases the scoring section
        # uses, so both parts of the doc describe the same configuration.
        text = _pick(fr, "text", "text_strategy", "textStrategy") if isinstance(fr, dict) else None
        match = ""
        # An ignored text strategy is never checked, so a "byte-exact" or
        # "contains" hint would send the reflection LM after a non-constraint.
        if isinstance(text, dict) and not text.get("ignore"):
            match = str(
                text.get("match") or text.get("match_strategy") or text.get("matchStrategy")
                or "exact").strip().lower()
        if match == "exact":
            out.append("- Output must be **byte-exact**: stray whitespace or punctuation will FAIL.")
            out.append("- Prompt should constrain the agent to emit *only* the expected literal text "
                       "with no extra prose or formatting.")
        elif match == "contains":
            out.append("- Output must literally **contain** the expected substring.")
            out.append("- Prompt should drive the agent to emit that substring with correct "
                       "word order, punctuation, and units.")
        elif match == "regex":
            out.append("- Output is tested with ``re.search``; ensure the agent's response "
                       "satisfies the regex (think about how greediness and anchoring affect matching).")
        json_cfg = None
        if isinstance(fr, dict):
            json_cfg = fr.get("json") or fr.get("json_strategy") or fr.get("jsonStrategy")
        # Same reasoning as for text: an ignored JSON strategy is not "active".
        json_ignored = isinstance(json_cfg, dict) and bool(json_cfg.get("ignore"))
        if json_cfg and not json_ignored:
            out.append("- JSON comparison is active; when the agent's output is parsed as JSON, "
                       "structural equality (after ``ignore_tree`` removal) matters.")
        return out

    if metric_name == PrebuiltMetrics.LLM_FINAL_RESPONSE.value:
        out.append("- The LLM judge gives a holistic verdict; read its ``reason`` for what swayed it.")
        out.append("- Align the prompt with the qualities the judge consistently rewards.")
        return out

    if metric_name in {
            PrebuiltMetrics.LLM_RUBRIC_RESPONSE.value,
            PrebuiltMetrics.LLM_RUBRIC_KNOWLEDGE_RECALL.value,
    }:
        out.append("- The judge reads each rubric body **literally**. To lift a failing rubric, "
                   "the agent's output must visibly satisfy what that rubric describes.")
        out.append("- Do NOT remove qualities currently scoring 1.0 — examine the passing "
                   "rubrics in the feedback and keep their requirements in your new prompt.")
        out.append("- When a rubric is being judged unfairly, prompt the agent to call out "
                   "the relevant quality explicitly so the judge cannot miss it.")
    return out
" + "READ THEM CAREFULLY — every per-case feedback row references one of these metrics.\n\n") + +_REFLECTION_PROMPT_MID_BARE = ("\n\nBelow are example inputs, the assistant's responses, and per-case feedback " + "summarising how each metric scored the response.\n\n") + +_REFLECTION_PROMPT_FEEDBACK = ("## How to read each example\n\n" + "Every ``# Example N`` block below is a failed case rendered by GEPA " + "as nested markdown headers. The non-self-evident fields:\n\n" + "- ``## score`` is the case-level aggregate on [0, 1] (every metric, " + "every turn, every run rolled into one number); ``1.0`` would mean " + "every metric passed, so all examples here have ``score < 1.0``.\n" + "- ``## Case Body`` — a turn-sliced markdown block; the bulk of the " + "evidence lives here. Format described below.\n" + "- ``## Other Active Components`` *(present iff the candidate has " + "more than one prompt)* — the current text of every prompt OTHER " + "than the one you are about to rewrite (the target prompt is the " + "code-fenced block at the very top of this message). The verdict " + "you see was produced by the agent running with all prompts active, " + "so use these to:\n" + " · avoid restating requirements already enforced elsewhere;\n" + " · avoid contradicting another prompt's instructions;\n" + " · spot gaps that no prompt currently covers.\n" + "- ``## history_top_k`` *(optional, present iff the case has prior " + "high-score runs from earlier candidates)* — a small list of " + "``{score, best_response}`` entries showing what previously scored well " + "on this case. Treat these as anchors: a rewrite that preserves the " + "pattern that produced those high scores is preferable to one that " + "regresses cases the optimizer already solved before.\n\n" + "## Case Body layout\n\n" + "``Case Body`` is a free-text markdown block. 
Each turn is one " + "``### Turn N`` section containing the conversational truth, the " + "agent's actual behaviour, and the per-turn verdict — kept together " + "so each failing metric is visually anchored to the turn that " + "produced it. Inside one turn:\n\n" + "```\n" + "### Turn N\n" + "**User**: \n" + "**Expected**: \n" + "**Agent Response**: \n" + "**Tool Trace**: (omitted if no tools were used)\n" + "- (=, ...) → [id=]\n" + "**Verdict** (Turn N):\n" + " [PASSED|FAILED] : score=, threshold=\n" + " reason: \n" + " · rubric[]: PASS|FAIL score= reason: \n" + "```\n\n" + "Multi-run cases (``num_runs > 1``) nest each run inside the turn:\n\n" + "```\n" + "### Turn N\n" + "**User**: ...\n" + "**Expected**: ...\n" + "\n" + "#### Run 1\n" + "**Agent Response**: ...\n" + "**Tool Trace**: ...\n" + "**Verdict** (Turn N, Run 1):\n" + " ...\n" + "\n" + "#### Run 2\n" + "...\n" + "```\n\n" + "Multi-turn or multi-run cases close with an ``### Overall`` block " + "(``### Overall (case-level aggregate)`` for single-run, " + "``### Overall (per-run aggregate)`` for multi-run). Single-turn " + "single-run cases skip the Overall block because Turn 1 already " + "carries the only verdict that exists.\n\n" + "## Reading rules\n\n" + "- The reference answer ONLY appears in ``**Expected**``; it is " + "deliberately not echoed inside the Verdict line, so do not look for " + "it there.\n" + "- Every ```` in a Verdict line maps directly to a " + "``### Metric: `` section in the Metrics Reference above " + "— consult it for how the score is computed before deciding what to " + "change.\n" + "- Treat PASSING metrics as constraints, not noise: a rewrite that " + "fixes a FAILING metric while regressing a PASSING one is a " + "regression, not an improvement.\n\n" + "Examples follow:\n" + "```\n\n```\n\n" + "Read each example end-to-end, then rewrite the instruction so PASSING " + "metrics stay passing and FAILING metrics improve. 
Provide the new " + "instruction inside ``` blocks.\n") + + +def build_reflection_prompt_template(metric_reference_doc: str) -> str: + """Build the prompt template handed to GEPA's reflection LM. + + GEPA fills ```` with the current prompt text and ```` + with the rendered per-case feedback. The metric reference doc is wedged + between them so the LM has: (1) the current prompt, (2) a static metric + syllabus, (3) live per-case feedback, in that order. + + GEPA's ``InstructionProposalSignature.validate_prompt_template`` enforces + that both placeholders are present, so we always keep them — even when + ``metric_reference_doc`` is empty. + """ + doc = (metric_reference_doc or "").strip() + if doc: + middle = _REFLECTION_PROMPT_MID_WITH_DOC + doc + "\n\n" + else: + middle = _REFLECTION_PROMPT_MID_BARE + return _REFLECTION_PROMPT_PREFIX + middle + _REFLECTION_PROMPT_FEEDBACK + + +def _pick(d: dict, *keys: str) -> Optional[Any]: + """Return the first present value among ``keys`` (handles camelCase/snake_case aliases).""" + if not isinstance(d, dict): + return None + for k in keys: + if k in d: + return d[k] + return None diff --git a/trpc_agent_sdk/evaluation/_optimize_model_callable.py b/trpc_agent_sdk/evaluation/_optimize_model_callable.py new file mode 100644 index 00000000..9465dce1 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_optimize_model_callable.py @@ -0,0 +1,309 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Synchronous LLM callable for optimizer prompt-rewrite operations. + +Conforms to gepa's ``LanguageModel`` Protocol so the same instance +serves as ``reflection_lm`` for ``gepa.optimize``. Internally drives a +framework :class:`LlmAgent` so optimize-model configuration honours +the framework's provider routing, env-variable expansion, and +``extra_fields`` pass-through. 
+""" + +from __future__ import annotations + +import asyncio +import copy +import os +import uuid +from typing import Any +from typing import Optional +from typing import Union + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.context import InvocationContext +from trpc_agent_sdk.context import create_agent_context +from trpc_agent_sdk.context import new_invocation_context_id +from trpc_agent_sdk.models import ModelRegistry +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.planners import BuiltInPlanner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import GenerateContentConfig +from trpc_agent_sdk.types import HttpOptions +from trpc_agent_sdk.types import Part +from trpc_agent_sdk.types import ThinkingConfig + +from ._optimize_model_options import OptimizeModelOptions + +DEFAULT_OPTIMIZE_MAX_TOKENS = 4096 +DEFAULT_OPTIMIZE_TEMPERATURE = 0.8 + + +def _expand_env(s: str) -> str: + """Expand environment variables in a string (e.g. 
$VAR or ${VAR}).""" + if not s or not isinstance(s, str): + return s or "" + return os.path.expandvars(s) + + +def _merge_extra_body( + http_options: Optional[HttpOptions], + patch: dict[str, Any], +) -> HttpOptions: + """Deep-merge patch into http_options.extra_body at nested-dict granularity.""" + base = (http_options.extra_body or {}) if http_options is not None else {} + merged: dict[str, Any] = dict(base) + for key, patch_val in patch.items(): + base_val = merged.get(key) + if isinstance(base_val, dict) and isinstance(patch_val, dict): + new_child = dict(base_val) + for subkey, subval in patch_val.items(): + new_child[subkey] = copy.deepcopy(subval) + merged[key] = new_child + else: + merged[key] = copy.deepcopy(patch_val) + if http_options is None: + return HttpOptions(extra_body=merged) + return http_options.model_copy(update={"extra_body": merged}) + + +def _create_optimize_model(opts: OptimizeModelOptions) -> Any: + """Build the underlying LLM model for an optimizer's LLM-driven operations. + + Provider routing: + - provider_name empty or "openai" -> OpenAIModel(...) directly. This + matches the framework's standard pattern for OpenAI-compatible + endpoints and forwards http_options.extra_body to the backend. + - Any other provider_name -> ModelRegistry.create_model("{provider}/{model}") + which routes to LiteLLMModel for multi-provider support. 
# yapf: disable
def _build_optimize_generation_config(
    opts: OptimizeModelOptions,
) -> tuple[GenerateContentConfig, Optional[ThinkingConfig]]:
    # yapf: enable
    """Build (GenerateContentConfig, ThinkingConfig | None) from OptimizeModelOptions.

    Returns thinking_config separately because LlmAgent rejects it on
    GenerateContentConfig and requires it via BuiltInPlanner.

    Resolution order:
      1. Base fields (max_tokens/temperature/top_p/stop/...) from generation_config.
      2. thinking_config dict -> candidate ThinkingConfig (not written to cfg).
      3. http_options dict -> cfg.http_options (if present).
      4. opts.think overrides both paths when set.

    Args:
        opts: Optimizer model options; only ``generation_config`` and
            ``think`` are read here.

    Returns:
        ``(cfg, thinking_config)`` where ``thinking_config`` is ``None`` unless
        ``generation_config["thinking_config"]`` is a dict or ``opts.think``
        is explicitly ``True``/``False``.
    """
    gen = opts.generation_config or {}
    cfg = GenerateContentConfig()
    # `or` chain: a missing, None or 0 "max_tokens" falls through to
    # "max_output_tokens" and finally to DEFAULT_OPTIMIZE_MAX_TOKENS.
    cfg.max_output_tokens = (gen.get("max_tokens") or gen.get("max_output_tokens") or DEFAULT_OPTIMIZE_MAX_TOKENS)
    # The default applies only when the key is absent; an explicit None in the
    # config is passed through as-is.
    cfg.temperature = gen.get("temperature", DEFAULT_OPTIMIZE_TEMPERATURE)
    if "top_p" in gen and gen["top_p"] is not None:
        cfg.top_p = gen["top_p"]
    # "stop" wins over "stop_sequences"; a scalar stop value is wrapped in a list.
    if "stop" in gen and gen["stop"] is not None:
        cfg.stop_sequences = (gen["stop"] if isinstance(gen["stop"], list) else [gen["stop"]])
    elif "stop_sequences" in gen and gen["stop_sequences"] is not None:
        cfg.stop_sequences = gen["stop_sequences"]
    # NOTE(review): setattr instead of plain attribute assignment — presumably
    # because these fields are not declared on GenerateContentConfig; confirm.
    if "presence_penalty" in gen and gen["presence_penalty"] is not None:
        setattr(cfg, "presence_penalty", gen["presence_penalty"])
    if "frequency_penalty" in gen and gen["frequency_penalty"] is not None:
        setattr(cfg, "frequency_penalty", gen["frequency_penalty"])

    # Candidate thinking config from generation_config; may be replaced below.
    effective_thinking_config: Optional[ThinkingConfig] = None
    tc_dict = gen.get("thinking_config")
    if isinstance(tc_dict, dict):
        effective_thinking_config = ThinkingConfig(**tc_dict)

    http_opts_dict = gen.get("http_options")
    if isinstance(http_opts_dict, dict):
        cfg.http_options = HttpOptions(**http_opts_dict)

    # An explicit opts.think replaces any thinking_config built above and also
    # merges chat_template_kwargs.enable_thinking into http_options.extra_body.
    # NOTE(review): thinking_budget=-1 / 0 presumably mean "unbounded" /
    # "disabled" for the backend — confirm against the ThinkingConfig docs.
    if opts.think is True:
        effective_thinking_config = ThinkingConfig(
            include_thoughts=True,
            thinking_budget=-1,
        )
        cfg.http_options = _merge_extra_body(
            cfg.http_options,
            {"chat_template_kwargs": {
                "enable_thinking": True
            }},
        )
    elif opts.think is False:
        effective_thinking_config = ThinkingConfig(
            include_thoughts=False,
            thinking_budget=0,
        )
        cfg.http_options = _merge_extra_body(
            cfg.http_options,
            {"chat_template_kwargs": {
                "enable_thinking": False
            }},
        )

    return cfg, effective_thinking_config
+ """ + if not event.is_final_response(): + return "" + if not event.content or not event.content.parts: + return "" + return "\n".join((p.text or "").strip() for p in event.content.parts if p.thought is not True).strip() + + +def _flatten_messages(prompt: Union[str, list[dict[str, Any]]]) -> str: + """Flatten gepa's prompt forms into a single user-text string. + + Accepts: + - str: returned verbatim + - list[dict]: messages with role/content; joined with role tags so the + downstream LlmAgent receives a single user turn that preserves the + original conversation structure + """ + if isinstance(prompt, str): + return prompt + if not isinstance(prompt, list): + return str(prompt) + parts: list[str] = [] + for msg in prompt: + if not isinstance(msg, dict): + parts.append(str(msg)) + continue + role = msg.get("role", "user") + content = msg.get("content", "") + if isinstance(content, list): + content = "".join(c.get("text", str(c)) for c in content if isinstance(c, dict)) + parts.append(f"[{role}]\n{content}") + return "\n\n".join(parts) + + +class _OptimizeModelCallable: + """Synchronous LLM callable wrapping a framework `LlmAgent`. + + Conforms to gepa's `LanguageModel` Protocol: + - `__call__(prompt: str | list[dict]) -> str` + - `total_cost: float` attribute (used by gepa's MaxReflectionCostStopper) + + LlmAgent topology: instruction = "" (callers embed their own system text + inside the prompt), single user turn, no tools, no planner unless + `think` requests one, output_schema = None. 
+ """ + + def __init__(self, opts: OptimizeModelOptions) -> None: + model = _create_optimize_model(opts) + cfg, thinking_config = _build_optimize_generation_config(opts) + planner = (BuiltInPlanner(thinking_config=thinking_config) if thinking_config is not None else None) + self._agent = LlmAgent( + name="optimize_model", + model=model, + instruction="", + generate_content_config=cfg, + add_name_to_instruction=False, + output_schema=None, + tools=[], + planner=planner, + ) + self._session_service = InMemorySessionService() + self.total_cost: float = 0.0 + self.total_calls: int = 0 + self.total_token_usage: dict[str, int] = { + "prompt": 0, + "completion": 0, + "total": 0, + } + + def __call__(self, prompt: Union[str, list[dict[str, Any]]]) -> str: + user_text = _flatten_messages(prompt) + self.total_calls += 1 + return asyncio.run(self._run_async(user_text)) + + async def _run_async(self, user_text: str) -> str: + user_content = Content(role="user", parts=[Part.from_text(text=user_text)]) + agent_context = create_agent_context() + session = await self._session_service.create_session( + app_name="optimizer", + user_id="optimize_model", + session_id=str(uuid.uuid4()), + agent_context=agent_context, + ) + ctx = InvocationContext( + session_service=self._session_service, + invocation_id=new_invocation_context_id(), + agent=self._agent, + session=session, + agent_context=agent_context, + user_content=user_content, + override_messages=[user_content], + ) + last_text = "" + async for event in self._agent.run_async(ctx): + part_text = _extract_final_text(event) + if part_text: + last_text += part_text + usage = getattr(event, "usage_metadata", None) + if usage is not None: + self._accumulate_usage(usage) + return last_text.strip() + + def _accumulate_usage(self, usage: Any) -> None: + """Add a single ``usage_metadata`` snapshot into ``total_token_usage``. + + Tolerant to Pydantic models, dict, or arbitrary attribute-bearing + objects so it works across model providers. 
+ """ + prompt = self._read_count(usage, ("prompt_token_count", "input_tokens", "prompt_tokens")) + completion = self._read_count( + usage, + ("candidates_token_count", "output_tokens", "completion_tokens"), + ) + total = self._read_count(usage, ("total_token_count", "total_tokens")) + if total <= 0 and (prompt > 0 or completion > 0): + total = prompt + completion + self.total_token_usage["prompt"] += prompt + self.total_token_usage["completion"] += completion + self.total_token_usage["total"] += total + + @staticmethod + def _read_count(usage: Any, names: tuple[str, ...]) -> int: + """Return the first non-None int among the candidate attribute / key names.""" + for name in names: + value = None + if isinstance(usage, dict): + value = usage.get(name) + else: + value = getattr(usage, name, None) + if isinstance(value, int): + return value + if isinstance(value, float): + return int(value) + return 0 diff --git a/trpc_agent_sdk/evaluation/_optimize_model_options.py b/trpc_agent_sdk/evaluation/_optimize_model_options.py new file mode 100644 index 00000000..7e15c549 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_optimize_model_options.py @@ -0,0 +1,45 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
class OptimizeModelOptions(EvalBaseModel):
    """LLM configuration for proposing new prompt candidates.

    Every field has a default, so an empty mapping is a valid configuration.
    NOTE(review): ``variant``, ``num_samples`` and ``weight`` are declared
    here but are not read by ``_create_optimize_model`` or
    ``_build_optimize_generation_config`` in ``_optimize_model_callable.py``;
    presumably they are consumed elsewhere or kept for parity with the
    judge-model options — confirm.
    """

    # Empty or "openai" (case-insensitive) selects OpenAIModel; any other
    # value is routed through ModelRegistry as "<provider>/<model>".
    provider_name: str = Field(default="", description="LLM provider name.")
    model_name: str = Field(default="", description="Model name.")
    variant: str = Field(default="", description="OpenAI-compatible variant when provider is openai.")
    # provider_name, model_name, base_url and api_key are passed through
    # os.path.expandvars when the model is built, so "$VAR" / "${VAR}" works.
    base_url: Optional[str] = Field(default=None, description="Custom endpoint URL.")
    api_key: str = Field(default="", description="API key.")
    # Forwarded as extra keyword arguments to the model constructor.
    extra_fields: Optional[dict[str, Any]] = Field(
        default=None,
        description="Extra provider-specific fields.",
    )
    num_samples: Optional[int] = Field(
        default=None,
        description="Number of samples per call.",
    )
    # Keys read when building the generation config: max_tokens /
    # max_output_tokens, temperature, top_p, stop / stop_sequences,
    # presence_penalty, frequency_penalty, thinking_config, http_options.
    generation_config: Optional[dict[str, Any]] = Field(
        default=None,
        description="Generation params: max_tokens, temperature, stream, etc.",
    )
    weight: float = Field(
        default=1.0,
        description="Weight for aggregation across samples.",
    )
    # Tri-state: an explicit True/False overrides generation_config's
    # thinking_config; None leaves it untouched.
    think: Optional[bool] = Field(
        default=None,
        description="Thinking mode toggle. None: no change; False: disable; True: enable.",
    )
class OptimizerRegistry:
    """Name-to-class lookup table for optimizer algorithms.

    Only ``BaseOptimizer`` subclasses can be registered. Registering a name
    a second time replaces the earlier entry.
    """

    def __init__(self) -> None:
        self._registry: dict[str, Type[BaseOptimizer]] = {}

    def register(self, name: str, optimizer_class: Type[BaseOptimizer]) -> None:
        """Store ``optimizer_class`` under ``name``.

        Raises:
            TypeError: ``optimizer_class`` is not a class, or is a class that
                does not derive from ``BaseOptimizer``.
        """
        # isclass is checked first so issubclass never sees a non-class.
        acceptable = inspect.isclass(optimizer_class) and issubclass(optimizer_class, BaseOptimizer)
        if not acceptable:
            raise TypeError(f"optimizer_class must be a subclass of BaseOptimizer, "
                            f"got {optimizer_class!r}")
        self._registry[name] = optimizer_class

    def list_registered(self) -> list[str]:
        """Return the registered algorithm names in sorted order."""
        return sorted(self._registry)

    def get(self, name: str) -> Type[BaseOptimizer]:
        """Return the class registered under ``name``.

        Raises:
            ValueError: nothing is registered under ``name``; the message
                lists the available algorithm names.
        """
        if name in self._registry:
            return self._registry[name]
        raise ValueError(f"No optimizer registered for algorithm: {name}. "
                         f"Available algorithms: {self.list_registered()}")
+""" + +from __future__ import annotations + +import logging +import os +import sys +from dataclasses import dataclass +from dataclasses import field +from typing import TYPE_CHECKING +from typing import Any +from typing import Literal +from typing import Optional +from typing import Protocol +from typing import TextIO +from typing import runtime_checkable + +if TYPE_CHECKING: + from ._optimize_result import OptimizeResult + +logger = logging.getLogger(__name__) + +_GEPA_LOGGER_NAME = "trpc_agent_sdk.optimizer.gepa" + +_MAX_TARGET_FIELDS_IN_HEADER = 8 +_FIELD_NAME_DISPLAY_LIMIT = 40 + + +@dataclass(frozen=True) +class RunHeader: + """Static run context shown at run start. + + Attributes: + algorithm: Registered algorithm name (e.g. ``gepa_reflective``). + target_fields: Ordered ``(field_name, source_repr)`` pairs; + ``source_repr`` is the file path for ``add_path`` fields or + ``""`` for ``add_callback`` fields. + train_size: Training case count. + val_size: Validation case count. + metric_names: Display names of every reported metric. + output_dir: Resolved artifact directory. + budget_total: Configured metric-call budget (e.g. + ``max_metric_calls``); ``None`` falls back to an + indeterminate progress display. + """ + + algorithm: str + target_fields: list[tuple[str, str]] + train_size: int + val_size: int + metric_names: list[str] + output_dir: str + budget_total: Optional[int] = None + + +@dataclass(frozen=True) +class RoundView: + """Single-round summary for one per-round line. + + Attributes: + round: 1-based round index from the algorithm. + kind: ``"reflective"`` (default) or ``"merge"``; unknown values + render as ``"reflective"``. + train_minibatch_size: ``M`` in ``train(M/N)``; 0 when the round + skipped before sampling. + train_size: ``N`` — full training set size. + train_subsample_parent_score: Parent's score on the minibatch + (None when no subsample produced). + train_subsample_candidate_score: New candidate's score (None + when not evaluated). 
@runtime_checkable
class OptimizeReporter(Protocol):
    """Five-event surface every backend implements.

    Implementations swallow render errors; the facade also guards each
    call so a broken reporter never breaks optimization.

    The events are ``run_started``, ``baseline_evaluated``,
    ``round_completed``, ``run_finished`` and ``run_failed``. Every method
    returns ``None``; reporters are pure sinks.
    """

    def run_started(self, header: RunHeader) -> None:
        """Receive the static run context (``RunHeader``) shown at run start."""
        ...

    def baseline_evaluated(
        self,
        pass_rate: float,
        metric_breakdown: dict[str, float],
        *,
        metric_thresholds: Optional[dict[str, float]] = None,
    ) -> None:
        """Receive the baseline evaluation result.

        Args:
            pass_rate: Baseline pass rate.
                NOTE(review): presumably measured on the validation set —
                confirm with the emitting algorithm.
            metric_breakdown: Score per metric name.
            metric_thresholds: Optional threshold per metric name, used to
                label each baseline score as passing or failing.
        """
        ...

    def round_completed(self, view: RoundView) -> None:
        """Receive the summary (``RoundView``) of one finished round."""
        ...

    def run_finished(
        self,
        result: "OptimizeResult",
        *,
        output_dir: str,
        update_source: bool,
    ) -> None:
        """Receive the final ``OptimizeResult`` of a run.

        Args:
            result: The optimizer's result object.
            output_dir: Artifact directory of the run.
            update_source: NOTE(review): presumably whether the optimized
                prompts were written back to their source — confirm.
        """
        ...

    def run_failed(
        self,
        *,
        baseline_prompts: dict[str, str],
        output_dir: str,
        error_message: str,
    ) -> None:
        """Receive notice that the run ended in failure.

        Args:
            baseline_prompts: Prompt text per field name.
                NOTE(review): presumably the untouched starting prompts —
                confirm.
            output_dir: Artifact directory of the run.
            error_message: Description of the failure.
        """
        ...
+ + +class _NullReporter: + """No-op reporter used when ``verbose=0``.""" + + def run_started(self, header: RunHeader) -> None: + return None + + def baseline_evaluated( + self, + pass_rate: float, + metric_breakdown: dict[str, float], + *, + metric_thresholds: Optional[dict[str, float]] = None, + ) -> None: + return None + + def round_completed(self, view: RoundView) -> None: + return None + + def run_finished( + self, + result: "OptimizeResult", + *, + output_dir: str, + update_source: bool, + ) -> None: + return None + + def run_failed( + self, + *, + baseline_prompts: dict[str, str], + output_dir: str, + error_message: str, + ) -> None: + return None + + +def _truncate(text: str, limit: int) -> str: + """Return ``text`` shortened to at most ``limit`` characters with ellipsis.""" + if len(text) <= limit: + return text + return text[:max(0, limit - 3)] + "..." + + +def _format_source(source_repr: str) -> str: + """Compact a target source for display in the run header. + + File-backed sources collapse to their basename (full path remains in + ``config.snapshot.json`` / ``result.json``); callback sources keep their + sentinel ```` form. 
+ """ + if source_repr == "": + return source_repr + return os.path.basename(source_repr) or source_repr + + +def _format_sample_score_segment(view: RoundView, *, ascii_only: bool) -> str: + """Render the ``sample score parent → candidate`` segment, or empty when absent.""" + parent = view.train_subsample_parent_score + candidate = view.train_subsample_candidate_score + if parent is None and candidate is None: + return "" + arrow = "->" if ascii_only else "→" + if parent is None: + return f"sample score {candidate:.2f}" + if candidate is None: + return f"sample score {parent:.2f}" + return f"sample score {parent:.2f} {arrow} {candidate:.2f}" + + +def _format_evaluations_segment(view: RoundView) -> str: + """Render the trailing ``evaluations used/total`` segment, or empty when not tracked.""" + if view.budget_used is None: + return "" + total = "auto" if view.budget_total is None else str(view.budget_total) + return f"evaluations {view.budget_used}/{total}" + + +def _round_marker(view: RoundView, *, ascii_only: bool) -> str: + """Return the leading marker glyph for a round line. + + Glyph → meaning mapping (kept identical between ASCII and Rich): + + * ``✓`` accepted — candidate beat the current best on valset. + * ``○`` explored — full valset evaluation ran but did not improve. + * ``·`` skipped — subsample gate / no-proposal / cache hit etc. + * ``↻`` merge — gepa system-aware merge round. + * ``✗`` error — round ended in an algorithm error. + """ + if view.error_message: + return "x" if ascii_only else "✗" + if view.skip_reason: + return "." 
if ascii_only else "·" + if view.kind == "merge": + return "~" if ascii_only else "↻" + if view.accepted: + return "OK" if ascii_only else "✓" + return "-" if ascii_only else "○" + + +def _round_status_word(view: RoundView) -> str: + """Return the textual status label rendered next to the round marker.""" + if view.error_message: + return "error" + if view.skip_reason: + return "skipped" + if view.kind == "merge": + return "merged" if view.accepted else "merge" + if view.accepted: + return "accepted" + return "explored" + + +def _format_stop_reason_text(stop_reason: Optional[str]) -> Optional[str]: + """Translate ``OptimizeResult.stop_reason`` into the reporter row text. + + Returns ``None`` when no row should be emitted (i.e. the run errored + before any stopper could classify a reason). + """ + if stop_reason is None: + return None + text_by_reason = { + "required_metrics_passing": "required metrics met thresholds", + "budget_exhausted": "budget exhausted (max_metric_calls reached)", + "no_improvement": "no improvement for the configured number of rounds", + "timeout": "timeout reached", + "score_threshold": "score threshold reached", + "max_candidate_proposals": "max candidate proposals reached", + "max_tracked_candidates": "max tracked candidates reached", + "user_requested_stop": "user requested stop (optimize.stop touched)", + "completed": "completed (no stopper triggered)", + } + return text_by_reason.get(stop_reason, stop_reason) + + +def _round_legend_lines(*, ascii_only: bool) -> list[str]: + """Return the static legend block describing round-line semantics. + + Printed once between header and baseline so users can decode every + subsequent round line without scrolling back. + """ + arrow = "->" if ascii_only else "→" + accepted = "OK" if ascii_only else "✓" + explored = "-" if ascii_only else "○" + skipped = "." 
if ascii_only else "·" + merge = "~" if ascii_only else "↻" + error = "x" if ascii_only else "✗" + return [ + "Round line legend:", + f" format : round N train sample M/N " + f"sample score parent {arrow} candidate " + f"valset pass_rate Z evaluations used/total duration", + f" status : {accepted} accepted {explored} explored " + f"{skipped} skipped {merge} merge {error} error", + " train : a minibatch of M cases sampled from the N-case training set " + "for the reflective step.", + " sample : parent vs new candidate score on that minibatch " + "(skip gate decides whether to run valset).", + " valset : pass_rate over the full validation set when the candidate " + "cleared the skip gate.", + " budget : evaluations used / configured budget (metric calls).", + ] + + +def _improvement_arrow(delta: float, *, ascii_only: bool) -> str: + """Return the directional arrow for a pass-rate delta.""" + if delta > 0: + return "^" if ascii_only else "▲" + if delta < 0: + return "v" if ascii_only else "▼" + return "=" + + +def _format_improvement_label(delta: float) -> str: + """Return a textual label describing the improvement direction.""" + if delta > 0: + return "improved" + if delta < 0: + return "regressed" + return "no improvement" + + +def _format_round_line(view: RoundView, *, ascii_only: bool) -> str: + """Render a single-line per-round summary in ASCII form. + + Layout: `` round N train sample M/N sample score X -> Y + evaluations U/T ``. Segments + that do not apply to the current round (e.g. ``sample score`` for skipped + rounds without subsample data) are omitted. 
def _ordered_metric_keys(*breakdowns: dict[str, float], extra: Optional[list[str]] = None) -> list[str]:
    """Merge metric names into one first-seen-order list without duplicates.

    Names from ``extra`` (when given and non-empty) come first, followed by
    the keys of each breakdown in argument order.
    """
    names: list[str] = list(extra) if extra else []
    for breakdown in breakdowns:
        names.extend(breakdown.keys())
    # dict preserves insertion order, so this drops repeats but keeps ranking.
    return list(dict.fromkeys(names))


def _format_score(value: Optional[float]) -> str:
    """Format a metric score to four decimals, or a dash placeholder for ``None``."""
    return " - " if value is None else f"{value:.4f}"


def _format_delta(value: float, *, ascii_only: bool) -> tuple[str, str]:
    """Build the ``(arrow, text)`` pair for one per-metric delta.

    The text carries an explicit ``+`` for non-negative values; negative
    values rely on the minus sign produced by the number format itself.
    """
    glyph = _improvement_arrow(value, ascii_only=ascii_only)
    prefix = "+" if value >= 0 else ""
    return glyph, f"{prefix}{value:.4f}"
class _AsciiReporter:
    """Dependency-free reporter used as fallback for non-Rich environments.

    Every event is rendered as ordered plain text and written to ``stream``
    through ``_writelines``, which keeps the output safe for log files and CI
    pipes. ASCII glyphs are substituted when the stream encoding cannot
    represent the Unicode marker set.
    """

    def __init__(self, *, stream: Optional[TextIO] = None, verbose: int = 1) -> None:
        """Bind the reporter to ``stream`` and probe its glyph support.

        Args:
            stream: Destination text stream. ``None`` resolves to the
                ``sys.stdout`` that is current when the reporter is built, so
                a redirection installed after import time is honoured (a
                ``sys.stdout`` default would be frozen at definition time).
            verbose: Verbosity level, stored on the instance.
        """
        self._stream = sys.stdout if stream is None else stream
        self._verbose = verbose
        self._ascii_only = self._detect_ascii_only()

    def _detect_ascii_only(self) -> bool:
        """Return True when the stream encoding cannot render Unicode glyphs."""
        encoding = getattr(self._stream, "encoding", None) or sys.getdefaultencoding()
        try:
            # Probe every non-ASCII glyph the reporter may emit, including the
            # legend arrow and the title separator.
            "✓✗·↻▲▼○→".encode(encoding)
        except (LookupError, UnicodeEncodeError):
            return True
        return False

    def run_started(self, header: RunHeader) -> None:
        """Write the run header followed by the round-line legend.

        Args:
            header: Run description; the algorithm, target fields, dataset
                sizes, metric names, budget and output directory are read.
        """
        # The title separator must follow the ASCII fallback as well: a single
        # unencodable glyph makes the whole header write fail.
        separator = "-" if self._ascii_only else "·"
        lines = [
            "",
            "=" * 80,
            f" AgentOptimizer {separator} {header.algorithm}",
            "=" * 80,
            self._format_targets_line(header.target_fields),
        ]
        for name, src in header.target_fields[:_MAX_TARGET_FIELDS_IN_HEADER]:
            display_name = _truncate(name, _FIELD_NAME_DISPLAY_LIMIT)
            lines.append(f" - {display_name:<40s} ({_format_source(src)})")
        if len(header.target_fields) > _MAX_TARGET_FIELDS_IN_HEADER:
            extra = len(header.target_fields) - _MAX_TARGET_FIELDS_IN_HEADER
            lines.append(f" ... and {extra} more")
        lines.append(f" train/val : {header.train_size} / {header.val_size} cases")
        lines.append(f" metrics : {len(header.metric_names)} configured")
        for name in header.metric_names:
            lines.append(f" - {name}")
        if header.budget_total is not None:
            lines.append(f" budget : {header.budget_total} metric calls")
        else:
            lines.append(" budget : auto (no explicit cap)")
        lines.append(f" output_dir : {header.output_dir}")
        lines.append("-" * 80)
        lines.append("")
        lines.extend(_round_legend_lines(ascii_only=self._ascii_only))
        lines.append("")
        self._writelines(lines)

    @staticmethod
    def _format_targets_line(target_fields: list[tuple[str, str]]) -> str:
        """Return the header row that counts the target fields (singular for exactly one)."""
        if len(target_fields) == 1:
            return " target : 1 field"
        return f" targets : {len(target_fields)} fields"

    def baseline_evaluated(
        self,
        pass_rate: float,
        metric_breakdown: dict[str, float],
        *,
        metric_thresholds: Optional[dict[str, float]] = None,
    ) -> None:
        """Write the baseline pass rate and one row per metric.

        Args:
            pass_rate: Baseline pass rate, printed with four decimals.
            metric_breakdown: Baseline score per metric name.
            metric_thresholds: Optional threshold per metric name; metrics that
                only appear here are still listed, with a placeholder score.
        """
        thresholds = metric_thresholds or {}
        lines = [f"baseline pass_rate = {pass_rate:.4f}"]
        # Threshold keys are passed as the ordering hint, so they lead the list.
        keys = _ordered_metric_keys(metric_breakdown, extra=list(thresholds.keys()))
        if keys:
            lines.append(" per-metric (threshold | score | status):")
            for name in keys:
                score = metric_breakdown.get(name)
                threshold = thresholds.get(name)
                status = _baseline_metric_status(score, threshold, ascii_only=self._ascii_only)
                threshold_str = (f"{threshold:.4f}" if threshold is not None else " - ")
                score_str = _format_score(score)
                lines.append(f" - {name:<40s} threshold {threshold_str} "
                             f"{score_str} {status}")
        lines.append("")
        self._writelines(lines)

    def round_completed(self, view: RoundView) -> None:
        """Write the single-line summary for one finished round."""
        self._writelines([_format_round_line(view, ascii_only=self._ascii_only)])
update_source=update_source, + )) + + def run_failed( + self, + *, + baseline_prompts: dict[str, str], + output_dir: str, + error_message: str, + ) -> None: + self._writelines([ + "", + "=" * 80, + " Optimization FAILED", + "=" * 80, + f" error : {error_message}", + f" output_dir : {output_dir}", + f" baseline preserved at {os.path.join(output_dir, 'baseline_prompts')}", + "=" * 80, + "", + ]) + + def _build_summary_lines( + self, + *, + result: "OptimizeResult", + output_dir: str, + update_source: bool, + ) -> list[str]: + """Return the multi-line summary block printed at run finish.""" + arrow = _improvement_arrow(result.pass_rate_improvement, ascii_only=self._ascii_only) + label = _format_improvement_label(result.pass_rate_improvement) + accepted = sum(1 for r in result.rounds if r.accepted) + sign = "+" if result.pass_rate_improvement >= 0 else "" + rate_line = (f" pass_rate : {result.baseline_pass_rate:.4f} -> {result.best_pass_rate:.4f}" + f" {arrow} {sign}{result.pass_rate_improvement:.4f} ({label})") + lines = [ + "=" * 80, + f" Optimization complete · {result.status}", + "=" * 80, + rate_line, + f" rounds : {accepted} accepted / {result.total_rounds} total", + f" duration : {result.duration_seconds:.2f}s", + ] + stop_text = _format_stop_reason_text(result.stop_reason) + if stop_text is not None: + lines.append(f" stopped by : {stop_text}") + if result.status != "SUCCEEDED" and result.error_message: + lines.append(f" error : {result.error_message}") + metric_keys = _ordered_metric_keys( + result.baseline_metric_breakdown, + result.best_metric_breakdown, + extra=list(result.metric_thresholds.keys()), + ) + if metric_keys: + lines.append(" per-metric : threshold | baseline -> best | delta | status") + for name in metric_keys: + base = result.baseline_metric_breakdown.get(name) + best = result.best_metric_breakdown.get(name) + threshold = result.metric_thresholds.get(name) + delta = (best or 0.0) - (base or 0.0) + d_arrow, d_text = _format_delta(delta, 
ascii_only=self._ascii_only) + base_str = _format_score(base) + best_str = _format_score(best) + threshold_str = (f"{threshold:.4f}" if threshold is not None else " - ") + status = _baseline_metric_status(best, threshold, ascii_only=self._ascii_only) + lines.append(f" - {name:<40s} threshold {threshold_str} " + f"{base_str} -> {best_str} {d_arrow} {d_text} {status}") + update_msg = self._format_update_source_line(result=result, output_dir=output_dir, update_source=update_source) + if update_msg: + lines.append(update_msg) + lines.extend(self._format_artifacts_block(result=result, output_dir=output_dir)) + lines.append("=" * 80) + lines.append("") + return lines + + @staticmethod + def _format_update_source_line( + *, + result: "OptimizeResult", + output_dir: str, + update_source: bool, + ) -> Optional[str]: + """Return the ``update_source`` row text or ``None`` to omit it.""" + if not update_source: + best_dir = os.path.join(output_dir, "best_prompts") + return f" update_source: false (best prompts at {best_dir}/)" + if result.status == "SUCCEEDED": + return " update_source: true (best written back to target sources)" + return " update_source: true (run failed; sources restored from baseline)" + + @staticmethod + def _format_artifacts_block( + *, + result: "OptimizeResult", + output_dir: str, + ) -> list[str]: + """Return the artifact directory listing lines for the summary.""" + lines = [" artifacts :"] + lines.append(f" {output_dir}/") + for name, content in result.best_prompts.items(): + display = _truncate(name, _FIELD_NAME_DISPLAY_LIMIT) + lines.append(f" best_prompts/{display}.md ({len(content)} chars)") + lines.append(" result.json summary.txt rounds/ run.log") + return lines + + def _writelines(self, lines: list[str]) -> None: + """Write a list of lines to the stream, swallowing render errors.""" + try: + self._stream.write("\n".join(lines)) + self._stream.write("\n") + try: + self._stream.flush() + except (AttributeError, ValueError): # pragma: no cover - 
class _RichReporter:
    """Rich-backed reporter that degrades to plain output on non-TTY streams.

    Uses Rich panels for the header and the closing summary, a Live region
    with a progress bar over the configured metric-call budget for the
    duration of the run, and a single coloured line per round. The underlying
    ``rich.console.Console`` auto-detects whether the stream supports ANSI
    sequences.
    """

    def __init__(self, *, stream: Optional[TextIO] = None, verbose: int = 1) -> None:
        """Create the Rich console and the plain-text companion reporter.

        Args:
            stream: Destination text stream. ``None`` resolves to the
                ``sys.stdout`` that is current at construction time, so a
                redirection installed after import is honoured (a
                ``sys.stdout`` default would be frozen at definition time).
            verbose: Verbosity level, stored and forwarded to the ASCII reporter.
        """
        # Imported lazily so the module stays importable without rich.
        from rich.console import Console

        if stream is None:
            stream = sys.stdout
        self._stream = stream
        self._verbose = verbose
        self._console = Console(
            file=stream,
            force_terminal=None,
            highlight=False,
            soft_wrap=False,
        )
        self._ascii = _AsciiReporter(stream=stream, verbose=verbose)
        # Progress state is created in ``run_started``.
        self._progress = None
        self._budget_task = None
        self._budget_total: Optional[int] = None
and {remainder} more[/dim]") + + table.add_row("train/val", f"{header.train_size} / {header.val_size} cases") + metric_count_label = ("metric" if len(header.metric_names) == 1 else f"metrics ({len(header.metric_names)})") + table.add_row(metric_count_label, "") + for name in header.metric_names: + table.add_row("", f"- {name}") + + budget_text = (f"{header.budget_total} metric calls" + if header.budget_total is not None else "auto (no explicit cap)") + table.add_row("budget", budget_text) + table.add_row("output_dir", header.output_dir) + + panel = Panel( + table, + title=f"[bold]AgentOptimizer[/bold] · [cyan]{header.algorithm}[/cyan]", + box=box.ROUNDED, + padding=(0, 1), + ) + self._console.print(panel) + self._console.print("") + for line in _round_legend_lines(ascii_only=False): + self._console.print(f"[dim]{line}[/dim]") + self._console.print("") + + self._budget_total = header.budget_total + # ``auto_refresh=False`` keeps the Live region quiescent between + # explicit refresh calls — embedded IDE terminals and some CI + # captures don't honour rich's cursor-up escapes, so the default + # 10Hz auto-refresh would re-print the bar instead of erasing + # it. Manual refresh on each ``round_completed`` keeps the + # output bounded to one line per event. 
+ self._progress = Progress( + TextColumn("[bold]progress[/bold]"), + BarColumn(bar_width=None), + TextColumn("{task.completed}/{task.total} metric calls"), + TextColumn("•"), + TimeElapsedColumn(), + console=self._console, + transient=False, + expand=True, + auto_refresh=False, + ) + total = header.budget_total if header.budget_total is not None else 100 + self._budget_task = self._progress.add_task("budget", total=total) + try: + self._progress.start() + self._progress.refresh() + except Exception: # pragma: no cover - Live region best-effort + self._progress = None + self._budget_task = None + + def baseline_evaluated( + self, + pass_rate: float, + metric_breakdown: dict[str, float], + *, + metric_thresholds: Optional[dict[str, float]] = None, + ) -> None: + from rich.table import Table + from rich import box + + thresholds = metric_thresholds or {} + self._console.print(f"[bold]baseline pass_rate = {pass_rate:.4f}[/bold]") + keys = _ordered_metric_keys(metric_breakdown, extra=list(thresholds.keys())) + if keys: + t = Table(box=box.SIMPLE, show_header=True, header_style="dim") + t.add_column("metric", no_wrap=True) + t.add_column("threshold", justify="right") + t.add_column("baseline", justify="right") + t.add_column("status", justify="right") + for name in keys: + score = metric_breakdown.get(name) + threshold = thresholds.get(name) + threshold_str = (f"{threshold:.4f}" if threshold is not None else "-") + score_str = (f"{score:.4f}" if score is not None else "-") + status = _baseline_metric_status(score, threshold, ascii_only=False) + color = ("green" if status == "PASS" else "red" if status == "FAIL" else "dim") + t.add_row( + name, + threshold_str, + score_str, + f"[{color}]{status}[/{color}]", + ) + self._console.print(t) + self._console.print("") + + def round_completed(self, view: RoundView) -> None: + if self._progress is not None and view.budget_used is not None: + try: + if self._budget_total is None: + # When no upper bound was set, grow the bar with 
usage. + self._progress.update( + self._budget_task, + completed=view.budget_used, + total=max(view.budget_used, 1), + ) + else: + self._progress.update( + self._budget_task, + completed=min(view.budget_used, self._budget_total), + ) + # Explicit refresh because ``auto_refresh=False`` keeps + # the Live region quiescent between events. + self._progress.refresh() + except Exception: # pragma: no cover + pass + + marker = _round_marker(view, ascii_only=False) + status_word = _round_status_word(view) + style = self._round_style(view) + head = (f"[{style}]{marker} round {view.round} {status_word}[/{style}]") + + segments: list[str] = [] + if view.train_minibatch_size > 0: + segments.append(f"train sample {view.train_minibatch_size}/{view.train_size}") + sample = _format_sample_score_segment(view, ascii_only=False) + if sample: + segments.append(sample) + if view.error_message: + segments.append(f"[red]message: {view.error_message}[/red]") + elif view.skip_reason: + segments.append(f"[dim]reason: {view.skip_reason}[/dim]") + elif view.val_pass_rate is not None: + segments.append(f"[green]valset pass_rate {view.val_pass_rate:.4f}[/green]") + evaluations = _format_evaluations_segment(view) + if evaluations: + segments.append(f"[dim]{evaluations}[/dim]") + body = " ".join(segments) + tail = f" [dim]{view.duration_seconds:.1f}s[/dim]" + self._console.print(f"{head} {body}{tail}") + + @staticmethod + def _round_style(view: RoundView) -> str: + """Return the Rich style string for the round marker.""" + if view.error_message: + return "bold red" + if view.skip_reason: + return "dim" + if view.accepted: + return "bold green" + return "yellow" + + def _stop_progress(self) -> None: + if self._progress is None: + return + try: + self._progress.stop() + except Exception: # pragma: no cover + pass + self._progress = None + self._budget_task = None + + def run_finished( + self, + result: "OptimizeResult", + *, + output_dir: str, + update_source: bool, + ) -> None: + from rich.panel 
import Panel + from rich.table import Table + from rich import box + + self._stop_progress() + + accepted = sum(1 for r in result.rounds if r.accepted) + sign = "+" if result.pass_rate_improvement >= 0 else "" + arrow = _improvement_arrow(result.pass_rate_improvement, ascii_only=False) + label = _format_improvement_label(result.pass_rate_improvement) + delta_color = ("green" + if result.pass_rate_improvement > 0 else "red" if result.pass_rate_improvement < 0 else "dim") + + table = Table.grid(padding=(0, 2)) + table.add_column(no_wrap=True, style="dim") + table.add_column(no_wrap=False) + rate_value = (f"{result.baseline_pass_rate:.4f} -> [bold]{result.best_pass_rate:.4f}[/bold] " + f"[{delta_color}]{arrow} {sign}{result.pass_rate_improvement:.4f}[/{delta_color}] " + f"[{delta_color}]({label})[/{delta_color}]") + table.add_row("pass_rate", rate_value) + table.add_row("rounds", f"{accepted} accepted / {result.total_rounds} total") + table.add_row("duration", f"{result.duration_seconds:.2f}s") + stop_text = _format_stop_reason_text(result.stop_reason) + if stop_text is not None: + table.add_row("stopped by", stop_text) + if result.status != "SUCCEEDED" and result.error_message: + table.add_row("error", f"[red]{result.error_message}[/red]") + update_msg = _AsciiReporter._format_update_source_line(result=result, + output_dir=output_dir, + update_source=update_source) + if update_msg: + table.add_row("update_source", update_msg.split(":", 1)[1].strip()) + table.add_row("artifacts", f"{output_dir}/") + for name, content in result.best_prompts.items(): + display = _truncate(name, _FIELD_NAME_DISPLAY_LIMIT) + table.add_row("", f"best_prompts/{display}.md [dim]({len(content)} chars)[/dim]") + table.add_row("", "result.json summary.txt rounds/ run.log") + + title_style = "bold green" if result.status == "SUCCEEDED" else "bold red" + panel = Panel( + table, + title=f"[{title_style}]Optimization complete · {result.status}[/{title_style}]", + box=box.ROUNDED, + padding=(0, 1), 
+ ) + self._console.print("") + self._console.print(panel) + + metric_keys = _ordered_metric_keys( + result.baseline_metric_breakdown, + result.best_metric_breakdown, + extra=list(result.metric_thresholds.keys()), + ) + if metric_keys: + mt = Table( + title="per-metric scores", + box=box.SIMPLE_HEAVY, + show_header=True, + header_style="bold", + title_style="dim", + ) + mt.add_column("metric", no_wrap=True) + mt.add_column("threshold", justify="right") + mt.add_column("baseline", justify="right") + mt.add_column("best", justify="right") + mt.add_column("delta", justify="right") + mt.add_column("status", justify="right") + for name in metric_keys: + base = result.baseline_metric_breakdown.get(name) + best = result.best_metric_breakdown.get(name) + threshold = result.metric_thresholds.get(name) + delta = (best or 0.0) - (base or 0.0) + d_color = ("green" if delta > 0 else "red" if delta < 0 else "dim") + d_arrow, d_text = _format_delta(delta, ascii_only=False) + base_str = _format_score(base) + best_str = _format_score(best) + threshold_str = (f"{threshold:.4f}" if threshold is not None else "-") + status = _baseline_metric_status(best, threshold, ascii_only=False) + status_color = ("green" if status == "PASS" else "red" if status == "FAIL" else "dim") + mt.add_row( + name, + threshold_str, + base_str, + best_str, + f"[{d_color}]{d_arrow} {d_text}[/{d_color}]", + f"[{status_color}]{status}[/{status_color}]", + ) + self._console.print(mt) + + def run_failed( + self, + *, + baseline_prompts: dict[str, str], + output_dir: str, + error_message: str, + ) -> None: + from rich.panel import Panel + from rich import box + + self._stop_progress() + + body = (f"[red]error :[/red] {error_message}\n" + f"output_dir : {output_dir}\n" + f"baseline preserved at {os.path.join(output_dir, 'baseline_prompts')}") + panel = Panel( + body, + title="[bold red]Optimization FAILED[/bold red]", + box=box.ROUNDED, + padding=(0, 1), + ) + self._console.print("") + self._console.print(panel) + + 
class _SilentGepaLogger:
    """gepa-LoggerProtocol-compatible sink used to suppress library logs.

    With ``verbose<=1`` every message is dropped; with ``verbose>=2`` messages
    are forwarded to the logger named by ``_GEPA_LOGGER_NAME`` at INFO level
    so callers can route them via the standard logging configuration.
    """

    def __init__(self, *, verbose: int) -> None:
        """Resolve the forwarding target once, based on ``verbose``."""
        self._verbose = verbose
        # ``None`` means "drop everything"; checked on every ``log`` call.
        self._target = logging.getLogger(_GEPA_LOGGER_NAME) if verbose >= 2 else None

    def log(self, message: str) -> None:
        """Forward ``message`` at INFO level, or discard it when silenced."""
        if self._target is not None:
            self._target.info("%s", message)


def create_reporter(
    *,
    verbose: int = 1,
    stream: Optional[TextIO] = None,
) -> OptimizeReporter:
    """Pick the appropriate reporter backend.

    Resolution order: ``verbose == 0`` returns :class:`_NullReporter`;
    otherwise the factory attempts to import ``rich`` and returns
    :class:`_RichReporter` on success or :class:`_AsciiReporter` on failure.
    Unknown ``verbose`` values are normalised to ``1``.

    Args:
        verbose: Verbosity level (0, 1 or 2).
        stream: Destination text stream. ``None`` resolves to the
            ``sys.stdout`` that is current when the factory is called; a
            ``sys.stdout`` default would be frozen at definition time and
            would ignore any redirection installed later.

    Returns:
        The reporter instance selected by the rules above.
    """
    if verbose == 0:
        return _NullReporter()
    if verbose not in (1, 2):
        verbose = 1
    if stream is None:
        stream = sys.stdout
    try:
        import rich  # noqa: F401
    except ImportError:
        return _AsciiReporter(stream=stream, verbose=verbose)
    return _RichReporter(stream=stream, verbose=verbose)
class RoundRecord(EvalBaseModel):
    """Per-round optimization record.

    Attributes:
        round: 1-based round index.
        optimized_field_names: Field names actually rewritten by the optimize model this round.
        candidate_prompts: Full candidate map for the round; reused fields carry the previous text.
        train_pass_rate: Currently always 0.0; see field description below.
        validation_pass_rate: Pass rate on the validation split.
        metric_breakdown: Mean score per metric on the validation split.
        accepted: True iff the candidate was accepted as new best.
        acceptance_reason: Human-readable reason for the acceptance decision.
        failed_case_ids: Eval case ids that failed the validation split this round.
        failed_cases_truncated: Number of failed cases dropped by token-budget truncation.
        per_field_diagnosis: Diagnosis text from the reflection LM, keyed by optimized field name.
        reflection_lm_calls: Number of reflection LM invocations this round (including retries).
        round_llm_cost: USD cost for this round (reflection LM + evaluator).
        round_token_usage: Token usage for this round; keys are "prompt", "completion", "total".
        started_at: ISO-8601 timestamp when the round started.
        duration_seconds: Wall-clock duration of the round in seconds.
        kind: Mutation kind for the round, ``"reflective"`` (default) or ``"merge"``.
        train_minibatch_size: Cases sampled from the training set this round; 0 when the
            round was skipped before sampling.
        train_subsample_parent_score: Parent candidate's score on the sampled minibatch;
            None when no subsample was produced.
        train_subsample_candidate_score: New candidate's score on the sampled minibatch;
            None when no candidate was evaluated.
        skip_reason: Human-readable reason set on skipped rounds; None otherwise.
        error_message: Error message when the round ended in an algorithm error.
        budget_used: Cumulative metric calls consumed across all rounds so far; None when
            the algorithm does not track a budget.
        budget_total: Configured budget cap; None means 'auto'.
        extras: Free-form business payload; the optimizer never reads or modifies it.
    """

    # --- Identity and candidate content (required fields) ---
    round: int = Field(description="1-based round index.")
    optimized_field_names: list[str] = Field(description="Field names rewritten by the optimize model this round.", )
    candidate_prompts: dict[str, str] = Field(description="Full candidate prompt map for the round.", )

    # --- Scores ---
    train_pass_rate: float = Field(
        default=0.0,
        description=("Currently always 0.0: gepa does not expose a full-train-set pass "
                     "rate (it only samples minibatches each round). Use "
                     "train_subsample_parent_score / train_subsample_candidate_score "
                     "for per-round minibatch metrics instead."),
    )
    validation_pass_rate: float = Field(description="Pass rate on the validation split.")
    metric_breakdown: dict[str, float] = Field(
        default_factory=dict,
        description=("Mean score per metric on the validation split. Empty when the "
                     "round was skipped before valset evaluation, or when the "
                     "evaluator did not expose per-metric scores."),
    )

    # --- Acceptance decision ---
    accepted: bool = Field(description="True iff the candidate was accepted as new best.")
    acceptance_reason: str = Field(default="", description="Human-readable acceptance reason.")

    # --- Failure diagnostics ---
    failed_case_ids: list[str] = Field(
        default_factory=list,
        description="Eval case ids that failed validation this round.",
    )
    failed_cases_truncated: int = Field(
        default=0,
        description="Number of failed cases dropped by token-budget truncation.",
    )
    per_field_diagnosis: dict[str, str] = Field(
        default_factory=dict,
        description="Diagnosis text from the reflection LM, keyed by optimized field name.",
    )
    reflection_lm_calls: int = Field(
        default=0,
        description="Number of reflection LM invocations this round (including retries).",
    )

    # --- Cost accounting ---
    round_llm_cost: float = Field(
        default=0.0,
        description="USD cost for this round (reflection LM + evaluator).",
    )
    # A factory is used so every record gets its own zeroed counter dict.
    round_token_usage: dict[str, int] = Field(
        default_factory=lambda: {
            "prompt": 0,
            "completion": 0,
            "total": 0
        },
        description='Token usage for this round; keys are "prompt", "completion", "total".',
    )

    # --- Timing (required fields) ---
    started_at: str = Field(description="ISO-8601 timestamp when the round started.")
    duration_seconds: float = Field(description="Wall-clock duration of the round in seconds.")

    # --- Round-line details; all optional with defaults ---
    kind: RoundKind = Field(
        default="reflective",
        description=("Mutation kind for this round: 'reflective' for the standard "
                     "reflective proposal step and 'merge' for system-aware merges."),
    )
    train_minibatch_size: int = Field(
        default=0,
        description=("Cases sampled from the training set this round. 0 when the round "
                     "skipped before sampling (e.g. 'no proposal')."),
    )
    train_subsample_parent_score: Optional[float] = Field(
        default=None,
        description=("Parent candidate's score on the sampled minibatch; None when no "
                     "subsample was produced."),
    )
    train_subsample_candidate_score: Optional[float] = Field(
        default=None,
        description=("New candidate's score on the sampled minibatch; None when no "
                     "candidate was evaluated."),
    )
    skip_reason: Optional[str] = Field(
        default=None,
        description=("Human-readable reason set on skipped rounds (e.g. "
                     "'subsample perfect', 'no proposal'). None when the round ran "
                     "normally or ended in an error."),
    )
    error_message: Optional[str] = Field(
        default=None,
        description="Error message when the round ended in an algorithm error.",
    )
    budget_used: Optional[int] = Field(
        default=None,
        description=("Cumulative metric calls consumed across all rounds so far. None "
                     "when the algorithm does not track a budget."),
    )
    budget_total: Optional[int] = Field(
        default=None,
        description="Configured budget cap (e.g. max_metric_calls); None means 'auto'.",
    )

    # --- Caller-owned payload ---
    extras: dict[str, Any] = Field(
        default_factory=dict,
        description="Free-form business payload; optimizer ignores it.",
    )
+ + Attributes: + schema_version: Result schema version; bumped on breaking layout changes. + algorithm: Algorithm name that produced this result. + status: Final run status. + finish_reason: Why the loop stopped. + error_message: Error message when status is FAILED. + baseline_pass_rate: Validation pass rate of the baseline prompts. + best_pass_rate: Validation pass rate of the best prompts. + pass_rate_improvement: best_pass_rate minus baseline_pass_rate. + baseline_metric_breakdown: Mean score per metric for the baseline. + best_metric_breakdown: Mean score per metric for the best prompts. + baseline_prompts: Initial prompt text keyed by TargetPrompt name. + best_prompts: Best prompt text keyed by TargetPrompt name. + total_rounds: Number of rounds executed. + rounds: Per-round records in order. + total_reflection_lm_calls: Total reflection LM invocations (including retries). + total_judge_model_calls: Currently always 0; see field description below. + total_llm_cost: USD cost across the whole run (reflection LM + evaluator). + total_token_usage: Token usage across the whole run; keys are "prompt", "completion", "total". + duration_seconds: Wall-clock duration of the whole run in seconds. + started_at: ISO-8601 timestamp when the run started. + finished_at: ISO-8601 timestamp when the run finished. + extras: Free-form business payload; the optimizer never reads or modifies it. + """ + + schema_version: str = Field(default="v1", description="Result schema version.") + algorithm: str = Field(description=("Algorithm name that produced this result; matches the registered key in " + "OPTIMIZER_REGISTRY (e.g. 
'gepa_reflective')."), ) + + status: RunStatus = Field(description="Final run status.") + finish_reason: FinishReason = Field(description="Why the loop stopped.") + stop_reason: Optional[StopReason] = Field( + default=None, + description=("Which stop policy ended the run: 'required_metrics_passing' when " + "the framework's per-metric threshold policy fired; " + "'budget_exhausted' on MaxMetricCallsStopper; 'no_improvement' on " + "NoImprovementStopper; 'timeout' on TimeoutStopCondition; " + "'score_threshold' on ScoreThresholdStopper; " + "'max_candidate_proposals' / 'max_tracked_candidates' on the " + "respective candidate caps; 'completed' when the GEPA loop ended " + "without any registered stopper firing. None on FAILED runs that " + "errored before any stopper ran."), + ) + error_message: str = Field(default="", description="Error message when status is FAILED.") + + baseline_pass_rate: float = Field(description="Baseline validation pass rate.") + best_pass_rate: float = Field(description="Best validation pass rate.") + pass_rate_improvement: float = Field(description="best_pass_rate minus baseline_pass_rate.") + + baseline_metric_breakdown: dict[str, float] = Field( + default_factory=dict, + description="Mean score per metric for the baseline.", + ) + best_metric_breakdown: dict[str, float] = Field( + default_factory=dict, + description="Mean score per metric for the best prompts.", + ) + metric_thresholds: dict[str, float] = Field( + default_factory=dict, + description=("PASS/FAIL threshold per metric, copied from evaluate.metrics[].threshold. " + "Lets reporters and summary.txt show baseline / best scores alongside " + "the per-metric threshold so users can see at a glance whether a metric " + "is now above or below its acceptance bar."), + ) + + per_metric_best_candidates: dict[str, list[int]] = Field( + default_factory=dict, + description=("Per-metric Pareto-best candidate indices reported by GEPA. 
Keyed by " + "metric name; the list contains 0-based indices into the candidate " + "trajectory. Empty when the underlying algorithm does not expose " + "per-objective fronts. Useful for diagnosing which candidate excels " + "on which metric independent of the aggregated best."), + ) + + baseline_prompts: dict[str, str] = Field( + default_factory=dict, + description="Initial prompt text keyed by TargetPrompt name.", + ) + best_prompts: dict[str, str] = Field( + default_factory=dict, + description="Best prompt text keyed by TargetPrompt name.", + ) + + total_rounds: int = Field(description="Number of rounds executed.") + rounds: list[RoundRecord] = Field( + default_factory=list, + description="Per-round records in order.", + ) + + total_reflection_lm_calls: int = Field(description="Total reflection LM invocations (including retries).", ) + total_judge_model_calls: int = Field( + default=0, + description=("Currently always 0: the evaluator does not surface per-judge " + "invocation counts. 
def dump_to(self, path: str) -> None:
    """Write this result to ``path`` as indented JSON.

    The model is serialized with ``model_dump_json(indent=2, by_alias=True)``
    before the file is opened, so a serialization error is raised before the
    target file is touched.

    Args:
        path: Destination file; opened in text mode with UTF-8 encoding.
    """
    serialized = self.model_dump_json(indent=2, by_alias=True)
    with open(path, "w", encoding="utf-8") as sink:
        sink.write(serialized)


@classmethod
def from_file(cls, path: str) -> "OptimizeResult":
    """Rebuild an OptimizeResult from a JSON file produced by ``dump_to``.

    Args:
        path: UTF-8 JSON file to read.

    Returns:
        The instance produced by ``cls.model_validate_json`` on the file text.
    """
    with open(path, "r", encoding="utf-8") as source:
        raw = source.read()
    return cls.model_validate_json(raw)


def format_summary(self, *, output_dir: str, update_source: bool) -> str:
    """Build the plain-text run summary that is stored as ``summary.txt``.

    The report always carries the status / algorithm header, the baseline and
    best pass rates with their delta, the accepted / total round counts, the
    duration, both timestamps, the ``update_source`` flag and the output
    directory. An error message is added for non-SUCCEEDED runs that have
    one, the stop reason when it is set, a per-metric table when any metric
    data exists, and a best-prompt inventory when best prompts exist.

    Args:
        output_dir: Directory shown as the location of the run artifacts.
        update_source: Rendered as ``true`` / ``false`` on its own row.

    Returns:
        The report as one string, rows joined by newlines, ending in a newline.
    """
    delta = self.pass_rate_improvement
    if delta > 0:
        verdict = "improved"
    elif delta < 0:
        verdict = "regressed"
    else:
        verdict = "no improvement"
    # Negative deltas already print their own minus sign.
    prefix = "+" if delta >= 0 else ""
    accepted_rounds = len([record for record in self.rounds if record.accepted])

    report: list[str] = []
    report.append(f"Optimization complete | status={self.status} | algorithm={self.algorithm}")
    report.append("")
    report.append(f"pass_rate : {self.baseline_pass_rate:.4f} -> {self.best_pass_rate:.4f}"
                  f" ({prefix}{delta:.4f}, {verdict})")
    report.append(f"rounds : {accepted_rounds} accepted / {self.total_rounds} total")
    report.append(f"duration : {self.duration_seconds:.2f}s")
    report.append(f"started_at : {self.started_at}")
    report.append(f"finished_at : {self.finished_at}")

    if self.status != "SUCCEEDED" and self.error_message:
        report.append(f"error_message : {self.error_message}")
    if self.stop_reason is not None:
        report.append(f"stop_reason : {self.stop_reason}")
    report.append(f"update_source : {'true' if update_source else 'false'}")
    report.append(f"output_dir : {output_dir}")

    baseline_scores = self.baseline_metric_breakdown
    best_scores = self.best_metric_breakdown
    thresholds = self.metric_thresholds
    if baseline_scores or best_scores or thresholds:
        report.append("")
        report.append("metric breakdown (threshold | baseline -> best):")
        # Union of every metric named by any of the three mappings, sorted.
        for metric in sorted(set(baseline_scores) | set(best_scores) | set(thresholds)):
            # A metric missing from one side is rendered as "nan".
            before = baseline_scores.get(metric, float("nan"))
            after = best_scores.get(metric, float("nan"))
            bar = f"{thresholds[metric]:.4f}" if metric in thresholds else " - "
            report.append(f" - {metric:<40s} threshold {bar} {before:.4f} -> {after:.4f}")

    if self.best_prompts:
        report.append("")
        report.append("best prompts:")
        for prompt_name, text in self.best_prompts.items():
            location = os.path.join("best_prompts", f"{prompt_name}.md")
            report.append(f" - {prompt_name:<40s} {len(text)} chars ({location})")

    report.append("")
    report.append(f"artifacts directory: {output_dir}")
    report.append(" result.json summary.txt rounds/ run.log "
                  "baseline_prompts/ best_prompts/ config.snapshot.json")
    return "\n".join(report) + "\n"
"""Multi-field prompt registry with atomic write_all for AgentOptimizer."""

from __future__ import annotations

import inspect
import os
from pathlib import Path
from typing import Awaitable
from typing import Callable
from typing import Optional

AsyncRead = Callable[[], Awaitable[str]]
AsyncWrite = Callable[[str], Awaitable[None]]


class _RollbackError(RuntimeError):
    """Aggregate error raised when one or more path-field rollbacks fail.

    Carries ``(field_name, error)`` pairs for every field whose rollback
    raised. The original ``write_all`` failure is attached as ``__cause__``
    (via ``raise ... from primary_err``) and, because the raise happens while
    that failure is being handled, as ``__context__`` too, so chained
    tracebacks surface both the root cause and every rollback failure.

    Private (underscore-prefixed) — users only observe it through the
    formatted message in tracebacks; never declared in the public API.

    Attributes:
        failures: The ``(field_name, error)`` pairs passed to the constructor.
    """

    def __init__(self, failures: list[tuple[str, BaseException]]) -> None:
        self.failures = failures
        details = "; ".join(f"{name}: {type(err).__name__}: {err}" for name, err in failures)
        super().__init__(f"TargetPrompt.write_all rollback failed for "
                         f"{len(failures)} field(s): {details}")


class _Source:
    """Base for a single registered prompt source."""

    # Empty slots on the base are required for the subclasses' __slots__ to
    # actually drop the per-instance __dict__.
    __slots__ = ()


class _PathSource(_Source):
    """File-backed prompt source: read/write a UTF-8 text file at a fixed path."""

    __slots__ = ("path", )

    def __init__(self, path: str) -> None:
        self.path = path


class _CallbackSource(_Source):
    """Callback-backed prompt source: caller-provided async read/write functions."""

    __slots__ = ("read_fn", "write_fn")

    def __init__(self, read_fn: AsyncRead, write_fn: AsyncWrite) -> None:
        self.read_fn = read_fn
        self.write_fn = write_fn


class TargetPrompt:
    """Registry of prompt fields to be optimized by AgentOptimizer.

    Each field is registered with a unique name and one of two source forms:
      - add_path(name, path): file-backed source; framework reads/writes the file
      - add_callback(name, read=, write=): caller-backed source with async functions

    Typical use:
        target = (
            TargetPrompt()
            .add_path("system_prompt", "my_pkg/system.md")
            .add_callback("retriever", read=load_fn, write=save_fn)
        )

    read_all / write_all operate on every registered field. write_all is atomic
    for path-backed fields (tmp file + os.replace, rollback on partial failure);
    callback-backed atomicity is the caller's responsibility.
    """

    def __init__(self) -> None:
        # Insertion-ordered: names(), read_all() and write_all() follow registration order.
        self._sources: dict[str, _Source] = {}

    def add_path(self, name: str, path: str) -> "TargetPrompt":
        """Register a file-backed prompt field.

        Args:
            name: Unique field name.
            path: Location of the UTF-8 text file backing the field.

        Returns:
            ``self``, so registrations can be chained.

        Raises:
            ValueError: If ``name`` is already registered.
        """
        self._reject_duplicate(name)
        self._sources[name] = _PathSource(path)
        return self

    def add_callback(
        self,
        name: str,
        *,
        read: AsyncRead,
        write: AsyncWrite,
    ) -> "TargetPrompt":
        """Register a callback-backed prompt field with async read / write functions.

        Args:
            name: Unique field name.
            read: Async function returning the current prompt text.
            write: Async function accepting the new prompt text.

        Returns:
            ``self``, so registrations can be chained.

        Raises:
            ValueError: If ``name`` is already registered.
            TypeError: If ``read`` or ``write`` is not a coroutine function.
        """
        self._reject_duplicate(name)
        if not inspect.iscoroutinefunction(read):
            raise TypeError(f"add_callback {name!r}: read must be an async function")
        if not inspect.iscoroutinefunction(write):
            raise TypeError(f"add_callback {name!r}: write must be an async function")
        self._sources[name] = _CallbackSource(read, write)
        return self

    def names(self) -> list[str]:
        """Return registered field names in insertion order."""
        return list(self._sources)

    def describe_source(self, name: str) -> str:
        """Human-readable source label for a field.

        Path-backed fields return the file path verbatim; callback-backed
        fields return the literal ``""``. Raises KeyError if name
        is unknown. Used by the optimizer reporter header.
        """
        src = self._sources[name]
        if isinstance(src, _PathSource):
            return src.path
        # NOTE(review): callback-backed fields get an empty label — confirm
        # this is the intended reporter output.
        return ""

    async def read(self, name: str) -> str:
        """Read the value of a single registered field. Raises KeyError if name is unknown."""
        src = self._sources[name]
        return await self._read_one(src)

    async def read_all(self) -> dict[str, str]:
        """Read every registered field. Propagates underlying errors (FileNotFoundError / callback exceptions)."""
        out: dict[str, str] = {}
        for name, src in self._sources.items():
            out[name] = await self._read_one(src)
        return out

    async def write_all(self, prompts: dict[str, str]) -> None:
        """Atomically write all registered fields. Keys must exactly match registered names.

        Atomicity contract:
          - Path fields: write to {path}.tmp, then os.replace (single-file POSIX-atomic rename).
          - On any path write failure: already-renamed paths are rolled back to pre-call content,
            residual .tmp files are removed, and the original exception propagates. Rollback uses
            the same tmp + os.replace primitive, so an interrupted rollback cannot leave a path
            field half-written.
          - If rollback of any field also fails, a single ``_RollbackError`` listing every
            per-field rollback failure propagates, with the original exception attached as
            its ``__cause__``. Rollback is best-effort: a failure on one field does not skip
            the remaining fields.
          - Removal of residual .tmp files is best-effort as well and never raises, so it can
            mask neither the original exception nor the collected rollback failures.
          - Callback fields: invoked sequentially after every path write succeeds. A callback
            failure rolls back path fields to the pre-call snapshot before propagating; callback
            fields themselves are not rolled back (caller-owned idempotency).

        Raises:
            ValueError: If the keys of ``prompts`` differ from the registered names; nothing
                is written in that case.
            _RollbackError: If the write failed and at least one rollback failed too.
        """
        if set(prompts.keys()) != set(self._sources.keys()):
            raise ValueError(f"TargetPrompt.write_all: prompts keys mismatch; "
                             f"expected {sorted(self._sources.keys())}, got {sorted(prompts.keys())}")

        path_backups = self._snapshot_path_contents()
        written_paths: list[str] = []
        try:
            for name, src in self._sources.items():
                if isinstance(src, _PathSource):
                    self._atomic_write_path(src.path, prompts[name])
                    written_paths.append(name)
            for name, src in self._sources.items():
                if isinstance(src, _CallbackSource):
                    await src.write_fn(prompts[name])
        except BaseException as primary_err:
            rollback_failures = self._rollback_paths(written_paths, path_backups)
            self._cleanup_tmp_files()
            if rollback_failures:
                raise _RollbackError(rollback_failures) from primary_err
            raise

    def _reject_duplicate(self, name: str) -> None:
        """Raise ValueError when ``name`` is already registered."""
        if name in self._sources:
            raise ValueError(f"TargetPrompt: name {name!r} already registered")

    def _snapshot_path_contents(self) -> dict[str, Optional[str]]:
        """Capture pre-call content of every path-backed field (None if source did not exist)."""
        snapshot: dict[str, Optional[str]] = {}
        for name, src in self._sources.items():
            if isinstance(src, _PathSource):
                try:
                    snapshot[name] = Path(src.path).read_text(encoding="utf-8")
                except FileNotFoundError:
                    snapshot[name] = None
        return snapshot

    def _rollback_paths(
        self,
        written: list[str],
        backups: dict[str, Optional[str]],
    ) -> list[tuple[str, BaseException]]:
        """Best-effort atomic rollback of every successfully written path field.

        For each field in ``written`` whose source did not exist before
        write_all (``backups[name] is None``) the file is unlinked; for
        fields that had pre-call content the content is restored via
        ``_atomic_write_path`` (tmp + os.replace), so an interrupted
        rollback cannot leave a path field half-written.

        Failures are collected and returned rather than raised, so a
        single field's failure does not skip subsequent fields. The
        caller wraps the collected failures into ``_RollbackError``.
        """
        failures: list[tuple[str, BaseException]] = []
        for name in written:
            src = self._sources[name]
            if not isinstance(src, _PathSource):
                continue
            backup = backups.get(name)
            try:
                if backup is None:
                    try:
                        os.unlink(src.path)
                    except FileNotFoundError:
                        pass
                else:
                    self._atomic_write_path(src.path, backup)
            except BaseException as err:
                failures.append((name, err))
        return failures

    def _cleanup_tmp_files(self) -> None:
        """Remove the ``.tmp`` sibling of every path-backed field, ignoring OS errors.

        Runs inside ``write_all``'s error handler, so it must not raise: an
        exception here would replace the error that is being reported.
        """
        for src in self._sources.values():
            if isinstance(src, _PathSource):
                self._remove_quietly(src.path + ".tmp")

    @staticmethod
    def _remove_quietly(path: str) -> None:
        """Unlink ``path`` as best-effort cleanup; a missing or unremovable entry is ignored."""
        try:
            os.unlink(path)
        except OSError:
            # Deliberate: cleanup is secondary to the error already in flight.
            pass

    @staticmethod
    def _atomic_write_path(path: str, content: str) -> None:
        """Write ``content`` to ``path`` via a ``.tmp`` sibling and ``os.replace``.

        If writing or renaming fails, the temporary file is removed
        (best-effort) before the exception propagates.
        """
        tmp = path + ".tmp"
        try:
            Path(tmp).write_text(content, encoding="utf-8")
            os.replace(tmp, path)
        except BaseException:
            TargetPrompt._remove_quietly(tmp)
            raise

    async def _read_one(self, src: _Source) -> str:
        """Read one source: file text for path sources, ``read_fn()`` for callback sources."""
        if isinstance(src, _PathSource):
            return Path(src.path).read_text(encoding="utf-8")
        if isinstance(src, _CallbackSource):
            return await src.read_fn()
        raise TypeError(f"unknown source type: {type(src).__name__}")