# 第80天:个人助理Agent - 高级功能开发
学习目标
- 掌握多模态交互
- 学习自主决策机制
- 理解自我优化实现
- 掌握协作Agent实现
- 学习异常处理机制
多模态交互
语音交互
python
import asyncio
import json
from datetime import datetime
from typing import Dict, List, Optional

import pyttsx3
import speech_recognition as sr
class VoiceInteraction:
    """Voice I/O helper: speech-to-text via Google recognition, text-to-speech via pyttsx3."""

    def __init__(self):
        self.recognizer = sr.Recognizer()
        self.engine = pyttsx3.init()
        # Moderate speaking rate, full volume.
        self.engine.setProperty('rate', 150)
        self.engine.setProperty('volume', 1.0)

    async def listen(self) -> Optional[str]:
        """Capture one utterance from the microphone and transcribe it.

        Returns the recognized text, or None on timeout, unintelligible audio,
        or recognition-service failure.
        """
        with sr.Microphone() as source:
            # Calibrate against ambient noise for one second before listening.
            self.recognizer.adjust_for_ambient_noise(source, duration=1)
            try:
                audio = self.recognizer.listen(source, timeout=5, phrase_time_limit=10)
                return self.recognizer.recognize_google(audio, language='zh-CN')
            except (sr.WaitTimeoutError, sr.UnknownValueError, sr.RequestError):
                # All three failure modes are treated identically as "no input".
                return None

    async def speak(self, text: str):
        """Speak *text* aloud; runAndWait blocks until playback completes."""
        self.engine.say(text)
        self.engine.runAndWait()

    async def listen_and_respond(self, agent) -> Dict:
        """One full round trip: listen, hand the text to *agent*, speak the reply."""
        heard = await self.listen()
        if not heard:
            return {
                "status": "no_input",
                "message": "未检测到语音输入"
            }
        reply = await agent.process_input(heard)
        if reply.get("success"):
            await self.speak(reply["response"])
        return {
            "status": "success",
            "user_input": heard,
            "response": reply
        }

# Next section: Image interaction (图像交互)
python
from PIL import Image
import base64
from typing import Dict, Optional
import io
class ImageInteraction:
    """Image understanding (GPT-4o vision) and image generation (DALL-E 3) helpers."""

    def __init__(self, llm_client):
        self.llm_client = llm_client

    async def analyze_image(
        self,
        image_path: str,
        query: str
    ) -> Dict:
        """Answer *query* about the image at *image_path*.

        Returns {"success": True, "analysis": ...} or {"success": False, "error": ...}.
        """
        try:
            with open(image_path, 'rb') as image_file:
                raw_bytes = image_file.read()
            encoded = base64.b64encode(raw_bytes).decode('utf-8')
            # Vision request: one text part plus the image as a base64 data URL.
            user_content = [
                {
                    "type": "text",
                    "text": query
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{encoded}"
                    }
                }
            ]
            response = self.llm_client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": user_content}],
                max_tokens=1000
            )
            return {
                "success": True,
                "analysis": response.choices[0].message.content
            }
        except Exception as e:
            return {
                "success": False,
                "error": str(e)
            }

    async def generate_image(
        self,
        prompt: str,
        size: str = "1024x1024"
    ) -> Dict:
        """Generate one image with DALL-E 3 and return its URL."""
        try:
            response = self.llm_client.images.generate(
                model="dall-e-3",
                prompt=prompt,
                size=size,
                n=1
            )
            return {
                "success": True,
                "image_url": response.data[0].url
            }
        except Exception as e:
            return {
                "success": False,
                "error": str(e)
            }

    async def process_image_with_agent(
        self,
        image_path: str,
        query: str,
        agent
    ) -> Dict:
        """Analyze the image, then let *agent* answer *query* using the analysis."""
        analysis = await self.analyze_image(image_path, query)
        if not analysis.get("success"):
            # Propagate the analysis failure unchanged.
            return analysis
        agent_input = f"""图像分析结果:
{analysis['analysis']}
请根据以上分析结果回答用户的问题:{query}"""
        agent_reply = await agent.process_input(agent_input)
        return {
            "success": True,
            "image_analysis": analysis["analysis"],
            "agent_response": agent_reply
        }

# Next section: Multimodal coordinator (多模态协调器)
python
class MultimodalCoordinator:
    """Routes voice / text / image / combined inputs to the matching handler."""

    def __init__(
        self,
        voice_interaction: "VoiceInteraction",
        image_interaction: "ImageInteraction",
        agent
    ):
        self.voice_interaction = voice_interaction
        self.image_interaction = image_interaction
        self.agent = agent

    async def process_multimodal_input(
        self,
        input_type: str,
        data: Dict
    ) -> Dict:
        """Dispatch on *input_type*; unknown types produce a failure dict."""
        handlers = {
            "voice": self._process_voice_input,
            "text": self._process_text_input,
            "image": self._process_image_input,
            "multimodal": self._process_combined_input,
        }
        handler = handlers.get(input_type)
        if handler is None:
            return {
                "success": False,
                "error": f"不支持的输入类型: {input_type}"
            }
        return await handler(data)

    async def _process_voice_input(self, data: Dict) -> Dict:
        # Voice path ignores *data*: input comes from the microphone.
        return await self.voice_interaction.listen_and_respond(self.agent)

    async def _process_text_input(self, data: Dict) -> Dict:
        return await self.agent.process_input(data.get("text", ""))

    async def _process_image_input(self, data: Dict) -> Dict:
        return await self.image_interaction.process_image_with_agent(
            data.get("image_path"),
            data.get("query", "请描述这张图片"),
            self.agent
        )

    async def _process_combined_input(self, data: Dict) -> Dict:
        """Image + text: prepend the image analysis to the user's question."""
        text = data.get("text", "")
        image_path = data.get("image_path")
        if image_path:
            image_analysis = await self.image_interaction.analyze_image(
                image_path,
                "请详细描述这张图片的内容"
            )
            if image_analysis.get("success"):
                combined_input = f"""图片分析:
{image_analysis['analysis']}
用户问题:{text}"""
                agent_reply = await self.agent.process_input(combined_input)
                return {
                    "success": True,
                    "image_analysis": image_analysis["analysis"],
                    "agent_response": agent_reply
                }
        # No image (or analysis failed): fall back to plain text handling.
        return await self.agent.process_input(text)

# Next section: Autonomous decision-making (自主决策机制)
决策引擎
python
class DecisionEngine:
    """LLM-backed decision maker: picks among options, records each decision,
    and can later evaluate a decision's outcome by id."""

    def __init__(self, llm_client):
        self.llm_client = llm_client
        # Entries: {"context", "options", "decision", "timestamp"[, "evaluation"]}.
        self.decision_history = []

    async def make_decision(
        self,
        context: Dict,
        options: List[Dict],
        criteria: Optional[List[str]] = None
    ) -> Dict:
        """Ask the LLM to choose the best option.

        Returns the parsed decision dict (guaranteed to carry an "id" so it can
        be found again by evaluate_decision), or {"success": False, "error": ...}.
        """
        prompt = self._build_decision_prompt(context, options, criteria)
        try:
            completion = self.llm_client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "你是一个专业的决策引擎"},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                response_format={"type": "json_object"}
            )
            import json
            decision = json.loads(completion.choices[0].message.content)
            # BUG FIX: decisions were stored without any "id", so
            # _find_decision_by_id() could never match one and
            # evaluate_decision() always reported "决策不存在".
            # Assign a stable sequential id before storing/returning.
            decision.setdefault("id", f"decision_{len(self.decision_history) + 1}")
            self.decision_history.append({
                "context": context,
                "options": options,
                "decision": decision,
                "timestamp": datetime.now().isoformat()
            })
            return decision
        except Exception as e:
            return {
                "success": False,
                "error": str(e)
            }

    def _build_decision_prompt(
        self,
        context: Dict,
        options: List[Dict],
        criteria: Optional[List[str]]
    ) -> str:
        """Compose the decision request shown to the LLM."""
        prompt = f"""请根据以下信息做出最佳决策:
上下文:
{self._format_context(context)}
可选方案:
{self._format_options(options)}
决策标准:
{criteria if criteria else '根据上下文选择最佳方案'}
请返回JSON格式的决策:
{{
"selected_option": "选择的方案ID",
"reasoning": "决策理由",
"confidence": "置信度(0-1)",
"alternatives": ["备选方案ID列表"]
}}"""
        return prompt

    def _format_context(self, context: Dict) -> str:
        """Render context as '- key: value' lines."""
        return "\n".join([
            f"- {key}: {value}"
            for key, value in context.items()
        ])

    def _format_options(self, options: List[Dict]) -> str:
        """Render options as '- id: name - description' lines."""
        return "\n".join([
            f"- {opt['id']}: {opt.get('name', '')} - {opt.get('description', '')}"
            for opt in options
        ])

    async def evaluate_decision(
        self,
        decision_id: str,
        outcome: Dict
    ) -> Dict:
        """Score a past decision (by its "id") against its actual *outcome*."""
        decision = self._find_decision_by_id(decision_id)
        if not decision:
            return {
                "success": False,
                "error": "决策不存在"
            }
        evaluation = await self._analyze_outcome(decision, outcome)
        decision["evaluation"] = evaluation
        return {
            "success": True,
            "evaluation": evaluation
        }

    def _find_decision_by_id(self, decision_id: str) -> Optional[Dict]:
        """Most-recent-first lookup of a stored decision record."""
        for decision in reversed(self.decision_history):
            if decision.get("decision", {}).get("id") == decision_id:
                return decision
        return None

    async def _analyze_outcome(
        self,
        decision: Dict,
        outcome: Dict
    ) -> Dict:
        """LLM-score the decision vs. outcome; on failure return an error dict."""
        prompt = f"""请评估以下决策的结果:
原始决策:
{decision['decision']}
实际结果:
{outcome}
请返回JSON格式的评估:
{{
"success": true/false,
"score": "评分(0-1)",
"lessons": "经验教训"
}}"""
        try:
            completion = self.llm_client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "你是一个专业的决策评估器"},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                response_format={"type": "json_object"}
            )
            import json
            return json.loads(completion.choices[0].message.content)
        except Exception as e:
            return {
                "success": False,
                "error": str(e)
            }

# Next section: Autonomous planning (自主规划)
python
class AutonomousPlanner:
    """Creates and adapts multi-step execution plans via the LLM."""

    def __init__(self, llm_client):
        self.llm_client = llm_client
        # Entries: {"plan", "status", "created_at"[, "adapted_at"]}.
        self.plans = []

    def _request_json(self, system_prompt: str, user_prompt: str) -> Dict:
        """Shared LLM call: JSON-mode completion parsed into a dict. Raises on failure."""
        completion = self.llm_client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.3,
            response_format={"type": "json_object"}
        )
        import json
        return json.loads(completion.choices[0].message.content)

    async def create_autonomous_plan(
        self,
        goal: str,
        constraints: Optional[Dict] = None,
        resources: Optional[Dict] = None
    ) -> Dict:
        """Build a step-by-step plan for *goal*; stores it and returns the plan dict."""
        prompt = f"""请为以下目标创建自主执行计划:
目标:{goal}
约束条件:
{constraints if constraints else '无'}
可用资源:
{resources if resources else '无'}
请返回JSON格式的计划:
{{
"plan_id": "计划ID",
"goal": "目标",
"steps": [
{{
"step": 1,
"action": "执行动作",
"description": "步骤描述",
"tool": "使用的工具",
"parameters": "工具参数",
"expected_result": "预期结果",
"dependencies": ["依赖的步骤ID"]
}}
],
"estimated_time": "预估总时间",
"risks": ["潜在风险"],
"mitigations": ["风险缓解措施"]
}}"""
        try:
            new_plan = self._request_json("你是一个专业的自主规划器", prompt)
            self.plans.append({
                "plan": new_plan,
                "status": "created",
                "created_at": datetime.now().isoformat()
            })
            return new_plan
        except Exception as e:
            return {
                "success": False,
                "error": str(e)
            }

    async def adapt_plan(
        self,
        plan_id: str,
        new_context: Dict
    ) -> Dict:
        """Revise a stored plan for *new_context*; updates the record in place."""
        record = self._find_plan_by_id(plan_id)
        if not record:
            return {
                "success": False,
                "error": "计划不存在"
            }
        prompt = f"""请根据新的上下文调整以下计划:
原计划:
{record['plan']}
新上下文:
{new_context}
请返回JSON格式的调整后计划:
{{
"plan_id": "计划ID",
"steps": ["调整后的步骤"],
"changes": ["所做的变更"],
"reasoning": "调整理由"
}}"""
        try:
            revised = self._request_json("你是一个专业的计划调整器", prompt)
            record["plan"] = revised
            record["status"] = "adapted"
            record["adapted_at"] = datetime.now().isoformat()
            return revised
        except Exception as e:
            return {
                "success": False,
                "error": str(e)
            }

    def _find_plan_by_id(self, plan_id: str) -> Optional[Dict]:
        """Most-recent-first lookup of a stored plan record by its plan_id."""
        for record in reversed(self.plans):
            if record["plan"].get("plan_id") == plan_id:
                return record
        return None

# Next section: Self-optimization (自我优化实现)
学习引擎
python
class LearningEngine:
    """Accumulates experiences and distills them into behavioral patterns via the LLM."""

    def __init__(self, llm_client):
        self.llm_client = llm_client
        self.experiences = []   # raw experience dicts, each stamped on arrival
        self.patterns = {}      # latest extracted patterns (empty until enough data)

    def _request_json(self, system_prompt: str, user_prompt: str) -> Dict:
        """Shared LLM call: JSON-mode completion parsed into a dict. Raises on failure."""
        completion = self.llm_client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.3,
            response_format={"type": "json_object"}
        )
        import json
        return json.loads(completion.choices[0].message.content)

    async def learn_from_experience(
        self,
        experience: Dict
    ) -> Dict:
        """Record one experience and refresh the extracted patterns."""
        self.experiences.append({
            **experience,
            "timestamp": datetime.now().isoformat()
        })
        self.patterns = await self._extract_patterns(self.experiences)
        return {
            "success": True,
            "patterns": self.patterns,
            "experience_count": len(self.experiences)
        }

    async def _extract_patterns(
        self,
        experiences: List[Dict]
    ) -> Dict:
        """Distill patterns from the last 50 experiences; keep old patterns on failure
        or when fewer than 10 experiences are available."""
        if len(experiences) < 10:
            return self.patterns
        prompt = f"""请从以下经验中提取学习模式:
经验:
{self._format_experiences(experiences[-50:])}
请返回JSON格式的模式:
{{
"user_preferences": {{
"communication_style": "沟通风格",
"response_length": "回复长度偏好",
"tone": "语气偏好"
}},
"task_patterns": [
{{
"task_type": "任务类型",
"preferred_approach": "偏好方法",
"success_rate": "成功率"
}}
],
"common_issues": [
{{
"issue": "常见问题",
"solution": "解决方案",
"frequency": "频率"
}}
]
}}"""
        try:
            return self._request_json("你是一个专业的模式提取器", prompt)
        except Exception:
            return self.patterns

    def _format_experiences(self, experiences: List[Dict]) -> str:
        """Render experiences as '- task: outcome (timestamp)' lines."""
        return "\n".join([
            f"- {exp.get('task', '')}: {exp.get('outcome', '')} ({exp.get('timestamp', '')})"
            for exp in experiences
        ])

    async def optimize_behavior(
        self,
        current_behavior: Dict
    ) -> Dict:
        """Suggest behavior tweaks from learned patterns; echo input back on failure."""
        prompt = f"""请根据学习到的模式优化以下行为:
当前行为:
{current_behavior}
学习到的模式:
{self.patterns}
请返回JSON格式的优化建议:
{{
"optimized_behavior": "优化后的行为",
"changes": ["所做的变更"],
"reasoning": "优化理由",
"expected_improvement": "预期改进"
}}"""
        try:
            return self._request_json("你是一个专业的行为优化器", prompt)
        except Exception:
            return current_behavior

# Next section: Feedback learning (反馈学习)
python
class FeedbackLearner:
    """Collects user feedback, analyzes it, and derives improvement plans."""

    def __init__(self, llm_client):
        self.llm_client = llm_client
        self.feedback_history = []  # {"action", "feedback", "rating", "timestamp"}

    def _request_json(self, system_prompt: str, user_prompt: str) -> Dict:
        """Shared LLM call: JSON-mode completion parsed into a dict. Raises on failure."""
        completion = self.llm_client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.3,
            response_format={"type": "json_object"}
        )
        import json
        return json.loads(completion.choices[0].message.content)

    async def process_feedback(
        self,
        action: str,
        feedback: str,
        rating: Optional[int] = None
    ) -> Dict:
        """Record one piece of feedback and return its analysis."""
        entry = {
            "action": action,
            "feedback": feedback,
            "rating": rating,
            "timestamp": datetime.now().isoformat()
        }
        self.feedback_history.append(entry)
        return {
            "success": True,
            "analysis": await self._analyze_feedback(entry),
            "feedback_count": len(self.feedback_history)
        }

    async def _analyze_feedback(
        self,
        feedback_entry: Dict
    ) -> Dict:
        """Analyze the new entry in light of the last 20; neutral fallback on failure."""
        recent = self.feedback_history[-20:]
        prompt = f"""请分析以下反馈:
当前反馈:
{feedback_entry}
最近反馈:
{self._format_feedback(recent)}
请返回JSON格式的分析:
{{
"sentiment": "情感倾向(positive/negative/neutral)",
"key_issues": ["关键问题"],
"suggestions": ["改进建议"],
"priority": "优先级(high/medium/low)"
}}"""
        try:
            return self._request_json("你是一个专业的反馈分析器", prompt)
        except Exception:
            return {
                "sentiment": "neutral",
                "key_issues": [],
                "suggestions": [],
                "priority": "low"
            }

    def _format_feedback(self, feedback_list: List[Dict]) -> str:
        """Render feedback entries as '- action: feedback (评分: rating)' lines."""
        return "\n".join([
            f"- {fb.get('action', '')}: {fb.get('feedback', '')} (评分: {fb.get('rating', 'N/A')})"
            for fb in feedback_list
        ])

    async def get_improvement_plan(self) -> Dict:
        """Derive an improvement plan from the last 50 feedback entries."""
        prompt = f"""请根据反馈历史制定改进计划:
反馈历史:
{self._format_feedback(self.feedback_history[-50:])}
请返回JSON格式的改进计划:
{{
"priority_improvements": [
{{
"area": "改进领域",
"action": "改进措施",
"expected_impact": "预期影响"
}}
],
"overall_sentiment": "整体情感",
"trend": "趋势(improving/stable/declining)"
}}"""
        try:
            return self._request_json("你是一个专业的改进计划制定器", prompt)
        except Exception:
            return {
                "priority_improvements": [],
                "overall_sentiment": "neutral",
                "trend": "stable"
            }

# Next section: Collaborative agents (协作Agent实现)
Agent协作框架
python
class AgentCollaboration:
    """Registry of specialist agents plus keyword-based selection and fan-out execution."""

    def __init__(self):
        # agent_id -> {"agent", "capabilities", "status"}
        self.agents = {}
        self.collaboration_history = []

    def register_agent(
        self,
        agent_id: str,
        agent: "BaseAgent",  # BUG FIX: quoted forward reference — BaseAgent is not
                             # imported here, so a bare annotation raised NameError
                             # the moment this class was defined.
        capabilities: List[str]
    ):
        """Register *agent* under *agent_id* with its capability keywords."""
        self.agents[agent_id] = {
            "agent": agent,
            "capabilities": capabilities,
            "status": "available"
        }

    async def collaborate(
        self,
        task: str,
        context: Optional[Dict] = None
    ) -> Dict:
        """Select capable agents, run them all on *task*, record and return the results."""
        selected_agents = await self._select_agents(task)
        if not selected_agents:
            return {
                "success": False,
                "error": "没有可用的Agent"
            }
        collaboration_result = await self._execute_collaboration(
            task,
            selected_agents,
            context
        )
        self.collaboration_history.append({
            "task": task,
            "agents": selected_agents,
            "result": collaboration_result,
            "timestamp": datetime.now().isoformat()
        })
        return collaboration_result

    async def _select_agents(
        self,
        task: str
    ) -> List[str]:
        """Pick up to 3 available agents whose capabilities match the task keywords."""
        selected = []
        for agent_id, agent_info in self.agents.items():
            if agent_info["status"] != "available":
                continue
            if self._is_capable(task, agent_info["capabilities"]):
                selected.append(agent_id)
        return selected[:3]

    def _is_capable(self, task: str, capabilities: List[str]) -> bool:
        """Crude matching: any task word appearing inside any capability string."""
        task_keywords = task.lower().split()
        for capability in capabilities:
            if any(
                keyword in capability.lower()
                for keyword in task_keywords
            ):
                return True
        return False

    async def _execute_collaboration(
        self,
        task: str,
        agent_ids: List[str],
        context: Optional[Dict]
    ) -> Dict:
        """Run each selected agent sequentially; collect per-agent results.

        "best_result" is simply the first successful result (max over a bool key);
        overall success means at least one agent succeeded.
        """
        results = []
        for agent_id in agent_ids:
            agent = self.agents[agent_id]["agent"]
            try:
                result = await agent.process_input(task, context)
                results.append({
                    "agent_id": agent_id,
                    "result": result,
                    "success": result.get("success", False)
                })
            except Exception as e:
                results.append({
                    "agent_id": agent_id,
                    "error": str(e),
                    "success": False
                })
        return {
            "success": any(r["success"] for r in results),
            "results": results,
            "best_result": max(
                results,
                key=lambda r: r.get("success", False)
            )
        }

# Next section: Agent coordinator (Agent协调器)
python
class AgentCoordinator:
    """Plans whether a task needs one agent or a collaboration, then executes it."""

    def __init__(
        self,
        collaboration: "AgentCollaboration",
        llm_client
    ):
        self.collaboration = collaboration
        self.llm_client = llm_client

    async def coordinate_task(
        self,
        task: str,
        context: Optional[Dict] = None
    ) -> Dict:
        """Route *task* per the coordination plan: multi-agent collaboration
        unless the plan marks it single-agent."""
        plan = await self._create_coordination_plan(task, context)
        if not plan.get("single_agent"):
            return await self.collaboration.collaborate(task, context)
        primary = self.collaboration.agents[plan["primary_agent"]]["agent"]
        return await primary.process_input(task, context)

    async def _create_coordination_plan(
        self,
        task: str,
        context: Optional[Dict]
    ) -> Dict:
        """Ask the LLM for a coordination plan over the available agents.

        On any failure, falls back to single-agent mode with the first registered
        agent. NOTE(review): the fallback indexes the first agent and would raise
        IndexError if no agents are registered — confirm callers guarantee one.
        """
        available_agents = [
            {
                "id": agent_id,
                "capabilities": agent_info["capabilities"]
            }
            for agent_id, agent_info in self.collaboration.agents.items()
            if agent_info["status"] == "available"
        ]
        prompt = f"""请为以下任务制定Agent协调计划:
任务:{task}
上下文:
{context if context else '无'}
可用Agent:
{self._format_agents(available_agents)}
请返回JSON格式的计划:
{{
"single_agent": true/false,
"primary_agent": "主要Agent ID",
"collaboration_agents": ["协作Agent ID列表"],
"coordination_strategy": "协调策略",
"reasoning": "计划理由"
}}"""
        try:
            completion = self.llm_client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "你是一个专业的Agent协调器"},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                response_format={"type": "json_object"}
            )
            import json
            return json.loads(completion.choices[0].message.content)
        except Exception:
            return {
                "single_agent": True,
                "primary_agent": list(self.collaboration.agents.keys())[0],
                "collaboration_agents": [],
                "coordination_strategy": "simple",
                "reasoning": "默认使用第一个Agent"
            }

    def _format_agents(self, agents: List[Dict]) -> str:
        """Render agents as '- id: cap1, cap2' lines."""
        return "\n".join([
            f"- {agent['id']}: {', '.join(agent['capabilities'])}"
            for agent in agents
        ])

# Next section: Exception handling (异常处理机制)
错误处理器
python
class ErrorHandler:
    """Records runtime errors, proposes recovery strategies, and mines error history."""

    def __init__(self, llm_client):
        self.llm_client = llm_client
        self.error_history = []      # {"error_type", "error_message", "context", "timestamp"}
        self.recovery_strategies = {}  # reserved; not populated by this class yet

    def _request_json(self, system_prompt: str, user_prompt: str) -> Dict:
        """Shared LLM call: JSON-mode completion parsed into a dict. Raises on failure."""
        completion = self.llm_client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.3,
            response_format={"type": "json_object"}
        )
        import json
        return json.loads(completion.choices[0].message.content)

    async def handle_error(
        self,
        error: Exception,
        context: Dict
    ) -> Dict:
        """Log *error* with *context* and attach a recovery strategy."""
        error_info = {
            "error_type": type(error).__name__,
            "error_message": str(error),
            "context": context,
            "timestamp": datetime.now().isoformat()
        }
        self.error_history.append(error_info)
        return {
            "success": False,
            "error": error_info,
            "recovery_strategy": await self._determine_recovery_strategy(error_info)
        }

    async def _determine_recovery_strategy(
        self,
        error_info: Dict
    ) -> Dict:
        """Ask the LLM for a recovery strategy; generic retry fallback on failure."""
        prompt = f"""请为以下错误确定恢复策略:
错误信息:
{error_info}
请返回JSON格式的恢复策略:
{{
"strategy": "恢复策略",
"action": "恢复动作",
"fallback": "备用方案",
"user_message": "用户消息"
}}"""
        try:
            return self._request_json("你是一个专业的错误恢复策略制定器", prompt)
        except Exception:
            return {
                "strategy": "retry",
                "action": "重试操作",
                "fallback": "联系人工客服",
                "user_message": "抱歉,我遇到了一些问题,请稍后再试"
            }

    async def learn_from_errors(self) -> Dict:
        """Mine the last 50 errors for recurring patterns (needs at least 10)."""
        if len(self.error_history) < 10:
            return {
                "success": False,
                "message": "错误数据不足"
            }
        prompt = f"""请从以下错误历史中学习:
错误历史:
{self._format_errors(self.error_history[-50:])}
请返回JSON格式的学习结果:
{{
"common_errors": [
{{
"error_type": "错误类型",
"frequency": "频率",
"prevention": "预防措施"
}}
],
"improvements": ["改进建议"]
}}"""
        try:
            return self._request_json("你是一个专业的错误分析器", prompt)
        except Exception:
            return {
                "common_errors": [],
                "improvements": []
            }

    def _format_errors(self, errors: List[Dict]) -> str:
        """Render errors as '- type: message (timestamp)' lines."""
        return "\n".join([
            f"- {err['error_type']}: {err['error_message']} ({err['timestamp']})"
            for err in errors
        ])

# Next section: Practice exercises (实践练习)
练习1:实现多模态交互
python
def implement_multimodal():
    """Exercise 1: assemble the multimodal interaction components.

    NOTE(review): `openai` is never imported anywhere in this file — confirm the
    import exists wherever this snippet is actually run.
    """
    llm_client = openai.OpenAI(api_key="your-api-key")
    voice = VoiceInteraction()
    images = ImageInteraction(llm_client)
    return voice, images

# Next section: Exercise 2 — autonomous decision-making (练习2:实现自主决策)
python
def implement_autonomous_decision():
    """Exercise 2: build a DecisionEngine wired to an OpenAI client."""
    llm_client = openai.OpenAI(api_key="your-api-key")
    return DecisionEngine(llm_client)

# Next section: Exercise 3 — collaborative agents (练习3:实现协作Agent)
python
def implement_collaborative_agents():
    """Exercise 3: create an empty agent-collaboration registry."""
    return AgentCollaboration()

# Next section: Summary (总结)
本节我们学习了个人助理Agent的高级功能开发:
- 多模态交互
- 自主决策机制
- 自我优化实现
- 协作Agent实现
- 异常处理机制
高级功能使Agent更加智能和可靠。
