model_tool.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. from base64 import b64encode
  2. from enum import Enum
  3. from typing import Any, cast
  4. from core.model_manager import ModelInstance
  5. from core.model_runtime.entities.llm_entities import LLMResult
  6. from core.model_runtime.entities.message_entities import (
  7. PromptMessageContent,
  8. PromptMessageContentType,
  9. SystemPromptMessage,
  10. UserPromptMessage,
  11. )
  12. from core.model_runtime.model_providers.__base.large_language_model import LargeLanguageModel
  13. from core.tools.entities.tool_entities import ModelToolPropertyKey, ToolInvokeMessage, ToolProviderType
  14. from core.tools.tool.tool import Tool
# System prompt sent to the vision-capable LLM in ModelTool._invoke_llm_vision;
# it instructs the model on what to extract from the uploaded image.
VISION_PROMPT = """## Image Recognition Task
### Task Description
I require a powerful vision language model for an image recognition task. The model should be capable of extracting various details from the images, including but not limited to text content, layout distribution, color distribution, main subjects, and emotional expressions.
### Specific Requirements
1. **Text Content Extraction:** Ensure that the model accurately recognizes and extracts text content from the images, regardless of text size, font, or color.
2. **Layout Distribution Analysis:** The model should analyze the layout structure of the images, capturing the relationships between various elements and providing detailed information about the image layout.
3. **Color Distribution Analysis:** Extract information about color distribution in the images, including primary colors, color combinations, and other relevant details.
4. **Main Subject Recognition:** The model should accurately identify the main subjects in the images and provide detailed descriptions of these subjects.
5. **Emotional Expression Analysis:** Analyze and describe the emotions or expressions conveyed in the images based on facial expressions, postures, and other relevant features.
### Additional Considerations
- Ensure that the extracted information is as comprehensive and accurate as possible.
- For each task, provide confidence scores or relevance scores for the model outputs to assess the reliability of the results.
- If necessary, pose specific questions for different tasks to guide the model in better understanding the images and providing relevant information."""
  28. class ModelTool(Tool):
  29. class ModelToolType(Enum):
  30. """
  31. the type of the model tool
  32. """
  33. VISION = 'vision'
  34. model_configuration: dict[str, Any] = None
  35. tool_type: ModelToolType
  36. def __init__(self, model_instance: ModelInstance = None, model: str = None,
  37. tool_type: ModelToolType = ModelToolType.VISION,
  38. properties: dict[ModelToolPropertyKey, Any] = None,
  39. **kwargs):
  40. """
  41. init the tool
  42. """
  43. kwargs['model_configuration'] = {
  44. 'model_instance': model_instance,
  45. 'model': model,
  46. 'properties': properties
  47. }
  48. kwargs['tool_type'] = tool_type
  49. super().__init__(**kwargs)
  50. """
  51. Model tool
  52. """
  53. def fork_tool_runtime(self, meta: dict[str, Any]) -> 'Tool':
  54. """
  55. fork a new tool with meta data
  56. :param meta: the meta data of a tool call processing, tenant_id is required
  57. :return: the new tool
  58. """
  59. return self.__class__(
  60. identity=self.identity.copy() if self.identity else None,
  61. parameters=self.parameters.copy() if self.parameters else None,
  62. description=self.description.copy() if self.description else None,
  63. model_instance=self.model_configuration['model_instance'],
  64. model=self.model_configuration['model'],
  65. tool_type=self.tool_type,
  66. runtime=Tool.Runtime(**meta)
  67. )
  68. def validate_credentials(self, credentials: dict[str, Any], parameters: dict[str, Any], format_only: bool = False) -> None:
  69. """
  70. validate the credentials for Model tool
  71. """
  72. pass
  73. def tool_provider_type(self) -> ToolProviderType:
  74. return ToolProviderType.BUILT_IN
  75. def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage | list[ToolInvokeMessage]:
  76. """
  77. """
  78. model_instance = self.model_configuration['model_instance']
  79. if not model_instance:
  80. return self.create_text_message('the tool is not configured correctly')
  81. if self.tool_type == ModelTool.ModelToolType.VISION:
  82. return self._invoke_llm_vision(user_id, tool_parameters)
  83. else:
  84. return self.create_text_message('the tool is not configured correctly')
  85. def _invoke_llm_vision(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage | list[ToolInvokeMessage]:
  86. # get image
  87. image_parameter_name = self.model_configuration['properties'].get(ModelToolPropertyKey.IMAGE_PARAMETER_NAME, 'image_id')
  88. image_id = tool_parameters.pop(image_parameter_name, '')
  89. if not image_id:
  90. image = self.get_default_image_variable()
  91. if not image:
  92. return self.create_text_message('Please upload an image or input image_id')
  93. else:
  94. image = self.get_variable(image_id)
  95. if not image:
  96. image = self.get_default_image_variable()
  97. if not image:
  98. return self.create_text_message('Please upload an image or input image_id')
  99. if not image:
  100. return self.create_text_message('Please upload an image or input image_id')
  101. # get image
  102. image = self.get_variable_file(image.name)
  103. if not image:
  104. return self.create_text_message('Failed to get image')
  105. # organize prompt messages
  106. prompt_messages = [
  107. SystemPromptMessage(
  108. content=VISION_PROMPT
  109. ),
  110. UserPromptMessage(
  111. content=[
  112. PromptMessageContent(
  113. type=PromptMessageContentType.TEXT,
  114. data='Recognize the image and extract the information from the image.'
  115. ),
  116. PromptMessageContent(
  117. type=PromptMessageContentType.IMAGE,
  118. data=f'data:image/png;base64,{b64encode(image).decode("utf-8")}'
  119. )
  120. ]
  121. )
  122. ]
  123. llm_instance = cast(LargeLanguageModel, self.model_configuration['model_instance'])
  124. result: LLMResult = llm_instance.invoke(
  125. model=self.model_configuration['model'],
  126. credentials=self.runtime.credentials,
  127. prompt_messages=prompt_messages,
  128. model_parameters=tool_parameters,
  129. tools=[],
  130. stop=[],
  131. stream=False,
  132. user=user_id,
  133. )
  134. if not result:
  135. return self.create_text_message('Failed to extract information from the image')
  136. # get result
  137. content = result.message.content
  138. if not content:
  139. return self.create_text_message('Failed to extract information from the image')
  140. return self.create_text_message(content)