|  | @@ -17,6 +17,7 @@ from core.errors.error import ProviderTokenNotInitError
 | 
	
		
			
				|  |  |  from core.llm_generator.llm_generator import LLMGenerator
 | 
	
		
			
				|  |  |  from core.model_manager import ModelInstance, ModelManager
 | 
	
		
			
				|  |  |  from core.model_runtime.entities.model_entities import ModelType
 | 
	
		
			
				|  |  | +from core.rag.cleaner.clean_processor import CleanProcessor
 | 
	
		
			
				|  |  |  from core.rag.datasource.keyword.keyword_factory import Keyword
 | 
	
		
			
				|  |  |  from core.rag.docstore.dataset_docstore import DatasetDocumentStore
 | 
	
		
			
				|  |  |  from core.rag.extractor.entity.extract_setting import ExtractSetting
 | 
	
	
		
			
				|  | @@ -597,26 +598,9 @@ class IndexingRunner:
 | 
	
		
			
				|  |  |              rules = DatasetProcessRule.AUTOMATIC_RULES
 | 
	
		
			
				|  |  |          else:
 | 
	
		
			
				|  |  |              rules = json.loads(processing_rule.rules) if processing_rule.rules else {}
 | 
	
		
			
				|  |  | +        document_text = CleanProcessor.clean(text, rules)
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -        if "pre_processing_rules" in rules:
 | 
	
		
			
				|  |  | -            pre_processing_rules = rules["pre_processing_rules"]
 | 
	
		
			
				|  |  | -            for pre_processing_rule in pre_processing_rules:
 | 
	
		
			
				|  |  | -                if pre_processing_rule["id"] == "remove_extra_spaces" and pre_processing_rule["enabled"] is True:
 | 
	
		
			
				|  |  | -                    # Remove extra spaces
 | 
	
		
			
				|  |  | -                    pattern = r"\n{3,}"
 | 
	
		
			
				|  |  | -                    text = re.sub(pattern, "\n\n", text)
 | 
	
		
			
				|  |  | -                    pattern = r"[\t\f\r\x20\u00a0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]{2,}"
 | 
	
		
			
				|  |  | -                    text = re.sub(pattern, " ", text)
 | 
	
		
			
				|  |  | -                elif pre_processing_rule["id"] == "remove_urls_emails" and pre_processing_rule["enabled"] is True:
 | 
	
		
			
				|  |  | -                    # Remove email
 | 
	
		
			
				|  |  | -                    pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
 | 
	
		
			
				|  |  | -                    text = re.sub(pattern, "", text)
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -                    # Remove URL
 | 
	
		
			
				|  |  | -                    pattern = r"https?://[^\s]+"
 | 
	
		
			
				|  |  | -                    text = re.sub(pattern, "", text)
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -        return text
 | 
	
		
			
				|  |  | +        return document_text
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      @staticmethod
 | 
	
		
			
				|  |  |      def format_split_text(text):
 |