clean_processor.py 1.5 KB

123456789101112131415161718192021222324252627282930313233343536373839
  1. import re
  2. class CleanProcessor:
  3. @classmethod
  4. def clean(cls, text: str, process_rule: dict) -> str:
  5. # default clean
  6. # remove invalid symbol
  7. text = re.sub(r'<\|', '<', text)
  8. text = re.sub(r'\|>', '>', text)
  9. text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\xEF\xBF\xBE]', '', text)
  10. # Unicode U+FFFE
  11. text = re.sub('\uFFFE', '', text)
  12. rules = process_rule['rules'] if process_rule else None
  13. if 'pre_processing_rules' in rules:
  14. pre_processing_rules = rules["pre_processing_rules"]
  15. for pre_processing_rule in pre_processing_rules:
  16. if pre_processing_rule["id"] == "remove_extra_spaces" and pre_processing_rule["enabled"] is True:
  17. # Remove extra spaces
  18. pattern = r'\n{3,}'
  19. text = re.sub(pattern, '\n\n', text)
  20. pattern = r'[\t\f\r\x20\u00a0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]{2,}'
  21. text = re.sub(pattern, ' ', text)
  22. elif pre_processing_rule["id"] == "remove_urls_emails" and pre_processing_rule["enabled"] is True:
  23. # Remove email
  24. pattern = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
  25. text = re.sub(pattern, '', text)
  26. # Remove URL
  27. pattern = r'https?://[^\s]+'
  28. text = re.sub(pattern, '', text)
  29. return text
  30. def filter_string(self, text):
  31. return text