opendal_storage.py 2.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. import logging
  2. import os
  3. from collections.abc import Generator
  4. from pathlib import Path
  5. import opendal # type: ignore[import]
  6. from dotenv import dotenv_values
  7. from extensions.storage.base_storage import BaseStorage
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
  9. def _get_opendal_kwargs(*, scheme: str, env_file_path: str = ".env", prefix: str = "OPENDAL_"):
  10. kwargs = {}
  11. config_prefix = prefix + scheme.upper() + "_"
  12. for key, value in os.environ.items():
  13. if key.startswith(config_prefix):
  14. kwargs[key[len(config_prefix) :].lower()] = value
  15. file_env_vars: dict = dotenv_values(env_file_path) or {}
  16. for key, value in file_env_vars.items():
  17. if key.startswith(config_prefix) and key[len(config_prefix) :].lower() not in kwargs and value:
  18. kwargs[key[len(config_prefix) :].lower()] = value
  19. return kwargs
  20. class OpenDALStorage(BaseStorage):
  21. def __init__(self, scheme: str, **kwargs):
  22. kwargs = kwargs or _get_opendal_kwargs(scheme=scheme)
  23. if scheme == "fs":
  24. root = kwargs.get("root", "storage")
  25. Path(root).mkdir(parents=True, exist_ok=True)
  26. self.op = opendal.Operator(scheme=scheme, **kwargs) # type: ignore
  27. logger.debug(f"opendal operator created with scheme {scheme}")
  28. retry_layer = opendal.layers.RetryLayer(max_times=3, factor=2.0, jitter=True)
  29. self.op = self.op.layer(retry_layer)
  30. logger.debug("added retry layer to opendal operator")
  31. def save(self, filename: str, data: bytes) -> None:
  32. self.op.write(path=filename, bs=data)
  33. logger.debug(f"file {filename} saved")
  34. def load_once(self, filename: str) -> bytes:
  35. if not self.exists(filename):
  36. raise FileNotFoundError("File not found")
  37. content: bytes = self.op.read(path=filename)
  38. logger.debug(f"file {filename} loaded")
  39. return content
  40. def load_stream(self, filename: str) -> Generator:
  41. if not self.exists(filename):
  42. raise FileNotFoundError("File not found")
  43. batch_size = 4096
  44. file = self.op.open(path=filename, mode="rb")
  45. while chunk := file.read(batch_size):
  46. yield chunk
  47. logger.debug(f"file {filename} loaded as stream")
  48. def download(self, filename: str, target_filepath: str):
  49. if not self.exists(filename):
  50. raise FileNotFoundError("File not found")
  51. with Path(target_filepath).open("wb") as f:
  52. f.write(self.op.read(path=filename))
  53. logger.debug(f"file {filename} downloaded to {target_filepath}")
  54. def exists(self, filename: str) -> bool:
  55. res: bool = self.op.exists(path=filename)
  56. return res
  57. def delete(self, filename: str):
  58. if self.exists(filename):
  59. self.op.delete(path=filename)
  60. logger.debug(f"file {filename} deleted")
  61. return
  62. logger.debug(f"file {filename} not found, skip delete")