Browse Source

update clean embedding cache query logic (#6483)

Jyong 9 months ago
parent
commit
1e0e573165

+ 32 - 0
api/migrations/versions/6e957a32015b_add_embedding_cache_created_at_index.py

@@ -0,0 +1,32 @@
+"""add-embedding-cache-created_at_index
+
+Revision ID: 6e957a32015b
+Revises: fecff1c3da27
+Create Date: 2024-07-19 17:21:34.414705
+
+"""
+from alembic import op
+
+import models as models
+
+# revision identifiers, used by Alembic.
+revision = '6e957a32015b'
+down_revision = 'fecff1c3da27'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():  # forward migration: add the created_at_idx index to the embeddings table
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('embeddings', schema=None) as batch_op:
+        batch_op.create_index('created_at_idx', ['created_at'], unique=False)  # non-unique index on the created_at column
+
+    # ### end Alembic commands ###
+
+
+def downgrade():  # reverse migration: drop the index that upgrade() creates
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('embeddings', schema=None) as batch_op:
+        batch_op.drop_index('created_at_idx')  # same index name that upgrade() passes to create_index
+
+    # ### end Alembic commands ###

+ 2 - 1
api/models/dataset.py

@@ -630,7 +630,8 @@ class Embedding(db.Model):
     __tablename__ = 'embeddings'
     __table_args__ = (
         db.PrimaryKeyConstraint('id', name='embedding_pkey'),
-        db.UniqueConstraint('model_name', 'hash', 'provider_name', name='embedding_hash_idx')
+        db.UniqueConstraint('model_name', 'hash', 'provider_name', name='embedding_hash_idx'),
+        db.Index('created_at_idx', 'created_at')
     )
 
     id = db.Column(StringUUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))

+ 1 - 1
api/models/model.py

@@ -1383,7 +1383,7 @@ class TraceAppConfig(db.Model):
     __tablename__ = 'trace_app_config'
     __table_args__ = (
         db.PrimaryKeyConstraint('id', name='tracing_app_config_pkey'),
-        db.Index('tracing_app_config_app_id_idx', 'app_id'),
+        db.Index('trace_app_config_app_id_idx', 'app_id'),
     )
 
     id = db.Column(StringUUID, server_default=db.text('uuid_generate_v4()'))

+ 11 - 4
api/schedule/clean_embedding_cache_task.py

@@ -2,6 +2,7 @@ import datetime
 import time
 
 import click
+from sqlalchemy import text
 from werkzeug.exceptions import NotFound
 
 import app
@@ -18,12 +19,18 @@ def clean_embedding_cache_task():
     thirty_days_ago = datetime.datetime.now() - datetime.timedelta(days=clean_days)
     while True:
         try:
-            embeddings = db.session.query(Embedding).filter(Embedding.created_at < thirty_days_ago) \
+            embedding_ids = db.session.query(Embedding.id).filter(Embedding.created_at < thirty_days_ago) \
                 .order_by(Embedding.created_at.desc()).limit(100).all()
+            embedding_ids = [embedding_id[0] for embedding_id in embedding_ids]
         except NotFound:
             break
-        for embedding in embeddings:
-            db.session.delete(embedding)
-        db.session.commit()
+        if embedding_ids:
+            db.session.execute(text(
+                "DELETE FROM embeddings WHERE id in :embedding_ids"
+            ), {'embedding_ids': tuple(embedding_ids)})
+
+            db.session.commit()
+        else:
+            break
     end_at = time.perf_counter()
     click.echo(click.style('Cleaned embedding cache from db success latency: {}'.format(end_at - start_at), fg='green'))