簡単なウェブ検索エンジンを作りたくなったので、Weaviate を試してみることにしました。
以前、Dify インストール時に使用した Milvus も同じベクトルデータベースですが、Weaviate は例えば、ページの作成日、URLなどといったテキスト以外の付加情報(メタデータ)も同時に管理することができるため、検索エンジン用に向いています。
Docker Compose の作成
Weaviate 公式ドキュメントの Docker Compose インストールページにアクセスすると Configurator
というDocker Composeを生成するためのツールが提供されています。これを使って好みのDocker Compose を生成することができます。
今回は、
https://configuration.weaviate.io/v2/docker-compose/docker-compose.yml?generative_anyscale=false&generative_aws=false&generative_cohere=false&generative_mistral=false&generative_octoai=false&generative_ollama=false&generative_openai=false&generative_palm=false&gpu_support=true&media_type=text&modules=modules&ner_module=false&qna_module=false&ref2vec_centroid=false&reranker_cohere=false&reranker_transformers=false&runtime=docker-compose&spellcheck_module=false&sum_module=true&sum_module_model=facebook-bart-large-cnn-1.0.0&text_module=text2vec-transformers&transformers_model=sentence-transformers-paraphrase-multilingual-mpnet-base-v2&weaviate_version=v1.25.4&weaviate_volume=named-volume
---
# Weaviate 1.25.4 with the text2vec-transformers and sum-transformers
# modules; both inference containers are configured to use NVIDIA GPUs.
services:
  weaviate:
    command:
      - --host
      - 0.0.0.0
      - --port
      - '8080'
      - --scheme
      - http
    image: cr.weaviate.io/semitechnologies/weaviate:1.25.4
    ports:
      # Port mappings are quoted so they are always read as strings.
      - '8080:8080'
      - '50051:50051'
    volumes:
      - weaviate_data:/var/lib/weaviate
    restart: on-failure:0
    environment:
      # Service names below must match the sibling services in this file.
      TRANSFORMERS_INFERENCE_API: 'http://t2v-transformers:8080'
      SUM_INFERENCE_API: 'http://sum-transformers:8080'
      QUERY_DEFAULTS_LIMIT: '25'
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      # Same path as the weaviate_data mount target above.
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'text2vec-transformers'
      ENABLE_MODULES: 'text2vec-transformers,sum-transformers'
      CLUSTER_HOSTNAME: 'node1'
  t2v-transformers:
    image: cr.weaviate.io/semitechnologies/transformers-inference:sentence-transformers-paraphrase-multilingual-mpnet-base-v2
    environment:
      ENABLE_CUDA: '1'
      NVIDIA_VISIBLE_DEVICES: 'all'
    deploy:
      resources:
        reservations:
          devices:
            - capabilities:
                - 'gpu'
  sum-transformers:
    image: cr.weaviate.io/semitechnologies/sum-transformers:facebook-bart-large-cnn-1.0.0
    environment:
      ENABLE_CUDA: '1'
      NVIDIA_VISIBLE_DEVICES: 'all'
    deploy:
      resources:
        reservations:
          devices:
            - capabilities:
                - 'gpu'
volumes:
  weaviate_data:
...
このように作りました。上記は、Nvidia の GPUを使う設定になっています。
ただ、一旦、M1 Mac で試したいので、Nvidia GPUなしの
https://configuration.weaviate.io/v2/docker-compose/docker-compose.yml?generative_anyscale=false&generative_aws=false&generative_cohere=false&generative_mistral=false&generative_octoai=false&generative_ollama=false&generative_openai=false&generative_palm=false&gpu_support=false&media_type=text&modules=modules&ner_module=false&qna_module=false&ref2vec_centroid=false&reranker_cohere=false&reranker_transformers=false&runtime=docker-compose&spellcheck_module=false&sum_module=false&text_module=text2vec-transformers&transformers_model=sentence-transformers-paraphrase-multilingual-mpnet-base-v2&weaviate_version=v1.25.4&weaviate_volume=named-volume
---
# Weaviate 1.25.4 with the text2vec-transformers module only, CUDA disabled
# (no NVIDIA GPU required).
services:
  weaviate:
    command:
      - --host
      - 0.0.0.0
      - --port
      - '8080'
      - --scheme
      - http
    image: cr.weaviate.io/semitechnologies/weaviate:1.25.4
    ports:
      - '8080:8080'
      # Host port 50052 -> container port 50051 (host port 50051 is already
      # taken by launchd on macOS).
      - '50052:50051'
    volumes:
      - weaviate_data:/var/lib/weaviate
    restart: on-failure:0
    environment:
      # Service name below must match the sibling service in this file.
      TRANSFORMERS_INFERENCE_API: 'http://t2v-transformers:8080'
      QUERY_DEFAULTS_LIMIT: '25'
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      # Same path as the weaviate_data mount target above.
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'text2vec-transformers'
      ENABLE_MODULES: 'text2vec-transformers'
      CLUSTER_HOSTNAME: 'node1'
  t2v-transformers:
    image: cr.weaviate.io/semitechnologies/transformers-inference:sentence-transformers-paraphrase-multilingual-mpnet-base-v2
    environment:
      ENABLE_CUDA: '0'
volumes:
  weaviate_data:
...
こちらを用いて、試してみます。(Macでは、ポート50051はlaunchdが使用しているため50052に変更しています)
Docker Compose で起動する
起動確認
ブラウザで http://localhost:8080 にアクセスすると、
このように表示されていれば、起動成功です
テスト
データを追加
curl -X POST http://localhost:8080/v1/objects \
-H 'Content-Type: application/json' \
-d '{
"class": "Article",
"properties": {
"title": "Persistent Data Test",
"content": "This data should persist across container restarts."
}
}'
{"class":"Article","creationTimeUnix":1720734914608,"id":"a19bddb0-3ea1-4865-a1aa-fe77bc5efc9a","lastUpdateTimeUnix":1720734914608,"properties":{"content":"This data should persist across container restarts.","title":"Persistent Data Test"},"vector":[-0.12931107,0.07554765,-0.008322654,0.031988136,0.05109657,0.046475142,0.086101204...]}
データ取得
$ curl -X POST http://localhost:8080/v1/graphql \
-H 'Content-Type: application/json' \
-d '{"query":"{ Get { Article { title content _additional { id } } } }"}'
{
"data": {
"Get": {
"Article": [
{
"_additional": {
"id": "a19bddb0-3ea1-4865-a1aa-fe77bc5efc9a"
},
"content": "This data should persist across container restarts.",
"title": "Persistent Data Test"
}
]
}
}
}
取得件数を指定したいときは、クエリに limit を指定します。
省略した場合は環境変数 QUERY_DEFAULTS_LIMIT の値が使われます
(上記の Docker Compose では 25 を設定しています)。
$ curl -X POST http://localhost:8080/v1/graphql \
-H 'Content-Type: application/json' \
-d '{"query":"{ Get { Article(limit: 100, offset: 0) { title content _additional { id } } } }"}'
{
"data": {
"Get": {
"Article": [
{
"_additional": {
"id": "a19bddb0-3ea1-4865-a1aa-fe77bc5efc9a"
},
"content": "This data should persist across container restarts.",
"title": "Persistent Data Test"
}
]
}
}
}
データ検索(完全一致)
$ curl -X POST http://localhost:8080/v1/graphql \
-H 'Content-Type: application/json' \
-d '{"query":"{ Get { Article(where: { path: [\"title\"], operator: Equal, valueText: \"Persistent Data Test\" }) { title content _additional { id } } } }"}'
{
"data": {
"Get": {
"Article": [
{
"_additional": {
"id": "a19bddb0-3ea1-4865-a1aa-fe77bc5efc9a"
},
"content": "This data should persist across container restarts.",
"title": "Persistent Data Test"
}
]
}
}
}
データ検索(ベクトル検索=類似検索)
curl -X POST http://localhost:8080/v1/graphql \
-H 'Content-Type: application/json' \
-d '{"query":"{ Get { Article(nearText: {concepts: [\"Persistent Data Test\"]}) { title content _additional { id } } } }"}'
{
"data": {
"Get": {
"Article": [
{
"_additional": {
"id": "a19bddb0-3ea1-4865-a1aa-fe77bc5efc9a"
},
"content": "This data should persist across container restarts.",
"title": "Persistent Data Test"
}
]
}
}
}
データ更新
curl -X PUT "http://localhost:8080/v1/objects/Article/<object-id>" \
-H 'Content-Type: application/json' \
-d '{
"class": "Article",
"properties": {
"title": "Updated Persistent Data Test",
"content": "This data has been updated and should persist across container restarts."
}
}'
データ削除
$ curl -X DELETE http://localhost:8080/v1/objects/Article/a19bddb0-3ea1-4865-a1aa-fe77bc5efc9a
一通り、M1 Mac 上で動きました🍺
スキーマに何件のデータが登録されているか確認
# Count the objects stored in the WebPage class.
# The API key is read from the WEAVIATE_API_KEY environment variable instead of
# being pasted into the command; the :? expansion aborts if it is not set.
curl -X POST http://localhost:8080/v1/graphql \
  -H "Authorization: Bearer ${WEAVIATE_API_KEY:?export WEAVIATE_API_KEY first}" \
  -H 'Content-Type: application/json' \
  -d '{
    "query": "{ Aggregate { WebPage { meta { count } } } }"
  }' | jq
距離指定で検索
# nearText search on WebPage, keeping at most 5 hits whose distance to the
# query vector is <= 0.5.
# "count" was removed from _additional: it is not an _additional field (object
# counts come from the Aggregate query), so the request was rejected.
# The API key is read from WEAVIATE_API_KEY; the :? expansion aborts if unset.
curl -X POST http://localhost:8080/v1/graphql \
  -H "Authorization: Bearer ${WEAVIATE_API_KEY:?export WEAVIATE_API_KEY first}" \
  -H 'Content-Type: application/json' \
  -d '{"query":"{ Get { WebPage(limit:5, nearText: {concepts: [\"ゲーム\"], distance: 0.5}) { date title source image _additional { id distance } } } }"}' | jq
スキーマ定義
# Create the NewsArticle class. Only title and raw_text are vectorized; every
# other property is marked skip=true so it is stored as metadata only.
# NOTE: the per-property moduleConfig must be keyed by the vectorizer that is
# actually enabled (text2vec-transformers in the Docker Compose file). With a
# "text2vec-contextionary" key the skip flags are stored but not applied: the
# schema listing then reports "text2vec-transformers": {"skip": false} for
# every property.
$ curl -X POST "http://localhost:8080/v1/schema" \
  -H "Content-Type: application/json" \
  -d '{
  "class": "NewsArticle",
  "properties": [
    {"name": "title",           "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": false}}},
    {"name": "raw_text",        "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": false}}},
    {"name": "excerpt",         "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "author",          "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "hostname",        "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "date",            "dataType": ["date"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "fingerprint",     "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "license",         "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "comments",        "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "language",        "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "image",           "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "pagetype",        "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "filedate",        "dataType": ["date"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "source",          "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "categories",      "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "tags",            "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": true}}},
    {"name": "source_hostname", "dataType": ["text"], "moduleConfig": {"text2vec-transformers": {"skip": true}}}
  ]
}'
スキーマ一覧
$ curl -X GET "http://localhost:8080/v1/schema"
100 6785 0 6785 0 0 2018k 0 --:--:-- --:--:-- --:--:-- 2208k
{
"classes": [
{
"class": "NewsArticle",
"invertedIndexConfig": {
"bm25": {
"b": 0.75,
"k1": 1.2
},
"cleanupIntervalSeconds": 60,
"stopwords": {
"additions": null,
"preset": "en",
"removals": null
}
},
"moduleConfig": {
"text2vec-transformers": {
"poolingStrategy": "masked_mean",
"vectorizeClassName": true
}
},
"multiTenancyConfig": {
"autoTenantActivation": false,
"autoTenantCreation": false,
"enabled": false
},
"properties": [
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": false
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "title",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": false
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "raw_text",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "excerpt",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "author",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "hostname",
"tokenization": "word"
},
{
"dataType": [
"date"
],
"indexFilterable": true,
"indexSearchable": false,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "date"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "fingerprint",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "license",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "comments",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "language",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "image",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "pagetype",
"tokenization": "word"
},
{
"dataType": [
"date"
],
"indexFilterable": true,
"indexSearchable": false,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "filedate"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "source",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "categories",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "tags",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-contextionary": {
"skip": true
},
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "source_hostname",
"tokenization": "word"
}
],
"replicationConfig": {
"factor": 1
},
"shardingConfig": {
"actualCount": 1,
"actualVirtualCount": 128,
"desiredCount": 1,
"desiredVirtualCount": 128,
"function": "murmur3",
"key": "_id",
"strategy": "hash",
"virtualPerPhysical": 128
},
"vectorIndexConfig": {
"bq": {
"enabled": false
},
"cleanupIntervalSeconds": 300,
"distance": "cosine",
"dynamicEfFactor": 8,
"dynamicEfMax": 500,
"dynamicEfMin": 100,
"ef": -1,
"efConstruction": 128,
"flatSearchCutoff": 40000,
"maxConnections": 64,
"pq": {
"bitCompression": false,
"centroids": 256,
"enabled": false,
"encoder": {
"distribution": "log-normal",
"type": "kmeans"
},
"segments": 0,
"trainingLimit": 100000
},
"skip": false,
"vectorCacheMaxObjects": 1000000000000
},
"vectorIndexType": "hnsw",
"vectorizer": "text2vec-transformers"
},
{
"class": "Article",
"description": "This property was generated by Weaviate's auto-schema feature on Thu Jul 11 21:55:14 2024",
"invertedIndexConfig": {
"bm25": {
"b": 0.75,
"k1": 1.2
},
"cleanupIntervalSeconds": 60,
"stopwords": {
"additions": null,
"preset": "en",
"removals": null
}
},
"moduleConfig": {
"text2vec-transformers": {
"poolingStrategy": "masked_mean",
"vectorizeClassName": true
}
},
"multiTenancyConfig": {
"autoTenantActivation": false,
"autoTenantCreation": false,
"enabled": false
},
"properties": [
{
"dataType": [
"text"
],
"description": "This property was generated by Weaviate's auto-schema feature on Thu Jul 11 21:55:14 2024",
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "title",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"description": "This property was generated by Weaviate's auto-schema feature on Thu Jul 11 21:55:14 2024",
"indexFilterable": true,
"indexSearchable": true,
"moduleConfig": {
"text2vec-transformers": {
"skip": false,
"vectorizePropertyName": false
}
},
"name": "content",
"tokenization": "word"
}
],
"replicationConfig": {
"factor": 1
},
"shardingConfig": {
"actualCount": 1,
"actualVirtualCount": 128,
"desiredCount": 1,
"desiredVirtualCount": 128,
"function": "murmur3",
"key": "_id",
"strategy": "hash",
"virtualPerPhysical": 128
},
"vectorIndexConfig": {
"bq": {
"enabled": false
},
"cleanupIntervalSeconds": 300,
"distance": "cosine",
"dynamicEfFactor": 8,
"dynamicEfMax": 500,
"dynamicEfMin": 100,
"ef": -1,
"efConstruction": 128,
"flatSearchCutoff": 40000,
"maxConnections": 64,
"pq": {
"bitCompression": false,
"centroids": 256,
"enabled": false,
"encoder": {
"distribution": "log-normal",
"type": "kmeans"
},
"segments": 0,
"trainingLimit": 100000
},
"skip": false,
"vectorCacheMaxObjects": 1000000000000
},
"vectorIndexType": "hnsw",
"vectorizer": "text2vec-transformers"
}
]
}
スキーマ削除
$ curl -X DELETE "http://localhost:8080/v1/schema/NewsArticle"
スキーマ変更
既存プロパティの定義(データ型など)は後から変更できないため、
- データエクスポート
- スキーマ削除
- スキーマ再作成
- データインポート
の順で行う。
データエクスポート
# Export every Article object to data.json as a single GraphQL response.

# Fetch the Article class definition and collect its property names
# (space-separated, ready to be used as the GraphQL selection set).
schema=$(curl -sS -X GET "http://localhost:8080/v1/schema/Article")
fields=$(printf '%s' "$schema" | jq -r '[.properties[].name] | join(" ")')

# Keep the query on one line and pass an explicit limit: without it only
# QUERY_DEFAULTS_LIMIT objects are returned and the rest would be lost once the
# class is deleted.
# NOTE(review): 10000 is assumed to be within the server's QUERY_MAXIMUM_RESULTS
# cap — confirm for your deployment, and page with offset for larger classes.
query="{ Get { Article(limit: 10000) { $fields } } }"

# Let jq build the request body so the query is correctly JSON-escaped
# (raw newlines or quotes inside a JSON string make the body invalid).
jq -n --arg q "$query" '{query: $q}' |
  curl -sS -X POST "http://localhost:8080/v1/graphql" \
    -H "Content-Type: application/json" \
    -d @- > data.json
データのインポート
# Re-create the exported Article objects.
# data.json holds one GraphQL response ({"data":{"Get":{"Article":[...]}}}),
# not one object per line, so each exported Article is first converted into
# its own /v1/objects payload ({"class": ..., "properties": ...}), one per line.
jq -c '.data.Get.Article[] | {class: "Article", properties: .}' data.json |
while IFS= read -r object; do
  curl -sS -X POST "http://localhost:8080/v1/objects" \
    -H "Content-Type: application/json" \
    -d "$object"
done