Ingestion
Tags: openai-chatgpt, langchain-python, chatgpt, genai, elasticsearch, elastic, openai, AI, product-store-search, chatlog, vectordatabase, Python, search, genai-stack, hybrid-search-for-an-e-commerce-product-catalogue, supporting-blog-content, vector, elasticsearch-labs, ingestion, langchain, applications
Export
Ingestion Data
You'll need to install the following libraries if they are not already installed:
[ ]
[8]
/Library/Python/3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
[ ]
[11]
Step 2: Text Vectorization using SentenceTransformers
[2]
Step 3: Read JSON file containing the dataset
[3]
Step 4: Chunk data for batch processing
[4]
Step 5: Generate bulk actions for Elasticsearch indexing
[5]
Step 6: Indexing data in batches to Elasticsearch
[6]
[10]
Batch indexed: 100 successful, [] failed
Batch indexed: 100 successful, [] failed
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) Cell In[10], line 1 ----> 1 index_data_in_batches("../files/dataset/products.json", "products-catalog-2", batch_size=100) Cell In[6], line 9, in index_data_in_batches(file_path, index_name, batch_size) 7 for batch in chunk_data(data, batch_size): 8 actions = generate_bulk_actions(index_name, batch) ----> 9 success, failed = helpers.bulk(get_client_es(), actions) 10 print(f"Batch indexed: {success} successful, {failed} failed") File /Library/Python/3.9/site-packages/elasticsearch/helpers/actions.py:531, in bulk(client, actions, stats_only, ignore_status, *args, **kwargs) 529 # make streaming_bulk yield successful results so we can count them 530 kwargs["yield_ok"] = True --> 531 for ok, item in streaming_bulk( 532 client, actions, ignore_status=ignore_status, span_name="helpers.bulk", *args, **kwargs # type: ignore[misc] 533 ): 534 # go through request-response pairs and detect failures 535 if not ok: 536 if not stats_only: File /Library/Python/3.9/site-packages/elasticsearch/helpers/actions.py:427, in streaming_bulk(client, actions, chunk_size, max_chunk_bytes, raise_on_error, expand_action_callback, raise_on_exception, max_retries, initial_backoff, max_backoff, yield_ok, ignore_status, span_name, *args, **kwargs) 420 bulk_data: List[ 421 Union[ 422 Tuple[_TYPE_BULK_ACTION_HEADER], 423 Tuple[_TYPE_BULK_ACTION_HEADER, _TYPE_BULK_ACTION_BODY], 424 ] 425 ] 426 bulk_actions: List[bytes] --> 427 for bulk_data, bulk_actions in _chunk_actions( 428 map(expand_action_callback, actions), 429 chunk_size, 430 max_chunk_bytes, 431 serializer, 432 ): 433 for attempt in range(max_retries + 1): 434 to_retry: List[bytes] = [] File /Library/Python/3.9/site-packages/elasticsearch/helpers/actions.py:234, in _chunk_actions(actions, chunk_size, max_chunk_bytes, serializer) 227 """ 228 Split actions into chunks by number or size, serialize them into strings in 
229 the process. 230 """ 231 chunker = _ActionChunker( 232 chunk_size=chunk_size, max_chunk_bytes=max_chunk_bytes, serializer=serializer 233 ) --> 234 for action, data in actions: 235 ret = chunker.feed(action, data) 236 if ret: Cell In[5], line 8, in generate_bulk_actions(index_name, data_batch) 6 for item in data_batch: 7 document_id = item['id'] ----> 8 item['description_embeddings'] = get_text_vector(item['description']) 9 yield { 10 "_index": index_name, 11 "_id": document_id, 12 "_source": item 13 } Cell In[2], line 5, in get_text_vector(sentences) 1 def get_text_vector(sentences): 2 """ 3 Generates sentence embeddings using pre-trained model 'all-MiniLM-L6-v2'. 4 """ ----> 5 model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') 6 embeddings = model.encode(sentences) 7 return embeddings File /Library/Python/3.9/site-packages/sentence_transformers/SentenceTransformer.py:95, in SentenceTransformer.__init__(self, model_name_or_path, modules, device, cache_folder, use_auth_token) 87 snapshot_download(model_name_or_path, 88 cache_dir=cache_folder, 89 library_name='sentence-transformers', 90 library_version=__version__, 91 ignore_files=['flax_model.msgpack', 'rust_model.ot', 'tf_model.h5'], 92 use_auth_token=use_auth_token) 94 if os.path.exists(os.path.join(model_path, 'modules.json')): #Load as SentenceTransformer model ---> 95 modules = self._load_sbert_model(model_path) 96 else: #Load with AutoModel 97 modules = self._load_auto_model(model_path) File /Library/Python/3.9/site-packages/sentence_transformers/SentenceTransformer.py:840, in SentenceTransformer._load_sbert_model(self, model_path) 838 for module_config in modules_config: 839 module_class = import_from_string(module_config['type']) --> 840 module = module_class.load(os.path.join(model_path, module_config['path'])) 841 modules[module_config['name']] = module 843 return modules File /Library/Python/3.9/site-packages/sentence_transformers/models/Transformer.py:137, in 
Transformer.load(input_path) 135 with open(sbert_config_path) as fIn: 136 config = json.load(fIn) --> 137 return Transformer(model_name_or_path=input_path, **config) File /Library/Python/3.9/site-packages/sentence_transformers/models/Transformer.py:29, in Transformer.__init__(self, model_name_or_path, max_seq_length, model_args, cache_dir, tokenizer_args, do_lower_case, tokenizer_name_or_path) 26 self.do_lower_case = do_lower_case 28 config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir) ---> 29 self._load_model(model_name_or_path, config, cache_dir) 31 self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path, cache_dir=cache_dir, **tokenizer_args) 33 #No max_seq_length set. Try to infer from model File /Library/Python/3.9/site-packages/sentence_transformers/models/Transformer.py:49, in Transformer._load_model(self, model_name_or_path, config, cache_dir) 47 self._load_t5_model(model_name_or_path, config, cache_dir) 48 else: ---> 49 self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir) File /Library/Python/3.9/site-packages/transformers/models/auto/auto_factory.py:463, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs) 461 elif type(config) in cls._model_mapping.keys(): 462 model_class = _get_model_class(config, cls._model_mapping) --> 463 return model_class.from_pretrained( 464 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs 465 ) 466 raise ValueError( 467 f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" 468 f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." 
469 ) File /Library/Python/3.9/site-packages/transformers/modeling_utils.py:2228, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs) 2225 init_contexts.append(init_empty_weights()) 2227 with ContextManagers(init_contexts): -> 2228 model = cls(config, *model_args, **model_kwargs) 2230 if load_in_8bit: 2231 from .utils.bitsandbytes import get_keys_to_not_convert, replace_8bit_linear File /Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:884, in BertModel.__init__(self, config, add_pooling_layer) 881 self.config = config 883 self.embeddings = BertEmbeddings(config) --> 884 self.encoder = BertEncoder(config) 886 self.pooler = BertPooler(config) if add_pooling_layer else None 888 # Initialize weights and apply final processing File /Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:552, in BertEncoder.__init__(self, config) 550 super().__init__() 551 self.config = config --> 552 self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) 553 self.gradient_checkpointing = False File /Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:552, in <listcomp>(.0) 550 super().__init__() 551 self.config = config --> 552 self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) 553 self.gradient_checkpointing = False File /Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:474, in BertLayer.__init__(self, config) 472 raise ValueError(f"{self} should be used as a decoder model if cross attention is added") 473 self.crossattention = BertAttention(config, position_embedding_type="absolute") --> 474 self.intermediate = BertIntermediate(config) 475 self.output = BertOutput(config) File /Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:436, in BertIntermediate.__init__(self, config) 434 def __init__(self, config): 435 super().__init__() --> 436 self.dense = 
nn.Linear(config.hidden_size, config.intermediate_size) 437 if isinstance(config.hidden_act, str): 438 self.intermediate_act_fn = ACT2FN[config.hidden_act] File /Library/Python/3.9/site-packages/torch/nn/modules/linear.py:104, in Linear.__init__(self, in_features, out_features, bias, device, dtype) 102 else: 103 self.register_parameter('bias', None) --> 104 self.reset_parameters() File /Library/Python/3.9/site-packages/torch/nn/modules/linear.py:110, in Linear.reset_parameters(self) 106 def reset_parameters(self) -> None: 107 # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with 108 # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see 109 # https://github.com/pytorch/pytorch/issues/57109 --> 110 init.kaiming_uniform_(self.weight, a=math.sqrt(5)) 111 if self.bias is not None: 112 fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) File /Library/Python/3.9/site-packages/torch/nn/init.py:460, in kaiming_uniform_(tensor, a, mode, nonlinearity, generator) 458 bound = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation 459 with torch.no_grad(): --> 460 return tensor.uniform_(-bound, bound, generator=generator) KeyboardInterrupt: