Ingestion

Tags: openai, chatgpt, langchain, python, genai, elasticsearch, elastic, AI, product-store-search, chatlog, vector, database, search, genai-stack, hybrid-search-for-an-e-commerce-product-catalogue, supporting-blog-content, elasticsearch-labs, ingestion, applications

Data Ingestion

You'll need to install the following libraries if they are not already installed:

[ ]
[8]
/Library/Python/3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
[ ]
[11]

Step 2: Text Vectorization using SentenceTransformers

[2]

Step 3: Read JSON file containing the dataset

[3]

Step 4: Chunk data for batch processing

[4]

Step 5: Generate bulk actions for Elasticsearch indexing

[5]

Step 6: Indexing data in batches to Elasticsearch

[6]
[10]
Batch indexed: 100 successful, [] failed
Batch indexed: 100 successful, [] failed
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[10], line 1
----> 1 index_data_in_batches("../files/dataset/products.json", "products-catalog-2", batch_size=100)

Cell In[6], line 9, in index_data_in_batches(file_path, index_name, batch_size)
      7 for batch in chunk_data(data, batch_size):
      8     actions = generate_bulk_actions(index_name, batch)
----> 9     success, failed = helpers.bulk(get_client_es(), actions)
     10     print(f"Batch indexed: {success} successful, {failed} failed")

File /Library/Python/3.9/site-packages/elasticsearch/helpers/actions.py:531, in bulk(client, actions, stats_only, ignore_status, *args, **kwargs)
    529 # make streaming_bulk yield successful results so we can count them
    530 kwargs["yield_ok"] = True
--> 531 for ok, item in streaming_bulk(
    532     client, actions, ignore_status=ignore_status, span_name="helpers.bulk", *args, **kwargs  # type: ignore[misc]
    533 ):
    534     # go through request-response pairs and detect failures
    535     if not ok:
    536         if not stats_only:

File /Library/Python/3.9/site-packages/elasticsearch/helpers/actions.py:427, in streaming_bulk(client, actions, chunk_size, max_chunk_bytes, raise_on_error, expand_action_callback, raise_on_exception, max_retries, initial_backoff, max_backoff, yield_ok, ignore_status, span_name, *args, **kwargs)
    420 bulk_data: List[
    421     Union[
    422         Tuple[_TYPE_BULK_ACTION_HEADER],
    423         Tuple[_TYPE_BULK_ACTION_HEADER, _TYPE_BULK_ACTION_BODY],
    424     ]
    425 ]
    426 bulk_actions: List[bytes]
--> 427 for bulk_data, bulk_actions in _chunk_actions(
    428     map(expand_action_callback, actions),
    429     chunk_size,
    430     max_chunk_bytes,
    431     serializer,
    432 ):
    433     for attempt in range(max_retries + 1):
    434         to_retry: List[bytes] = []

File /Library/Python/3.9/site-packages/elasticsearch/helpers/actions.py:234, in _chunk_actions(actions, chunk_size, max_chunk_bytes, serializer)
    227 """
    228 Split actions into chunks by number or size, serialize them into strings in
    229 the process.
    230 """
    231 chunker = _ActionChunker(
    232     chunk_size=chunk_size, max_chunk_bytes=max_chunk_bytes, serializer=serializer
    233 )
--> 234 for action, data in actions:
    235     ret = chunker.feed(action, data)
    236     if ret:

Cell In[5], line 8, in generate_bulk_actions(index_name, data_batch)
      6 for item in data_batch:
      7     document_id = item['id']
----> 8     item['description_embeddings'] = get_text_vector(item['description'])
      9     yield {
     10         "_index": index_name,
     11         "_id": document_id,
     12         "_source": item
     13     }

Cell In[2], line 5, in get_text_vector(sentences)
      1 def get_text_vector(sentences):
      2     """
      3     Generates sentence embeddings using pre-trained model 'all-MiniLM-L6-v2'.
      4     """
----> 5     model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
      6     embeddings = model.encode(sentences)
      7     return embeddings

File /Library/Python/3.9/site-packages/sentence_transformers/SentenceTransformer.py:95, in SentenceTransformer.__init__(self, model_name_or_path, modules, device, cache_folder, use_auth_token)
     87         snapshot_download(model_name_or_path,
     88                             cache_dir=cache_folder,
     89                             library_name='sentence-transformers',
     90                             library_version=__version__,
     91                             ignore_files=['flax_model.msgpack', 'rust_model.ot', 'tf_model.h5'],
     92                             use_auth_token=use_auth_token)
     94 if os.path.exists(os.path.join(model_path, 'modules.json')):    #Load as SentenceTransformer model
---> 95     modules = self._load_sbert_model(model_path)
     96 else:   #Load with AutoModel
     97     modules = self._load_auto_model(model_path)

File /Library/Python/3.9/site-packages/sentence_transformers/SentenceTransformer.py:840, in SentenceTransformer._load_sbert_model(self, model_path)
    838 for module_config in modules_config:
    839     module_class = import_from_string(module_config['type'])
--> 840     module = module_class.load(os.path.join(model_path, module_config['path']))
    841     modules[module_config['name']] = module
    843 return modules

File /Library/Python/3.9/site-packages/sentence_transformers/models/Transformer.py:137, in Transformer.load(input_path)
    135 with open(sbert_config_path) as fIn:
    136     config = json.load(fIn)
--> 137 return Transformer(model_name_or_path=input_path, **config)

File /Library/Python/3.9/site-packages/sentence_transformers/models/Transformer.py:29, in Transformer.__init__(self, model_name_or_path, max_seq_length, model_args, cache_dir, tokenizer_args, do_lower_case, tokenizer_name_or_path)
     26 self.do_lower_case = do_lower_case
     28 config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
---> 29 self._load_model(model_name_or_path, config, cache_dir)
     31 self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path, cache_dir=cache_dir, **tokenizer_args)
     33 #No max_seq_length set. Try to infer from model

File /Library/Python/3.9/site-packages/sentence_transformers/models/Transformer.py:49, in Transformer._load_model(self, model_name_or_path, config, cache_dir)
     47     self._load_t5_model(model_name_or_path, config, cache_dir)
     48 else:
---> 49     self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)

File /Library/Python/3.9/site-packages/transformers/models/auto/auto_factory.py:463, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
    461 elif type(config) in cls._model_mapping.keys():
    462     model_class = _get_model_class(config, cls._model_mapping)
--> 463     return model_class.from_pretrained(
    464         pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
    465     )
    466 raise ValueError(
    467     f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
    468     f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
    469 )

File /Library/Python/3.9/site-packages/transformers/modeling_utils.py:2228, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
   2225     init_contexts.append(init_empty_weights())
   2227 with ContextManagers(init_contexts):
-> 2228     model = cls(config, *model_args, **model_kwargs)
   2230 if load_in_8bit:
   2231     from .utils.bitsandbytes import get_keys_to_not_convert, replace_8bit_linear

File /Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:884, in BertModel.__init__(self, config, add_pooling_layer)
    881 self.config = config
    883 self.embeddings = BertEmbeddings(config)
--> 884 self.encoder = BertEncoder(config)
    886 self.pooler = BertPooler(config) if add_pooling_layer else None
    888 # Initialize weights and apply final processing

File /Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:552, in BertEncoder.__init__(self, config)
    550 super().__init__()
    551 self.config = config
--> 552 self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
    553 self.gradient_checkpointing = False

File /Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:552, in <listcomp>(.0)
    550 super().__init__()
    551 self.config = config
--> 552 self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
    553 self.gradient_checkpointing = False

File /Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:474, in BertLayer.__init__(self, config)
    472         raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
    473     self.crossattention = BertAttention(config, position_embedding_type="absolute")
--> 474 self.intermediate = BertIntermediate(config)
    475 self.output = BertOutput(config)

File /Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:436, in BertIntermediate.__init__(self, config)
    434 def __init__(self, config):
    435     super().__init__()
--> 436     self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
    437     if isinstance(config.hidden_act, str):
    438         self.intermediate_act_fn = ACT2FN[config.hidden_act]

File /Library/Python/3.9/site-packages/torch/nn/modules/linear.py:104, in Linear.__init__(self, in_features, out_features, bias, device, dtype)
    102 else:
    103     self.register_parameter('bias', None)
--> 104 self.reset_parameters()

File /Library/Python/3.9/site-packages/torch/nn/modules/linear.py:110, in Linear.reset_parameters(self)
    106 def reset_parameters(self) -> None:
    107     # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
    108     # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see
    109     # https://github.com/pytorch/pytorch/issues/57109
--> 110     init.kaiming_uniform_(self.weight, a=math.sqrt(5))
    111     if self.bias is not None:
    112         fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)

File /Library/Python/3.9/site-packages/torch/nn/init.py:460, in kaiming_uniform_(tensor, a, mode, nonlinearity, generator)
    458 bound = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
    459 with torch.no_grad():
--> 460     return tensor.uniform_(-bound, bound, generator=generator)

KeyboardInterrupt: