import importlib
import platform
from typing import Sequence

import numpy as np
from tqdm import tqdm

from modules import shared
from modules.cache_utils import process_llamacpp_cache

imported_module = None
not_available_modules = set()


def llama_cpp_lib():
    global imported_module, not_available_modules

    # Determine the platform
    is_macos = platform.system() == 'Darwin'

    # Define the library names based on the platform
    if is_macos:
        lib_names = [
            (None, 'llama_cpp')
        ]
    else:
        lib_names = [
            ('cpu', 'llama_cpp'),
            ('tensorcores', 'llama_cpp_cuda_tensorcores'),
            (None, 'llama_cpp_cuda'),
            (None, 'llama_cpp')
        ]

    for arg, lib_name in lib_names:
        if lib_name in not_available_modules:
            continue

        should_import = (arg is None or getattr(shared.args, arg))

        if should_import:
            if imported_module and imported_module != lib_name:
                # Conflict detected, raise an exception
                raise Exception(f"Cannot import `{lib_name}` because `{imported_module}` is already imported. Switching to a different version of llama-cpp-python currently requires a server restart.")

            try:
                return_lib = importlib.import_module(lib_name)
                imported_module = lib_name
                monkey_patch_llama_cpp_python(return_lib)
                return return_lib
            except ImportError:
                not_available_modules.add(lib_name)
                continue

    return None
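
# Note on llama_cpp_lib() above: candidates are tried in list order, so on a
# non-macOS machine --cpu selects the plain llama_cpp wheel, --tensorcores
# selects llama_cpp_cuda_tensorcores, and otherwise llama_cpp_cuda is tried
# before falling back to llama_cpp. Wheels that fail to import are remembered
# in not_available_modules and skipped on subsequent calls.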


def eval_with_progress(self, tokens: Sequence[int]):
    """
    A copy of
    https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama.py
    with tqdm to show prompt processing progress.
    """
    self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)

    if len(tokens) > self.n_batch:
        progress_bar = tqdm(range(0, len(tokens), self.n_batch), desc="Prompt evaluation", leave=False)
    else:
        progress_bar = range(0, len(tokens), self.n_batch)

    for i in progress_bar:
        batch = tokens[i : min(len(tokens), i + self.n_batch)]
        n_past = self.n_tokens
        n_tokens = len(batch)
        self._batch.set_batch(
            batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
        )
        self._ctx.decode(self._batch)

        # Save tokens
        self.input_ids[n_past : n_past + n_tokens] = batch

        # Save logits
        if self.context_params.logits_all:
            rows = n_tokens
            cols = self._n_vocab
            logits = np.ctypeslib.as_array(
                self._ctx.get_logits(), shape=(rows * cols,)
            )
            self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits
            self.last_updated_index = n_past + n_tokens - 1
        else:
            rows = 1
            cols = self._n_vocab
            logits = np.ctypeslib.as_array(
                self._ctx.get_logits(), shape=(rows * cols,)
            )
            last_token_index = min(n_past + n_tokens - 1, self.scores.shape[0] - 1)
            self.scores[last_token_index, :] = logits.reshape(-1)
            self.last_updated_index = last_token_index

        # Update n_tokens
        self.n_tokens += n_tokens
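
# Note on eval_with_progress() above: the tqdm bar only appears when the prompt
# spans more than one batch (len(tokens) > n_batch). When logits_all is enabled,
# the logits of every token in each batch are written into self.scores;
# otherwise only the row for the last evaluated token is stored.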


def monkey_patch_llama_cpp_python(lib):
    if getattr(lib.Llama, '_is_patched', False):
        # If the patch is already applied, do nothing
        return

    def my_generate(self, *args, **kwargs):
        if shared.args.streaming_llm:
            new_sequence = args[0]
            past_sequence = self._input_ids

            # Do the cache trimming for StreamingLLM
            process_llamacpp_cache(self, new_sequence, past_sequence)

        for output in self.original_generate(*args, **kwargs):
            yield output

    lib.Llama.eval = eval_with_progress
    lib.Llama.original_generate = lib.Llama.generate
    lib.Llama.generate = my_generate

    # Set the flag to indicate that the patch has been applied
    lib.Llama._is_patched = True
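

# Illustrative usage (not part of the original module): a minimal sketch of how
# a loader might consume llama_cpp_lib(). It assumes shared.args has already
# been populated by the web UI's argument parser; the model path below is
# hypothetical and exists only for this example.
if __name__ == '__main__':
    lib = llama_cpp_lib()
    if lib is None:
        raise SystemExit("No llama-cpp-python backend could be imported.")

    # The returned module is already monkey-patched: Llama.eval shows a tqdm
    # progress bar during prompt evaluation, and Llama.generate trims the KV
    # cache for StreamingLLM when --streaming_llm is enabled.
    model = lib.Llama(model_path='models/example.gguf')  # hypothetical path
    prompt_tokens = model.tokenize(b"Hello, world")
    for token in model.generate(prompt_tokens):
        print(model.detokenize([token]).decode('utf-8', errors='ignore'), end='', flush=True)
        break  # stop after the first sampled token in this sketch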