llm_cp2 / src /lmms-eval /lmms_eval /loggers /utils.py

Upload folder using huggingface_hub

b0c0df0 verified about 1 month ago

4.98 kB

	import os
	import pickle
	import re
	import subprocess
	from pathlib import Path
	from typing import Any, Dict, Optional, Tuple, Union

	import numpy as np
	from loguru import logger
	from torch.utils.collect_env import get_pretty_env_info
	from transformers import __version__ as trans_version


	def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
	    """Strip a trailing ``,none`` suffix from a string.

	    Args:
	        input_string (str): Text that may end with ``,none``.

	    Returns:
	        Tuple[str, bool]: The text with the trailing ``,none`` removed, and a
	            flag that is True only when a suffix was actually removed.
	    """
	    # subn() reports how many substitutions were made alongside the rewritten
	    # text, so no before/after comparison is needed.
	    stripped, hits = re.subn(r",none$", "", input_string)
	    return stripped, hits > 0


	def is_serializable(o: Any) -> bool:
	    """Report whether ``o`` can be pickled.

	    Args:
	        o (Any): The object to probe.

	    Returns:
	        bool: True if ``pickle.dumps(o)`` succeeds, False if it raises.
	    """
	    try:
	        pickle.dumps(o)
	    except Exception:
	        # Deliberately broad: pickling runs arbitrary user hooks
	        # (__reduce__, __getstate__, ...) which may raise anything, e.g.
	        # ValueError for ctypes pointers or RecursionError for very deep
	        # structures. A yes/no probe must answer False rather than propagate.
	        return False
	    return True


	def _handle_non_serializable(o: Any) -> Union[int, str, list]:
	"""Handle non-serializable objects by converting them to serializable types.

	Args:
	o (Any): The object to be handled.

	Returns:
	Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
	it will be converted to int. If the object is of type set, it will be converted
	to a list. Otherwise, it will be converted to str.
	"""
	if isinstance(o, np.int64) or isinstance(o, np.int32):
	return int(o)
	elif isinstance(o, set):
	return list(o)
	else:
	return str(o)


	def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
	    """Read the checked-out commit hash directly from a repo's ``.git`` data.

	    Only files are read; no ``git`` executable is invoked.

	    Args:
	        repo_path (Union[Path, str]): Directory expected to contain ``.git``.

	    Returns:
	        Optional[str]: The commit hash HEAD resolves to, or None when there is
	            no repository at ``repo_path``, HEAD cannot be resolved, or any
	            read fails (the failure is logged at debug level).
	    """
	    try:
	        git_folder = Path(repo_path, ".git")
	        if git_folder.is_file():
	            # Submodules and linked worktrees keep a "gitdir: <path>" pointer
	            # file instead of a directory. Keep everything after the prefix so
	            # that paths containing spaces are not truncated.
	            pointer = git_folder.read_text(encoding="utf-8").split("\n")[0]
	            if pointer.startswith("gitdir:"):
	                pointer = pointer[len("gitdir:"):]
	            git_folder = Path(git_folder.parent, pointer.strip())

	        head_file = Path(git_folder, "HEAD")
	        if not head_file.exists():
	            return None

	        head = head_file.read_text(encoding="utf-8").split("\n")[0].strip()
	        if not head.startswith("ref:"):
	            # Detached HEAD: the file holds the commit hash itself.
	            return head or None
	        ref_name = head[len("ref:"):].strip()

	        # A linked worktree keeps HEAD locally but shares refs with the main
	        # repository, whose location (relative to this git dir) is stored in
	        # the "commondir" file.
	        search_dirs = [git_folder]
	        commondir_file = Path(git_folder, "commondir")
	        if commondir_file.is_file():
	            common = commondir_file.read_text(encoding="utf-8").strip()
	            search_dirs.append(Path(git_folder, common))

	        # Loose ref files take precedence over packed-refs entries.
	        for git_dir in search_dirs:
	            ref_file = Path(git_dir, ref_name)
	            if ref_file.is_file():
	                return ref_file.read_text(encoding="utf-8").replace("\n", "")
	        for git_dir in search_dirs:
	            packed_hash = _read_packed_ref(git_dir, ref_name)
	            if packed_hash is not None:
	                return packed_hash

	        # The ref exists nowhere (e.g. a branch with no commits yet).
	        return None
	    except Exception as err:
	        logger.debug(f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}")
	        return None


	def _read_packed_ref(git_dir: Path, ref_name: str) -> Optional[str]:
	    """Return the hash recorded for ``ref_name`` in ``git_dir/packed-refs``, or None."""
	    packed_file = Path(git_dir, "packed-refs")
	    if not packed_file.is_file():
	        return None
	    for line in packed_file.read_text(encoding="utf-8").splitlines():
	        # Skip the "# pack-refs ..." header and "^<hash>" peeled-tag lines.
	        if not line or line[0] in "#^":
	            continue
	        commit, _, name = line.partition(" ")
	        if name.strip() == ref_name:
	            return commit
	    return None


	def get_git_commit_hash() -> Optional[str]:
	    """
	    Gets the git commit hash of your current repo (if it exists).
	    Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42

	    Returns:
	        Optional[str]: The output of ``git describe --always`` (an abbreviated
	            hash, or a tag-based description when an annotated tag is
	            reachable). If git fails or cannot be launched, the hash is read
	            from ``.git`` in the current working directory instead, and None
	            is returned when that also fails.
	    """
	    try:
	        raw_output = subprocess.check_output(
	            ["git", "describe", "--always"],
	            # Keep git's "fatal: not a git repository" message out of the
	            # caller's stderr; the failure is handled below.
	            stderr=subprocess.DEVNULL,
	        )
	    except (subprocess.CalledProcessError, OSError):
	        # OSError covers git not being installed (FileNotFoundError) as well
	        # as other launch failures such as PermissionError.
	        return get_commit_from_path(os.getcwd())  # git hash of repo if exists
	    return raw_output.strip().decode()


	def add_env_info(storage: Dict[str, Any]):
	    """Record environment details in ``storage`` (mutated in place).

	    Adds three keys: ``pretty_env_info`` (the report from
	    ``get_pretty_env_info()``, or the error text if that call raises),
	    ``transformers_version`` and ``upper_git_hash``.

	    Args:
	        storage (Dict[str, Any]): Mapping that receives the new entries.
	    """
	    try:
	        env_report = get_pretty_env_info()
	    except Exception as err:
	        # Best effort: store the error text in place of the report.
	        env_report = str(err)

	    # Commit of the parent directory, in case this repo is a submodule.
	    parent_commit = get_commit_from_path(Path(os.getcwd(), ".."))

	    storage.update(
	        {
	            "pretty_env_info": env_report,
	            "transformers_version": trans_version,
	            "upper_git_hash": parent_commit,
	        }
	    )


	def add_tokenizer_info(storage: Dict[str, Any], lm):
	    """Copy tokenizer metadata from ``lm`` into ``storage`` (mutated in place).

	    Stores the pad/eos/bos tokens, each paired with its stringified id, plus
	    ``eot_token_id`` and ``max_length`` (None when ``lm`` lacks them). Nothing
	    is stored when ``lm`` has no truthy ``tokenizer`` attribute or when
	    reading the tokenizer raises; both cases are only logged at debug level.

	    Args:
	        storage (Dict[str, Any]): Mapping that receives the tokenizer entries.
	        lm: Model wrapper that may expose a ``tokenizer`` attribute.
	    """
	    if not getattr(lm, "tokenizer", False):
	        # seems gguf and textsynth do not have tokenizer
	        logger.debug("LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results.")
	        return

	    try:
	        tok = lm.tokenizer
	        pad_entry = [tok.pad_token, str(tok.pad_token_id)]
	        eos_entry = [tok.eos_token, str(tok.eos_token_id)]
	        bos_entry = [tok.bos_token, str(tok.bos_token_id)]
	        storage.update(
	            {
	                "tokenizer_pad_token": pad_entry,
	                "tokenizer_eos_token": eos_entry,
	                "tokenizer_bos_token": bos_entry,
	                "eot_token_id": getattr(lm, "eot_token_id", None),
	                "max_length": getattr(lm, "max_length", None),
	            }
	        )
	    except Exception as err:
	        logger.debug(f"Logging detailed tokenizer info failed with {err}, skipping...")