csuhan's picture
Upload folder using huggingface_hub
b0c0df0 verified
import os
import pickle
import re
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
from loguru import logger
from torch.utils.collect_env import get_pretty_env_info
from transformers import __version__ as trans_version
def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
"""Remove the ',none' substring from the input_string if it exists at the end.
Args:
input_string (str): The input string from which to remove the ',none' substring.
Returns:
Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
and a boolean indicating whether the modification was made (True) or not (False).
"""
# Define the pattern to match ',none' at the end of the string
pattern = re.compile(r",none$")
# Use sub() to replace ',none' with an empty string
result = re.sub(pattern, "", input_string)
# check if the input_string changed
removed = result != input_string
return result, removed
def is_serializable(o: Any) -> bool:
try:
pickle.dumps(o)
return True
except (pickle.PickleError, TypeError, AttributeError):
return False
def _handle_non_serializable(o: Any) -> Union[int, str, list]:
"""Handle non-serializable objects by converting them to serializable types.
Args:
o (Any): The object to be handled.
Returns:
Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
it will be converted to int. If the object is of type set, it will be converted
to a list. Otherwise, it will be converted to str.
"""
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
try:
git_folder = Path(repo_path, ".git")
if git_folder.is_file():
git_folder = Path(
git_folder.parent,
git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
)
if Path(git_folder, "HEAD").exists():
head_name = Path(git_folder, "HEAD").read_text(encoding="utf-8").split("\n")[0].split(" ")[-1]
head_ref = Path(git_folder, head_name)
git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
else:
git_hash = None
except Exception as err:
logger.debug(f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}")
return None
return git_hash
def get_git_commit_hash():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists
return git_hash
def add_env_info(storage: Dict[str, Any]):
try:
pretty_env_info = get_pretty_env_info()
except Exception as err:
pretty_env_info = str(err)
transformers_version = trans_version
upper_dir_commit = get_commit_from_path(Path(os.getcwd(), "..")) # git hash of upper repo if exists
added_info = {
"pretty_env_info": pretty_env_info,
"transformers_version": transformers_version,
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
def add_tokenizer_info(storage: Dict[str, Any], lm):
if getattr(lm, "tokenizer", False):
try:
tokenizer_info = {
"tokenizer_pad_token": [
lm.tokenizer.pad_token,
str(lm.tokenizer.pad_token_id),
],
"tokenizer_eos_token": [
lm.tokenizer.eos_token,
str(lm.tokenizer.eos_token_id),
],
"tokenizer_bos_token": [
lm.tokenizer.bos_token,
str(lm.tokenizer.bos_token_id),
],
"eot_token_id": getattr(lm, "eot_token_id", None),
"max_length": getattr(lm, "max_length", None),
}
storage.update(tokenizer_info)
except Exception as err:
logger.debug(f"Logging detailed tokenizer info failed with {err}, skipping...")
# seems gguf and textsynth do not have tokenizer
else:
logger.debug("LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results.")