seb/scripts at 07aa65cbb7b9212d5c5869a29a2d259c185d46f0 ・ Gitprep

scripts / renomme /

Newer ■ ■ ■ ■ ■ ■ ■ ■ ■ ■ Older

194 lines | 6.898kb

ajoute scriptt renomme 4436266 Sébastien MARQUE authored on 2021-08-15	1	#!/bin/bash
	2
	3	# script pour renommer des personnages ou lieux dans un document Manuskript
	4	# ./renomme <document manuskript> <ancien nom> <nouveau nom> [ins=poids] [del=poids] [rep=poids]
	5	#
	6	# Le document d'origine est sauvegardé (voir variable $backup)
	7	#
	8	# Si <nouveau nom> est "check", ou "prox" ou "leven", une étude de proximité est alors effectuée
	9	# sur l'algorithme Levenshtein. Les paramètres de poids sont des nombres entiers, et permettent
	10	# de pondérer l'ajout (ins=), la suppression (del=) et le remplacement (rep=) de caractère.
	11	# par défaut chacun des trois paramètres est égal à 1.
	12	# Il n'y a pas d'ordre obligatoire pour le paramétrage, et si un paramétrage est effectué plusieurs fois
	13	# c'est le plus à droite qui prend la priorité. Il n'y a pas de backup effectué pour l'opération de vérification de la proximité
	14	# crédit algo: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance
	15
	16	set -e
	17
	18	manuscrit=${1:?}
	19	ancien=${2:?}
	20	nouveau=${3:?}
	21
	22	if test "${manuscrit:0:1}" != '/'; then
	23	manuscrit="$PWD/$manuscrit"
	24	fi
	25	test -r "$manuscrit" \|\| exit 1
	26	test $(file --brief --mime-type --dereference "$manuscrit") == 'application/zip' \|\| exit 2
	27
	28	backup="${manuscrit/%.msk} (avant renommage de «${ancien}» en «${nouveau}»).msk"
	29
	30	function trap_exit () {
	31	rm -fr $temp
	32	cd - > /dev/null
	33	}
	34
	35	function determinant () {
	36	eval "local nom=\"\$$1\""
	37	if [[ ${nom:0:1} == @(A\|E\|H\|I\|O\|U) \
	38	\|\| ${nom:0:1} == @(Â\|Ê\|H\|Î\|Ô\|Û) \
	39	\|\| ${nom:0:1} == @(Ä\|Ë\|H\|Ï\|Ö\|Ü) \
	40	\|\| ${nom:0:1} == @(À\|È\|H\|Ì\|Ò\|Ù) \
	41	\|\| ${nom:0:1} == @(Á\|É\|H\|Í\|Ó\|Ú) \
	42	]]; then
	43	eval "determinant_$1=\"d'\""
	44	eval "determinant_$1_formatted=\"d-\""
	45	else
	46	eval "determinant_$1=\"de \""
	47	eval "determinant_$1_formatted=\"de-\""
	48	fi
	49	}
	50
	51	function format () {
	52	eval "$1_formatted=\$(tr --complement --squeeze-repeats 'A-Za-z-_\n' - <<< \"\${$1// /_}\")"
	53	}
	54
	55	function renomme_fichiers () {
	56	for char in $(find characters -type f -regex "characters/[0-9]+-$1.txt"); do
	57	local new_char=$(sed "s/$1/$2/" <<< $char)
	58	echo "character: $char -> $new_char"
	59	mv $char $new_char
	60	break
	61	done
	62
	63	for chapter in $(find outline -type d -regex "outline/.$1."); do
	64	local new_chapter=$(sed "s/$1/$2/g" <<< $chapter)
	65	echo "chapter: $chapter -> $new_chapter"
	66	mv $chapter $new_chapter
	67	done
	68
	69	for part in $(find outline -type f -regex "outline/[^/]$1[^/].md"); do
	70	local new_part=$(sed "s/$1/$2/g" <<< $part)
	71	echo "part: $part -> $new_part"
	72	mv $part $new_part
	73	done
	74	}
	75
	76	trap trap_exit EXIT
	77
	78	temp=$(mktemp --dry-run /dev/shm/XXXXXXXXX)
	79	mkdir $temp
	80	cd $temp
	81
	82	if [[ $nouveau = @(check\|prox\|leven) ]]; then
	83	unzip -qq "$manuscrit"
	84	for param in $(seq 4 $#); do
	85	for dst in ins del rep; do
	86	eval "if test -n '\$$param' && [[ \"\$$param\" =~ $dst=[0-9]+ ]]; then cost_$dst=\${$param#*=}; fi"
	87	done
	88	done
	89	echo paramètres d\'approximation
	90	echo "caractère manquant (del=): ${cost_del:-1}"
	91	echo "caractère inséré (ins=): ${cost_ins:-1}"
	92	echo "caractère remplacé (rep=): ${cost_rep:-1}"
	93	for f in $(find . -type f); do
	94	let wc+=$(wc -w < $f)
	95	done
	96	awk -v ancien=$ancien -v wc=$wc -v cost_ins=${cost_ins:-1} -v cost_del=${cost_del:-1} -v cost_rep=${cost_rep:-1} '
	97	BEGIN {
	98	RS="[[:punct:]]"
	99	progress_mod = 10
	100	actual_progress = 0
	101	pct_progress = 0
	102	progress = 0
	103	found_words = 0
	104
	105	str1_len = length(ancien)
	106	for (i=1; i<=str1_len; i++)
	107	str1_substr[i]=substr(ancien, i, 1)
	108	}
	109	function levenshtein(str2) {
	110	str2_len = length(str2)
	111	if(str2_len == 0) return str1_len * cost_del
	112	for(j = 1; j <= str2_len; j++)
	113	str2_substr[j]=substr(str2, j, 1)
	114	matrix[0, 0] = 0
	115	for(i = 1; i <= str1_len; i++) {
	116	matrix[i, 0] = i * cost_del
	117	for(j = 1; j <= str2_len; j++) {
	118	matrix[0, j] = j * cost_ins
	119	x = matrix[i - 1, j] + cost_del
	120	y = matrix[i, j - 1] + cost_ins
	121	z = matrix[i - 1, j - 1] + (str1_substr[i] == str2_substr[j] ? 0 : cost_rep)
	122	x = x < y ? x : y
	123	matrix[i, j] = x < z ? x : z
	124	}
	125	}
	126	return matrix[str1_len, str2_len]
	127	}
	128	{
	129	for (word=1; word<=NF; word++) {
	130	progress++
	131	lvstn = levenshtein(gensub("[[:punct:]]","","g",$word))
	132	if (lvstn <= 3 && lvstn > 0) {
	133	approx_possibles[$word]++
	134	found_words++
	135	}
	136	pct_progress=int(progress / wc * 100)
	137	if (actual_progress < pct_progress && pct_progress % progress_mod == 0) {
	138	actual_progress = pct_progress
	139	printf("%i%\n", actual_progress)
	140	}
	141	}
	142	}
	143	END {
	144	if (found_words > 0) {
	145	printf("mot%s proche%s de «%s» [occurences]\n", found_words > 0 ? "s" : "", found_words > 0 ? "s" : "", ancien)
	146	for (i in approx_possibles)
	147	printf("- %s [%i]\n", i, approx_possibles[i])
	148	}
	149	else {
	150	print "aucun mot proche et différent de «" ancien "» trouvé"
	151	}
	152	}
	153	' $(find . -type f)
	154	exit
	155	fi
	156
	157	mv --backup=numbered "$manuscrit" "$backup"
	158
	159	unzip -qq "$backup"
	160
	161	for version in ancien nouveau; do
	162	format $version
	163	determinant $version
	164	done
	165
	166	declare -A remplacement
	167	remplacement=(
	168	[$ancien]="$nouveau"
	169	[$ancien_formatted]="$nouveau_formatted"
	170	[$determinant_ancien$ancien]="$determinant_nouveau$nouveau"
	171	[$determinant_ancien_formatted$ancien_formatted]="$determinant_nouveau_formatted$nouveau_formatted"
	172	)
	173
	174	renomme_fichiers "$ancien_formatted" "$nouveau_formatted"
	175	renomme_fichiers "$determinant_ancien_formatted$ancien_formatted" "$determinant_nouveau_formatted$nouveau_formatted"
	176
	177	egrep --word-regexp --only-matching --recursive --regexp="($determinant_ancien\|$determinant_ancien_formatted)\?($ancien\|$ancien_formatted)" . \
	178	\| awk -v name="$1" -F ':' '
	179	{
	180	if ($NF > 0) {
	181	file[$1]++
	182	nb++
	183	}
	184	}
	185	END {
	186	printf("remplacement de %i occurences pour %s dans %i fichiers\n", nb, name, asort(file))
	187	}'
	188
	189	for regexp in "${!remplacement[@]}"; do
	190	egrep --word-regexp --files-with-matches --recursive --regexp="$regexp" . \
	191	\| xargs --no-run-if-empty sed --regexp-extended --in-place "s/(\W\|^)$regexp(\W\|$)/\1${remplacement[$regexp]}\2/g"
	192	done
	193
	194	zip --recurse-paths --no-dir-entries -qq "${manuscrit}" *