seb/scripts at 487f54c3b69a1c9e8a2fd03f034ee934ac5f8442 ・ Gitprep

scripts / renomme /

Newer ■ ■ ■ ■ ■ ■ ■ ■ ■ ■ Older

206 lines | 7.406kb

ajoute scriptt renomme 4436266 Sébastien MARQUE authored on 2021-08-15	1	#!/bin/bash
	2
	3	# script pour renommer des personnages ou lieux dans un document Manuskript
	4	# ./renomme <document manuskript> <ancien nom> <nouveau nom> [ins=poids] [del=poids] [rep=poids]
	5	#
	6	# Le document d'origine est sauvegardé (voir variable $backup)
	7	#
	8	# Si <nouveau nom> est "check", ou "prox" ou "leven", une étude de proximité est alors effectuée
	9	# sur l'algorithme Levenshtein. Les paramètres de poids sont des nombres entiers, et permettent
	10	# de pondérer l'ajout (ins=), la suppression (del=) et le remplacement (rep=) de caractère.
	11	# par défaut chacun des trois paramètres est égal à 1.
	12	# Il n'y a pas d'ordre obligatoire pour le paramétrage, et si un paramétrage est effectué plusieurs fois
	13	# c'est le plus à gauche qui prend la priorité. Il n'y a pas de backup effectué pour l'opération de vérification de la proximité
	14	# crédit algo: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance
	15
	16	set -e
	17
	18	manuscrit=${1:?}
	19	ancien=${2:?}
	20	nouveau=${3:?}
	21
	22	if test "${manuscrit:0:1}" != '/'; then
	23	manuscrit="$PWD/$manuscrit"
	24	fi
	25	test -r "$manuscrit" \|\| exit 1
	26	test $(file --brief --mime-type --dereference "$manuscrit") == 'application/zip' \|\| exit 2
	27
	28	backup="${manuscrit/%.msk} (avant renommage de «${ancien}» en «${nouveau}»).msk"
	29
	30	function trap_exit () {
	31	rm -fr $temp
	32	cd - > /dev/null
	33	}
	34
	35	function determinant () {
	36	eval "local nom=\"\$$1\""
	37	if [[ ${nom:0:1} == @(A\|E\|H\|I\|O\|U) \
	38	\|\| ${nom:0:1} == @(Â\|Ê\|H\|Î\|Ô\|Û) \
	39	\|\| ${nom:0:1} == @(Ä\|Ë\|H\|Ï\|Ö\|Ü) \
	40	\|\| ${nom:0:1} == @(À\|È\|H\|Ì\|Ò\|Ù) \
	41	\|\| ${nom:0:1} == @(Á\|É\|H\|Í\|Ó\|Ú) \
	42	]]; then
	43	eval "determinant_$1=\"d'\""
	44	eval "determinant_$1_formatted=\"d-\""
	45	else
	46	eval "determinant_$1=\"de \""
	47	eval "determinant_$1_formatted=\"de-\""
	48	fi
	49	}
	50
	51	function format () {
	52	eval "$1_formatted=\$(tr --complement --squeeze-repeats 'A-Za-z-_\n' - <<< \"\${$1// /_}\")"
	53	}
	54
	55	function renomme_fichiers () {
	56	for char in $(find characters -type f -regex "characters/[0-9]+-$1.txt"); do
	57	local new_char=$(sed "s/$1/$2/" <<< $char)
	58	echo "character: $char -> $new_char"
	59	mv $char $new_char
	60	break
	61	done
	62
	63	for chapter in $(find outline -type d -regex "outline/.$1."); do
	64	local new_chapter=$(sed "s/$1/$2/g" <<< $chapter)
	65	echo "chapter: $chapter -> $new_chapter"
	66	mv $chapter $new_chapter
	67	done
	68
	69	for part in $(find outline -type f -regex "outline/[^/]$1[^/].md"); do
	70	local new_part=$(sed "s/$1/$2/g" <<< $part)
	71	echo "part: $part -> $new_part"
	72	mv $part $new_part
	73	done
	74	}
	75
	76	trap trap_exit EXIT
	77
simplification e69bb12 Sébastien MARQUE authored on 2021-08-19	78	temp=$(mktemp --directory /dev/shm/XXXXXXXXX)
ajoute scriptt renomme 4436266 Sébastien MARQUE authored on 2021-08-15	79	cd $temp
	80
	81	if [[ $nouveau = @(check\|prox\|leven) ]]; then
	82	unzip -qq "$manuscrit"
	83	for param in $(seq 4 $#); do
meilleure gestion des valeur... 7adb5d5 Sébastien MARQUE authored on 2021-08-19	84	for dst in ins del rep max; do
ajoute scriptt renomme 4436266 Sébastien MARQUE authored on 2021-08-15	85	eval "if test -n '\$$param' && [[ \"\$$param\" =~ $dst=[0-9]+ ]]; then cost_$dst=\${$param#*=}; fi"
	86	done
	87	done
meilleure gestion des valeur... 7adb5d5 Sébastien MARQUE authored on 2021-08-19	88	for dst in ins del rep; do
	89	eval "cost_$dst=\${cost_$dst:-1}"
	90	done
	91	cost_max=${cost_max:-$(($cost_ins + $cost_del + $cost_rep))}
	92	if test $cost_max -ge ${#ancien}; then
	93	cost_max=$(( ${#ancien} - 1 ))
	94	fi
	95
ajoute scriptt renomme 4436266 Sébastien MARQUE authored on 2021-08-15	96	echo paramètres d\'approximation
meilleure gestion des valeur... 7adb5d5 Sébastien MARQUE authored on 2021-08-19	97	echo "caractère manquant (del=): $cost_del"
	98	echo "caractère inséré (ins=): $cost_ins"
	99	echo "caractère remplacé (rep=): $cost_rep"
	100	echo "distance maximale (max=): ${cost_max:-$(($cost_ins + $cost_del + $cost_rep))}"
ajoute scriptt renomme 4436266 Sébastien MARQUE authored on 2021-08-15	101	for f in $(find . -type f); do
	102	let wc+=$(wc -w < $f)
	103	done
meilleure gestion des valeur... 7adb5d5 Sébastien MARQUE authored on 2021-08-19	104	awk -v ancien=$ancien -v wc=$wc -v cost_ins=$cost_ins -v cost_del=$cost_del -v cost_rep=$cost_rep -v cost_max=$cost_max '
ajoute scriptt renomme 4436266 Sébastien MARQUE authored on 2021-08-15	105	BEGIN {
	106	RS="[[:punct:]]"
	107	progress_mod = 10
	108	actual_progress = 0
	109	pct_progress = 0
	110	progress = 0
	111	found_words = 0
	112
fix: suppression d'une valeu... d3f6fa3 Sébastien MARQUE authored on 2021-08-19	113	cost_tot = cost_ins + cost_del + cost_rep
	114	cost_tot = cost_tot > cost_max ? cost_max : cost_tot
	115
ajoute scriptt renomme 4436266 Sébastien MARQUE authored on 2021-08-15	116	str1_len = length(ancien)
	117	for (i=1; i<=str1_len; i++)
	118	str1_substr[i]=substr(ancien, i, 1)
	119	}
	120	function levenshtein(str2) {
	121	str2_len = length(str2)
	122	if(str2_len == 0) return str1_len * cost_del
	123	for(j = 1; j <= str2_len; j++)
	124	str2_substr[j]=substr(str2, j, 1)
	125	matrix[0, 0] = 0
	126	for(i = 1; i <= str1_len; i++) {
	127	matrix[i, 0] = i * cost_del
	128	for(j = 1; j <= str2_len; j++) {
	129	matrix[0, j] = j * cost_ins
	130	x = matrix[i - 1, j] + cost_del
	131	y = matrix[i, j - 1] + cost_ins
	132	z = matrix[i - 1, j - 1] + (str1_substr[i] == str2_substr[j] ? 0 : cost_rep)
	133	x = x < y ? x : y
	134	matrix[i, j] = x < z ? x : z
	135	}
	136	}
	137	return matrix[str1_len, str2_len]
	138	}
	139	{
	140	for (word=1; word<=NF; word++) {
	141	progress++
	142	lvstn = levenshtein(gensub("[[:punct:]]","","g",$word))
fix: suppression d'une valeu... d3f6fa3 Sébastien MARQUE authored on 2021-08-19	143	if (lvstn <= cost_tot && lvstn > 0) {
	144	key = sprintf("%s (%d)", $word, lvstn)
	145	approx_possibles[key]++
ajoute scriptt renomme 4436266 Sébastien MARQUE authored on 2021-08-15	146	found_words++
	147	}
	148	pct_progress=int(progress / wc * 100)
	149	if (actual_progress < pct_progress && pct_progress % progress_mod == 0) {
	150	actual_progress = pct_progress
	151	printf("%i%\n", actual_progress)
	152	}
	153	}
	154	}
	155	END {
	156	if (found_words > 0) {
amélioration de la sortie 487f54c Sébastien MARQUE authored on 2021-08-19	157	printf("mot%s proche%s de «%s» (distance) [occurences]\n", found_words > 0 ? "s" : "", found_words > 0 ? "s" : "", ancien)
ajoute scriptt renomme 4436266 Sébastien MARQUE authored on 2021-08-15	158	for (i in approx_possibles)
	159	printf("- %s [%i]\n", i, approx_possibles[i])
	160	}
	161	else {
	162	print "aucun mot proche et différent de «" ancien "» trouvé"
	163	}
	164	}
	165	' $(find . -type f)
	166	exit
	167	fi
	168
	169	mv --backup=numbered "$manuscrit" "$backup"
	170
	171	unzip -qq "$backup"
	172
	173	for version in ancien nouveau; do
	174	format $version
	175	determinant $version
	176	done
	177
	178	declare -A remplacement
	179	remplacement=(
	180	[$ancien]="$nouveau"
	181	[$ancien_formatted]="$nouveau_formatted"
	182	[$determinant_ancien$ancien]="$determinant_nouveau$nouveau"
	183	[$determinant_ancien_formatted$ancien_formatted]="$determinant_nouveau_formatted$nouveau_formatted"
	184	)
	185
	186	renomme_fichiers "$ancien_formatted" "$nouveau_formatted"
	187	renomme_fichiers "$determinant_ancien_formatted$ancien_formatted" "$determinant_nouveau_formatted$nouveau_formatted"
	188
	189	egrep --word-regexp --only-matching --recursive --regexp="($determinant_ancien\|$determinant_ancien_formatted)\?($ancien\|$ancien_formatted)" . \
	190	\| awk -v name="$1" -F ':' '
	191	{
	192	if ($NF > 0) {
	193	file[$1]++
	194	nb++
	195	}
	196	}
	197	END {
	198	printf("remplacement de %i occurences pour %s dans %i fichiers\n", nb, name, asort(file))
	199	}'
	200
	201	for regexp in "${!remplacement[@]}"; do
	202	egrep --word-regexp --files-with-matches --recursive --regexp="$regexp" . \
	203	\| xargs --no-run-if-empty sed --regexp-extended --in-place "s/(\W\|^)$regexp(\W\|$)/\1${remplacement[$regexp]}\2/g"
	204	done
	205
	206	zip --recurse-paths --no-dir-entries -qq "${manuscrit}" *