seb/scripts at e69bb12c565e42766d9347e9db774842981e5e47 ・ Gitprep

scripts / renomme /

Newer ■ ■ ■ ■ ■ ■ ■ ■ ■ ■ Older

193 lines | 6.888kb

ajoute scriptt renomme 4436266 Sébastien MARQUE authored on 2021-08-15	1	#!/bin/bash
	2
	3	# script pour renommer des personnages ou lieux dans un document Manuskript
	4	# ./renomme <document manuskript> <ancien nom> <nouveau nom> [ins=poids] [del=poids] [rep=poids]
	5	#
	6	# Le document d'origine est sauvegardé (voir variable $backup)
	7	#
	8	# Si <nouveau nom> est "check", ou "prox" ou "leven", une étude de proximité est alors effectuée
	9	# selon l'algorithme de Levenshtein. Les paramètres de poids sont des nombres entiers, et permettent
	10	# de pondérer l'ajout (ins=), la suppression (del=) et le remplacement (rep=) de caractère.
	11	# par défaut chacun des trois paramètres est égal à 1.
	12	# Il n'y a pas d'ordre obligatoire pour le paramétrage, et si un paramétrage est effectué plusieurs fois
	13	# c'est le plus à droite qui prend la priorité. Il n'y a pas de backup effectué pour l'opération de vérification de la proximité
	14	# crédit algo: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance
	15
	# Abort as soon as a command fails without being handled explicitly.
	set -e

	# Mandatory positional arguments: ${n:?} makes the shell abort with an
	# error when the argument is missing or empty.
	manuscrit=${1:?}
	ancien=${2:?}
	nouveau=${3:?}

	# Turn a relative manuscript path into an absolute one.
	if [[ ${manuscrit:0:1} != '/' ]]; then
	  manuscrit="$PWD/$manuscrit"
	fi

	# Exit status 1: the manuscript cannot be read.
	[[ -r $manuscrit ]] || exit 1

	# Exit status 2: file(1) does not report the manuscript as a zip archive.
	[[ $(file --brief --mime-type --dereference "$manuscrit") == 'application/zip' ]] || exit 2

	# Path under which the original document is kept; a trailing ".msk" is
	# removed from the manuscript path before the descriptive suffix is added.
	backup="${manuscrit%.msk} (avant renommage de «${ancien}» en «${nouveau}»).msk"
	29
	# Exit handler: deletes the temporary working directory named by the global
	# $temp, then switches back to the previous working directory.
	# Globals: temp (read)
	trap_exit() {
	  # Nothing to delete when $temp is unset or empty.
	  if [[ -n ${temp:-} ]]; then
	    # Quoted so that a path containing blanks or glob characters is removed
	    # as one single argument.
	    rm -fr -- "$temp"
	  fi
	  # "cd -" fails when there is no usable previous directory; that must not
	  # turn the cleanup itself into an error.
	  cd - > /dev/null 2>&1 || true
	}
	34
	# Chooses the French preposition that goes in front of a name.
	# Arguments: $1 - NAME of the variable holding the name (not its value)
	# Outputs:   sets determinant_<$1> to "d'" and determinant_<$1>_formatted to
	#            "d-" when the name starts with H or an uppercase vowel
	#            (accented or not); sets them to "de " and "de-" otherwise.
	# Returns:   1 when $1 is not a valid variable name.
	determinant() {
	  [[ $1 =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || return 1
	  local -r _det_var=$1
	  local -r _det_nom=${!_det_var}
	  local _det_texte="de " _det_formate="de-"

	  # Only uppercase initials are recognised; a lowercase initial gets "de ".
	  case ${_det_nom:0:1} in
	    A|E|H|I|O|U|Â|Ê|Î|Ô|Û|Ä|Ë|Ï|Ö|Ü|À|È|Ì|Ò|Ù|Á|É|Í|Ó|Ú)
	      _det_texte="d'"
	      _det_formate="d-"
	      ;;
	  esac

	  # printf -v assigns to a variable whose name is computed, without eval.
	  printf -v "determinant_${_det_var}" '%s' "$_det_texte"
	  printf -v "determinant_${_det_var}_formatted" '%s' "$_det_formate"
	}
	50
	# Builds the "formatted" spelling of a name.
	# Arguments: $1 - NAME of the variable holding the name (not its value)
	# Outputs:   sets <$1>_formatted to the value with every space turned into
	#            "_" and every other byte that is not an ASCII letter, "-", "_"
	#            or a newline turned into "-"; repeated "-" are squeezed.
	# Returns:   1 when $1 is not a valid variable name, or the status of tr
	#            when it fails.
	format() {
	  [[ $1 =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || return 1
	  local -r _fmt_var=$1
	  local _fmt_valeur=${!_fmt_var}

	  # "-" is listed last in the set so that it cannot be read as a range.
	  _fmt_valeur=$(tr -c -s 'A-Za-z_\n-' '-' <<< "${_fmt_valeur// /_}") || return
	  printf -v "${_fmt_var}_formatted" '%s' "$_fmt_valeur"
	}
	54
	# Renames, relative to the current directory, the files and directories
	# whose name contains the old (formatted) name.
	# Arguments: $1 - old name as it appears in file names
	#            $2 - new name to put in its place
	# Outputs:   one "character:", "chapter:" or "part:" line per rename.
	# Only the last path component is rewritten, never a parent directory.
	renomme_fichiers() {
	  local -r ancien_nom=$1 nouveau_nom=$2
	  local chemin base prefixe cible

	  # Character sheet "characters/<digits>-<name>.txt": first match only.
	  for chemin in characters/*-"$ancien_nom".txt; do
	    [[ -f $chemin ]] || continue
	    base=${chemin##*/}
	    prefixe=${base%"-$ancien_nom.txt"}
	    # What precedes "-<name>.txt" must be digits only.
	    [[ $prefixe =~ ^[0-9]+$ ]] || continue
	    cible="characters/${prefixe}-${nouveau_nom}.txt"
	    echo "character: $chemin -> $cible"
	    mv -- "$chemin" "$cible"
	    break
	  done

	  # Directories below outline/, at any depth. -depth lists children before
	  # their parents, so a rename never invalidates a path still to be handled.
	  while IFS= read -r -d '' chemin; do
	    base=${chemin##*/}
	    cible=${chemin%/*}/${base//"$ancien_nom"/"$nouveau_nom"}
	    if [[ $cible == "$chemin" ]]; then
	      continue
	    fi
	    echo "chapter: $chemin -> $cible"
	    mv -- "$chemin" "$cible"
	  done < <(find outline -mindepth 1 -depth -type d -name "*${ancien_nom}*" -print0)

	  # Markdown files placed directly in outline/.
	  # NOTE(review): .md files inside sub-directories are not renamed here,
	  # as before - confirm that this is intended.
	  for chemin in outline/*"$ancien_nom"*.md; do
	    [[ -f $chemin ]] || continue
	    base=${chemin##*/}
	    cible=${chemin%/*}/${base//"$ancien_nom"/"$nouveau_nom"}
	    if [[ $cible == "$chemin" ]]; then
	      continue
	    fi
	    echo "part: $chemin -> $cible"
	    mv -- "$chemin" "$cible"
	  done
	}
	75
	# Run trap_exit however the script ends.
	trap trap_exit EXIT

	# Scratch directory in which the manuscript is unpacked. /dev/shm is used
	# when it is a writable directory, ${TMPDIR:-/tmp} otherwise.
	if [[ -d /dev/shm && -w /dev/shm ]]; then
	  temp=$(mktemp -d /dev/shm/XXXXXXXXX) || exit
	else
	  temp=$(mktemp -d "${TMPDIR:-/tmp}/XXXXXXXXX") || exit
	fi
	cd "$temp" || exit
	80
	# Stores the edit weights found among the arguments.
	# Arguments: any number of words; those of the exact form "ins=N", "del=N"
	#            or "rep=N" (N made of digits) are used, the others ignored.
	# Outputs:   sets the globals cost_ins, cost_del and cost_rep; when a
	#            weight is given several times, the last one wins.
	parse_poids() {
	  local -r motif='^(ins|del|rep)=([0-9]+)$'
	  local argument
	  for argument in "$@"; do
	    if [[ $argument =~ $motif ]]; then
	      printf -v "cost_${BASH_REMATCH[1]}" '%s' "${BASH_REMATCH[2]}"
	    fi
	  done
	}

	# Lists the words whose weighted Levenshtein distance to a reference word is
	# between 1 and 3.
	# Globals:   cost_ins, cost_del, cost_rep (read; each defaults to 1)
	# Arguments: $1 - reference word; $2... - files to scan
	# Outputs:   progress percentages, then the list of close words with their
	#            number of occurrences, on stdout.
	etude_proximite() {
	  local -r reference=$1
	  shift
	  # Without any file awk would wait on stdin; scan an empty input instead.
	  if (( $# == 0 )); then
	    set -- /dev/null
	  fi

	  # Total number of words, used only for the progress display.
	  local fichier nb_mots total_mots=0
	  for fichier in "$@"; do
	    nb_mots=$(wc -w < "$fichier")
	    total_mots=$(( total_mots + nb_mots ))
	  done

	  awk -v ancien="$reference" -v wc="$total_mots" \
	    -v cost_ins="${cost_ins:-1}" -v cost_del="${cost_del:-1}" -v cost_rep="${cost_rep:-1}" '
	    BEGIN {
	      # Every punctuation character ends a record.
	      RS = "[[:punct:]]"
	      progress_mod = 10
	      max_distance = 3
	      actual_progress = 0
	      progress = 0
	      found_words = 0

	      # The reference word is split into characters once and for all.
	      str1_len = length(ancien)
	      for (i = 1; i <= str1_len; i++)
	        str1_substr[i] = substr(ancien, i, 1)
	    }

	    # Weighted edit distance between the reference word and str2.
	    # The names after str2 are local variables.
	    function levenshtein(str2,    str2_len, str2_substr, i, j, x, y, z) {
	      str2_len = length(str2)
	      if (str2_len == 0)
	        return str1_len * cost_del
	      matrix[0, 0] = 0
	      for (j = 1; j <= str2_len; j++) {
	        str2_substr[j] = substr(str2, j, 1)
	        matrix[0, j] = j * cost_ins
	      }
	      for (i = 1; i <= str1_len; i++) {
	        matrix[i, 0] = i * cost_del
	        for (j = 1; j <= str2_len; j++) {
	          x = matrix[i - 1, j] + cost_del
	          y = matrix[i, j - 1] + cost_ins
	          z = matrix[i - 1, j - 1] + (str1_substr[i] == str2_substr[j] ? 0 : cost_rep)
	          if (y < x)
	            x = y
	          matrix[i, j] = x < z ? x : z
	        }
	      }
	      return matrix[str1_len, str2_len]
	    }

	    {
	      for (word = 1; word <= NF; word++) {
	        progress++
	        candidate = $word
	        gsub(/[[:punct:]]/, "", candidate)
	        lvstn = levenshtein(candidate)
	        # Distance 0 is the reference word itself: not reported.
	        if (lvstn > 0 && lvstn <= max_distance) {
	          approx_possibles[$word]++
	          found_words++
	        }
	        if (wc > 0) {
	          pct_progress = int(progress / wc * 100)
	          if (actual_progress < pct_progress && pct_progress % progress_mod == 0) {
	            actual_progress = pct_progress
	            printf("%i%%\n", actual_progress)
	          }
	        }
	      }
	    }

	    END {
	      if (found_words > 0) {
	        printf("mot%s proche%s de «%s» [occurences]\n", found_words > 1 ? "s" : "", found_words > 1 ? "s" : "", ancien)
	        for (i in approx_possibles)
	          printf("- %s [%i]\n", i, approx_possibles[i])
	      }
	      else {
	        print "aucun mot proche et différent de «" ancien "» trouvé"
	      }
	    }
	  ' "$@"
	}

	# Proximity study instead of a rename: requested by giving "check", "prox"
	# or "leven" as the new name. The manuscript itself is left untouched.
	case ${nouveau:-} in
	  check|prox|leven)
	    unzip -qq "$manuscrit"
	    # Everything after the third argument may carry a weight.
	    parse_poids "${@:4}"
	    echo "paramètres d'approximation"
	    echo "caractère manquant (del=): ${cost_del:-1}"
	    echo "caractère inséré (ins=): ${cost_ins:-1}"
	    echo "caractère remplacé (rep=): ${cost_rep:-1}"
	    # NUL-delimited listing keeps file names containing blanks intact.
	    fichiers=()
	    while IFS= read -r -d '' fichier; do
	      fichiers+=("$fichier")
	    done < <(find . -type f -print0)
	    etude_proximite "$ancien" "${fichiers[@]}"
	    exit
	    ;;
	esac
	155
	# Everything below moves and rewrites files: refuse to run with an empty
	# manuscript path, name or backup path.
	: "${manuscrit:?}" "${ancien:?}" "${nouveau:?}" "${backup:?}"

	# Prints $1 with every character that is special in an extended regular
	# expression preceded by a backslash.
	echappe_regex() {
	  printf '%s' "$1" | sed 's,[][(){}.^$*+?|\\],\\&,g'
	}

	# Same as echappe_regex, with "/" escaped too, for use between the "/"
	# delimiters of a sed "s" command.
	echappe_motif_sed() {
	  echappe_regex "$1" | sed 's,/,\\/,g'
	}

	# Prints $1 made safe for the replacement part of a sed "s/…/…/" command.
	echappe_remplacement() {
	  printf '%s' "$1" | sed 's,[&/\\],\\&,g'
	}

	# The original document becomes the backup; its content is unpacked into
	# the current directory.
	mv --backup=numbered "$manuscrit" "$backup"

	unzip -qq "$backup"

	# Computes ancien_formatted, nouveau_formatted and the determinant_* values.
	for version in ancien nouveau; do
	  format "$version"
	  determinant "$version"
	done

	# Old spellings and their replacements, in the order they must be applied:
	# the forms carrying a preposition come first, otherwise the bare name
	# inside them would already have been replaced.
	anciens=(
	  "$determinant_ancien$ancien"
	  "$determinant_ancien_formatted$ancien_formatted"
	  "$ancien"
	  "$ancien_formatted"
	)
	nouveaux=(
	  "$determinant_nouveau$nouveau"
	  "$determinant_nouveau_formatted$nouveau_formatted"
	  "$nouveau"
	  "$nouveau_formatted"
	)

	renomme_fichiers "$ancien_formatted" "$nouveau_formatted"
	renomme_fichiers "$determinant_ancien_formatted$ancien_formatted" "$determinant_nouveau_formatted$nouveau_formatted"

	# Summary line: number of matches and number of files containing them.
	# grep prints "file:match"; the last field is the matched text.
	motif_determinants="$(echappe_regex "$determinant_ancien")|$(echappe_regex "$determinant_ancien_formatted")"
	motif_noms="$(echappe_regex "$ancien")|$(echappe_regex "$ancien_formatted")"
	grep --extended-regexp --word-regexp --only-matching --recursive \
	  --regexp="(${motif_determinants})?(${motif_noms})" . \
	  | awk -v name="$1" -F ':' '
	    length($NF) > 0 {
	      if (!($1 in file))
	        nfiles++
	      file[$1]++
	      nb++
	    }
	    END {
	      printf("remplacement de %i occurences pour %s dans %i fichiers\n", nb, name, nfiles)
	    }'

	for (( i = 0; i < ${#anciens[@]}; i++ )); do
	  # An old spelling identical to one already handled is skipped: the first
	  # (more specific) replacement is the one that applies.
	  doublon=0
	  for (( j = 0; j < i; j++ )); do
	    if [[ ${anciens[j]} == "${anciens[i]}" ]]; then
	      doublon=1
	    fi
	  done
	  if (( doublon )); then
	    continue
	  fi

	  motif=$(echappe_motif_sed "${anciens[i]}")
	  substitut=$(echappe_remplacement "${nouveaux[i]}")
	  # grep selects the files (literal, whole-word search, NUL-delimited
	  # names); sed rewrites them in place, keeping the surrounding characters.
	  grep --fixed-strings --word-regexp --files-with-matches --recursive --null \
	    --regexp="${anciens[i]}" . \
	    | xargs --null --no-run-if-empty \
	      sed --regexp-extended --in-place "s/(\W|^)${motif}(\W|\$)/\1${substitut}\2/g"
	done

	# Rebuild the document at its original path from the modified tree.
	zip --recurse-paths --no-dir-entries -qq "${manuscrit}" *