diff --git a/knowledge base/grep.md b/knowledge base/grep.md index 4bf8609..544373e 100644 --- a/knowledge base/grep.md +++ b/knowledge base/grep.md @@ -12,6 +12,11 @@ grep -R --exclude-dir excluded/dir 'pattern' path/to/search/recursively # gnu # show line numbers grep -n 'pattern' path/to/search + +# parallel execution +# mind files with spaces in their names +find . -type f | parallel -j 100% grep 'pattern' +find . -type f -print0 | xargs -0 -n 1 -P $(nproc) grep 'pattern' ``` ## Grep variants @@ -38,6 +43,18 @@ You can to this using [`pdftotext`](pdfgrep.md) as shown in this example ([sourc find /path -name '*.pdf' -exec sh -c 'pdftotext "{}" - | grep --with-filename --label="{}" --color "your pattern"' ';' ``` +## Gotchas + +- Standard editions of `grep` run in a single thread; use another executor like + `parallel` or `xargs` to parallelize grepping multiple files: + + ```shell + find . -type f | parallel -j 100% grep 'pattern' + find . -type f -print0 | xargs -0 -n 1 -P $(nproc) grep 'pattern' + ``` + + > Mind files with spaces in their names. + ## Further readings - Answer on [StackOverflow] about [how to search contents of multiple pdf files] diff --git a/knowledge base/parallel.md b/knowledge base/parallel.md index c1504b4..9d8018d 100644 --- a/knowledge base/parallel.md +++ b/knowledge base/parallel.md @@ -4,24 +4,28 @@ ```shell # group output (--group) -# use all cpu threads (--jobs 0 or --jobs 100%) +# run one job per cpu thread (--jobs 100%) # use newline as delimiter for the arguments in input # simulate and print to output the command that would have been executed find . -type f \ | parallel --group --jobs 0 --delimiter '\n' --dry-run clamscan {} # get the exit status of all subjobs (--joblog $outfile) +# run as many jobs at once as possible (--jobs 0), hammering the cpu find .
-type d -name .git -exec dirname "{}" + \ -| parallel --group --jobs 100% --tagstring {/} --joblog - 'git -C {} pull --recurse-submodules' +| parallel --group --jobs 0 --tagstring {/} --joblog - \ + 'git -C {} pull --recurse-submodules' # inject istio to all deployments in a namespace in (GNU) parallel -# aka _I have a death wish_ -kubectl -n ${NAMESPACE:-default} get deployments -o jsonpath='{.items[*].metadata.name}' \ -| parallel --group --jobs 0 'kubectl -n ${NAMESPACE:-default} apply -f <(istioctl kube-inject -f <(kubectl -n ${NAMESPACE:-default} get deployments,services {} -o json))' +kubectl get deployments -o jsonpath='{.items[*].metadata.name}' \ +| parallel --group --jobs 0 'kubectl -n ${NAMESPACE:-default} apply -f \ + <(istioctl kube-inject -f \ + <(kubectl get deployments,services {} -o json))' # given a list of namespaces get pods and their nodes parallel --group --jobs 100% --tag \ - "kubectl --context $KUBE_CONTEXT --namespace {} get pods --output json | jq -r '.items[] | .metadata.name + \"\t\" + .spec.nodeName' -" \ + "kubectl --context $KUBE_CONTEXT --namespace {} get pods --output json \ + | jq -r '.items[] | .metadata.name + \"\t\" + .spec.nodeName' -" \ ::: "${NAMESPACES}" \ | column -t ``` @@ -35,4 +39,4 @@ parallel --group --jobs 100% --tag \ [man page]: https://www.gnu.org/software/parallel/man.html [tutorial]: https://www.gnu.org/software/parallel/parallel_tutorial.html -[Obtaining exit status values from GNU parallel]: https://stackoverflow.com/questions/6310181/obtaining-exit-status-values-from-gnu-parallel#6789085 +[obtaining exit status values from gnu parallel]: https://stackoverflow.com/questions/6310181/obtaining-exit-status-values-from-gnu-parallel#6789085