diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..9c420526 --- /dev/null +++ b/404.html @@ -0,0 +1,984 @@ + + + + + + + + + + + + + + + + + + + + audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ +

404 - Not found

+ +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/alternatives/index.html b/alternatives/index.html new file mode 100644 index 00000000..e17c0f9d --- /dev/null +++ b/alternatives/index.html @@ -0,0 +1,1164 @@ + + + + + + + + + + + + + + + + + + + + + + + + Alternatives - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Alternatives

+

Audiomentations isn't the only python library that can do various types of audio data +augmentation/degradation! Here's an overview:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameGithub starsLicenseLast commitGPU support?
audio-degradation-toolboxGithub starsLicenseLast commitNo
audio_degraderGithub starsLicenseLast commitNo
audiomentationsGithub starsLicenseLast commitNo
audiotoolsGithub starsLicenseLast commitYes
auglibGithub starsLicenseLast commitNo
AugLyGithub starsLicenseLast commitNo
fast-audiomentationsGithub starsLicenseLast commitYes
kapreGithub starsLicenseLast commitYes, Keras/Tensorflow
mudaGithub starsLicenseLast commitNo
nlpaugGithub starsLicenseLast commitNo
pedalboardGithub starsLicenseLast commitNo
pydiogmentGithub starsLicenseLast commitNo
python-audio-effectsGithub starsLicenseLast commitNo
SpecAugmentGithub starsLicenseLast commitYes, Pytorch & Tensorflow
spec_augmentGithub starsLicenseLast commitYes, Pytorch
tealGithub starsLicenseLast commitYes, Keras/Tensorflow
torch-audiomentationsGithub starsLicenseLast commitYes, Pytorch
torchaudio-augmentationsGithub starsLicenseLast commitYes, Pytorch
WavAugmentGithub starsLicenseLast commitNo
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/assets/images/favicon.png b/assets/images/favicon.png new file mode 100644 index 00000000..1cf13b9f Binary files /dev/null and b/assets/images/favicon.png differ diff --git a/assets/javascripts/bundle.220ee61c.min.js b/assets/javascripts/bundle.220ee61c.min.js new file mode 100644 index 00000000..116072a1 --- /dev/null +++ b/assets/javascripts/bundle.220ee61c.min.js @@ -0,0 +1,29 @@ +"use strict";(()=>{var Ci=Object.create;var gr=Object.defineProperty;var Ri=Object.getOwnPropertyDescriptor;var ki=Object.getOwnPropertyNames,Ht=Object.getOwnPropertySymbols,Hi=Object.getPrototypeOf,yr=Object.prototype.hasOwnProperty,nn=Object.prototype.propertyIsEnumerable;var rn=(e,t,r)=>t in e?gr(e,t,{enumerable:!0,configurable:!0,writable:!0,value:r}):e[t]=r,P=(e,t)=>{for(var r in t||(t={}))yr.call(t,r)&&rn(e,r,t[r]);if(Ht)for(var r of Ht(t))nn.call(t,r)&&rn(e,r,t[r]);return e};var on=(e,t)=>{var r={};for(var n in e)yr.call(e,n)&&t.indexOf(n)<0&&(r[n]=e[n]);if(e!=null&&Ht)for(var n of Ht(e))t.indexOf(n)<0&&nn.call(e,n)&&(r[n]=e[n]);return r};var Pt=(e,t)=>()=>(t||e((t={exports:{}}).exports,t),t.exports);var Pi=(e,t,r,n)=>{if(t&&typeof t=="object"||typeof t=="function")for(let o of ki(t))!yr.call(e,o)&&o!==r&&gr(e,o,{get:()=>t[o],enumerable:!(n=Ri(t,o))||n.enumerable});return e};var yt=(e,t,r)=>(r=e!=null?Ci(Hi(e)):{},Pi(t||!e||!e.__esModule?gr(r,"default",{value:e,enumerable:!0}):r,e));var sn=Pt((xr,an)=>{(function(e,t){typeof xr=="object"&&typeof an!="undefined"?t():typeof define=="function"&&define.amd?define(t):t()})(xr,function(){"use strict";function e(r){var n=!0,o=!1,i=null,s={text:!0,search:!0,url:!0,tel:!0,email:!0,password:!0,number:!0,date:!0,month:!0,week:!0,time:!0,datetime:!0,"datetime-local":!0};function a(O){return!!(O&&O!==document&&O.nodeName!=="HTML"&&O.nodeName!=="BODY"&&"classList"in O&&"contains"in O.classList)}function f(O){var 
Qe=O.type,De=O.tagName;return!!(De==="INPUT"&&s[Qe]&&!O.readOnly||De==="TEXTAREA"&&!O.readOnly||O.isContentEditable)}function c(O){O.classList.contains("focus-visible")||(O.classList.add("focus-visible"),O.setAttribute("data-focus-visible-added",""))}function u(O){O.hasAttribute("data-focus-visible-added")&&(O.classList.remove("focus-visible"),O.removeAttribute("data-focus-visible-added"))}function p(O){O.metaKey||O.altKey||O.ctrlKey||(a(r.activeElement)&&c(r.activeElement),n=!0)}function m(O){n=!1}function d(O){a(O.target)&&(n||f(O.target))&&c(O.target)}function h(O){a(O.target)&&(O.target.classList.contains("focus-visible")||O.target.hasAttribute("data-focus-visible-added"))&&(o=!0,window.clearTimeout(i),i=window.setTimeout(function(){o=!1},100),u(O.target))}function v(O){document.visibilityState==="hidden"&&(o&&(n=!0),Y())}function Y(){document.addEventListener("mousemove",N),document.addEventListener("mousedown",N),document.addEventListener("mouseup",N),document.addEventListener("pointermove",N),document.addEventListener("pointerdown",N),document.addEventListener("pointerup",N),document.addEventListener("touchmove",N),document.addEventListener("touchstart",N),document.addEventListener("touchend",N)}function B(){document.removeEventListener("mousemove",N),document.removeEventListener("mousedown",N),document.removeEventListener("mouseup",N),document.removeEventListener("pointermove",N),document.removeEventListener("pointerdown",N),document.removeEventListener("pointerup",N),document.removeEventListener("touchmove",N),document.removeEventListener("touchstart",N),document.removeEventListener("touchend",N)}function 
N(O){O.target.nodeName&&O.target.nodeName.toLowerCase()==="html"||(n=!1,B())}document.addEventListener("keydown",p,!0),document.addEventListener("mousedown",m,!0),document.addEventListener("pointerdown",m,!0),document.addEventListener("touchstart",m,!0),document.addEventListener("visibilitychange",v,!0),Y(),r.addEventListener("focus",d,!0),r.addEventListener("blur",h,!0),r.nodeType===Node.DOCUMENT_FRAGMENT_NODE&&r.host?r.host.setAttribute("data-js-focus-visible",""):r.nodeType===Node.DOCUMENT_NODE&&(document.documentElement.classList.add("js-focus-visible"),document.documentElement.setAttribute("data-js-focus-visible",""))}if(typeof window!="undefined"&&typeof document!="undefined"){window.applyFocusVisiblePolyfill=e;var t;try{t=new CustomEvent("focus-visible-polyfill-ready")}catch(r){t=document.createEvent("CustomEvent"),t.initCustomEvent("focus-visible-polyfill-ready",!1,!1,{})}window.dispatchEvent(t)}typeof document!="undefined"&&e(document)})});var cn=Pt(Er=>{(function(e){var t=function(){try{return!!Symbol.iterator}catch(c){return!1}},r=t(),n=function(c){var u={next:function(){var p=c.shift();return{done:p===void 0,value:p}}};return r&&(u[Symbol.iterator]=function(){return u}),u},o=function(c){return encodeURIComponent(c).replace(/%20/g,"+")},i=function(c){return decodeURIComponent(String(c).replace(/\+/g," "))},s=function(){var c=function(p){Object.defineProperty(this,"_entries",{writable:!0,value:{}});var m=typeof p;if(m!=="undefined")if(m==="string")p!==""&&this._fromString(p);else if(p instanceof c){var d=this;p.forEach(function(B,N){d.append(N,B)})}else if(p!==null&&m==="object")if(Object.prototype.toString.call(p)==="[object Array]")for(var h=0;hd[0]?1:0}),c._entries&&(c._entries={});for(var p=0;p1?i(d[1]):"")}})})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Er);(function(e){var t=function(){try{var o=new e.URL("b","http://a");return o.pathname="c 
d",o.href==="http://a/c%20d"&&o.searchParams}catch(i){return!1}},r=function(){var o=e.URL,i=function(f,c){typeof f!="string"&&(f=String(f)),c&&typeof c!="string"&&(c=String(c));var u=document,p;if(c&&(e.location===void 0||c!==e.location.href)){c=c.toLowerCase(),u=document.implementation.createHTMLDocument(""),p=u.createElement("base"),p.href=c,u.head.appendChild(p);try{if(p.href.indexOf(c)!==0)throw new Error(p.href)}catch(O){throw new Error("URL unable to set base "+c+" due to "+O)}}var m=u.createElement("a");m.href=f,p&&(u.body.appendChild(m),m.href=m.href);var d=u.createElement("input");if(d.type="url",d.value=f,m.protocol===":"||!/:/.test(m.href)||!d.checkValidity()&&!c)throw new TypeError("Invalid URL");Object.defineProperty(this,"_anchorElement",{value:m});var h=new e.URLSearchParams(this.search),v=!0,Y=!0,B=this;["append","delete","set"].forEach(function(O){var Qe=h[O];h[O]=function(){Qe.apply(h,arguments),v&&(Y=!1,B.search=h.toString(),Y=!0)}}),Object.defineProperty(this,"searchParams",{value:h,enumerable:!0});var N=void 0;Object.defineProperty(this,"_updateSearchParams",{enumerable:!1,configurable:!1,writable:!1,value:function(){this.search!==N&&(N=this.search,Y&&(v=!1,this.searchParams._fromString(this.search),v=!0))}})},s=i.prototype,a=function(f){Object.defineProperty(s,f,{get:function(){return this._anchorElement[f]},set:function(c){this._anchorElement[f]=c},enumerable:!0})};["hash","host","hostname","port","protocol"].forEach(function(f){a(f)}),Object.defineProperty(s,"search",{get:function(){return this._anchorElement.search},set:function(f){this._anchorElement.search=f,this._updateSearchParams()},enumerable:!0}),Object.defineProperties(s,{toString:{get:function(){var f=this;return function(){return f.href}}},href:{get:function(){return this._anchorElement.href.replace(/\?$/,"")},set:function(f){this._anchorElement.href=f,this._updateSearchParams()},enumerable:!0},pathname:{get:function(){return 
this._anchorElement.pathname.replace(/(^\/?)/,"/")},set:function(f){this._anchorElement.pathname=f},enumerable:!0},origin:{get:function(){var f={"http:":80,"https:":443,"ftp:":21}[this._anchorElement.protocol],c=this._anchorElement.port!=f&&this._anchorElement.port!=="";return this._anchorElement.protocol+"//"+this._anchorElement.hostname+(c?":"+this._anchorElement.port:"")},enumerable:!0},password:{get:function(){return""},set:function(f){},enumerable:!0},username:{get:function(){return""},set:function(f){},enumerable:!0}}),i.createObjectURL=function(f){return o.createObjectURL.apply(o,arguments)},i.revokeObjectURL=function(f){return o.revokeObjectURL.apply(o,arguments)},e.URL=i};if(t()||r(),e.location!==void 0&&!("origin"in e.location)){var n=function(){return e.location.protocol+"//"+e.location.hostname+(e.location.port?":"+e.location.port:"")};try{Object.defineProperty(e.location,"origin",{get:n,enumerable:!0})}catch(o){setInterval(function(){e.location.origin=n()},100)}}})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Er)});var qr=Pt((Mt,Nr)=>{/*! 
+ * clipboard.js v2.0.11 + * https://clipboardjs.com/ + * + * Licensed MIT © Zeno Rocha + */(function(t,r){typeof Mt=="object"&&typeof Nr=="object"?Nr.exports=r():typeof define=="function"&&define.amd?define([],r):typeof Mt=="object"?Mt.ClipboardJS=r():t.ClipboardJS=r()})(Mt,function(){return function(){var e={686:function(n,o,i){"use strict";i.d(o,{default:function(){return Ai}});var s=i(279),a=i.n(s),f=i(370),c=i.n(f),u=i(817),p=i.n(u);function m(j){try{return document.execCommand(j)}catch(T){return!1}}var d=function(T){var E=p()(T);return m("cut"),E},h=d;function v(j){var T=document.documentElement.getAttribute("dir")==="rtl",E=document.createElement("textarea");E.style.fontSize="12pt",E.style.border="0",E.style.padding="0",E.style.margin="0",E.style.position="absolute",E.style[T?"right":"left"]="-9999px";var H=window.pageYOffset||document.documentElement.scrollTop;return E.style.top="".concat(H,"px"),E.setAttribute("readonly",""),E.value=j,E}var Y=function(T,E){var H=v(T);E.container.appendChild(H);var I=p()(H);return m("copy"),H.remove(),I},B=function(T){var E=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body},H="";return typeof T=="string"?H=Y(T,E):T instanceof HTMLInputElement&&!["text","search","url","tel","password"].includes(T==null?void 0:T.type)?H=Y(T.value,E):(H=p()(T),m("copy")),H},N=B;function O(j){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?O=function(E){return typeof E}:O=function(E){return E&&typeof Symbol=="function"&&E.constructor===Symbol&&E!==Symbol.prototype?"symbol":typeof E},O(j)}var Qe=function(){var T=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{},E=T.action,H=E===void 0?"copy":E,I=T.container,q=T.target,Me=T.text;if(H!=="copy"&&H!=="cut")throw new Error('Invalid "action" value, use either "copy" or "cut"');if(q!==void 0)if(q&&O(q)==="object"&&q.nodeType===1){if(H==="copy"&&q.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. 
Please use "readonly" instead of "disabled" attribute');if(H==="cut"&&(q.hasAttribute("readonly")||q.hasAttribute("disabled")))throw new Error(`Invalid "target" attribute. You can't cut text from elements with "readonly" or "disabled" attributes`)}else throw new Error('Invalid "target" value, use a valid Element');if(Me)return N(Me,{container:I});if(q)return H==="cut"?h(q):N(q,{container:I})},De=Qe;function $e(j){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?$e=function(E){return typeof E}:$e=function(E){return E&&typeof Symbol=="function"&&E.constructor===Symbol&&E!==Symbol.prototype?"symbol":typeof E},$e(j)}function Ei(j,T){if(!(j instanceof T))throw new TypeError("Cannot call a class as a function")}function tn(j,T){for(var E=0;E0&&arguments[0]!==void 0?arguments[0]:{};this.action=typeof I.action=="function"?I.action:this.defaultAction,this.target=typeof I.target=="function"?I.target:this.defaultTarget,this.text=typeof I.text=="function"?I.text:this.defaultText,this.container=$e(I.container)==="object"?I.container:document.body}},{key:"listenClick",value:function(I){var q=this;this.listener=c()(I,"click",function(Me){return q.onClick(Me)})}},{key:"onClick",value:function(I){var q=I.delegateTarget||I.currentTarget,Me=this.action(q)||"copy",kt=De({action:Me,container:this.container,target:this.target(q),text:this.text(q)});this.emit(kt?"success":"error",{action:Me,text:kt,trigger:q,clearSelection:function(){q&&q.focus(),window.getSelection().removeAllRanges()}})}},{key:"defaultAction",value:function(I){return vr("action",I)}},{key:"defaultTarget",value:function(I){var q=vr("target",I);if(q)return document.querySelector(q)}},{key:"defaultText",value:function(I){return vr("text",I)}},{key:"destroy",value:function(){this.listener.destroy()}}],[{key:"copy",value:function(I){var q=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body};return N(I,q)}},{key:"cut",value:function(I){return 
h(I)}},{key:"isSupported",value:function(){var I=arguments.length>0&&arguments[0]!==void 0?arguments[0]:["copy","cut"],q=typeof I=="string"?[I]:I,Me=!!document.queryCommandSupported;return q.forEach(function(kt){Me=Me&&!!document.queryCommandSupported(kt)}),Me}}]),E}(a()),Ai=Li},828:function(n){var o=9;if(typeof Element!="undefined"&&!Element.prototype.matches){var i=Element.prototype;i.matches=i.matchesSelector||i.mozMatchesSelector||i.msMatchesSelector||i.oMatchesSelector||i.webkitMatchesSelector}function s(a,f){for(;a&&a.nodeType!==o;){if(typeof a.matches=="function"&&a.matches(f))return a;a=a.parentNode}}n.exports=s},438:function(n,o,i){var s=i(828);function a(u,p,m,d,h){var v=c.apply(this,arguments);return u.addEventListener(m,v,h),{destroy:function(){u.removeEventListener(m,v,h)}}}function f(u,p,m,d,h){return typeof u.addEventListener=="function"?a.apply(null,arguments):typeof m=="function"?a.bind(null,document).apply(null,arguments):(typeof u=="string"&&(u=document.querySelectorAll(u)),Array.prototype.map.call(u,function(v){return a(v,p,m,d,h)}))}function c(u,p,m,d){return function(h){h.delegateTarget=s(h.target,p),h.delegateTarget&&d.call(u,h)}}n.exports=f},879:function(n,o){o.node=function(i){return i!==void 0&&i instanceof HTMLElement&&i.nodeType===1},o.nodeList=function(i){var s=Object.prototype.toString.call(i);return i!==void 0&&(s==="[object NodeList]"||s==="[object HTMLCollection]")&&"length"in i&&(i.length===0||o.node(i[0]))},o.string=function(i){return typeof i=="string"||i instanceof String},o.fn=function(i){var s=Object.prototype.toString.call(i);return s==="[object Function]"}},370:function(n,o,i){var s=i(879),a=i(438);function f(m,d,h){if(!m&&!d&&!h)throw new Error("Missing required arguments");if(!s.string(d))throw new TypeError("Second argument must be a String");if(!s.fn(h))throw new TypeError("Third argument must be a Function");if(s.node(m))return c(m,d,h);if(s.nodeList(m))return u(m,d,h);if(s.string(m))return p(m,d,h);throw new 
TypeError("First argument must be a String, HTMLElement, HTMLCollection, or NodeList")}function c(m,d,h){return m.addEventListener(d,h),{destroy:function(){m.removeEventListener(d,h)}}}function u(m,d,h){return Array.prototype.forEach.call(m,function(v){v.addEventListener(d,h)}),{destroy:function(){Array.prototype.forEach.call(m,function(v){v.removeEventListener(d,h)})}}}function p(m,d,h){return a(document.body,m,d,h)}n.exports=f},817:function(n){function o(i){var s;if(i.nodeName==="SELECT")i.focus(),s=i.value;else if(i.nodeName==="INPUT"||i.nodeName==="TEXTAREA"){var a=i.hasAttribute("readonly");a||i.setAttribute("readonly",""),i.select(),i.setSelectionRange(0,i.value.length),a||i.removeAttribute("readonly"),s=i.value}else{i.hasAttribute("contenteditable")&&i.focus();var f=window.getSelection(),c=document.createRange();c.selectNodeContents(i),f.removeAllRanges(),f.addRange(c),s=f.toString()}return s}n.exports=o},279:function(n){function o(){}o.prototype={on:function(i,s,a){var f=this.e||(this.e={});return(f[i]||(f[i]=[])).push({fn:s,ctx:a}),this},once:function(i,s,a){var f=this;function c(){f.off(i,c),s.apply(a,arguments)}return c._=s,this.on(i,c,a)},emit:function(i){var s=[].slice.call(arguments,1),a=((this.e||(this.e={}))[i]||[]).slice(),f=0,c=a.length;for(f;f{"use strict";/*! 
+ * escape-html + * Copyright(c) 2012-2013 TJ Holowaychuk + * Copyright(c) 2015 Andreas Lubbe + * Copyright(c) 2015 Tiancheng "Timothy" Gu + * MIT Licensed + */var rs=/["'&<>]/;Yo.exports=ns;function ns(e){var t=""+e,r=rs.exec(t);if(!r)return t;var n,o="",i=0,s=0;for(i=r.index;i0&&i[i.length-1])&&(c[0]===6||c[0]===2)){r=0;continue}if(c[0]===3&&(!i||c[1]>i[0]&&c[1]=e.length&&(e=void 0),{value:e&&e[n++],done:!e}}};throw new TypeError(t?"Object is not iterable.":"Symbol.iterator is not defined.")}function W(e,t){var r=typeof Symbol=="function"&&e[Symbol.iterator];if(!r)return e;var n=r.call(e),o,i=[],s;try{for(;(t===void 0||t-- >0)&&!(o=n.next()).done;)i.push(o.value)}catch(a){s={error:a}}finally{try{o&&!o.done&&(r=n.return)&&r.call(n)}finally{if(s)throw s.error}}return i}function D(e,t,r){if(r||arguments.length===2)for(var n=0,o=t.length,i;n1||a(m,d)})})}function a(m,d){try{f(n[m](d))}catch(h){p(i[0][3],h)}}function f(m){m.value instanceof et?Promise.resolve(m.value.v).then(c,u):p(i[0][2],m)}function c(m){a("next",m)}function u(m){a("throw",m)}function p(m,d){m(d),i.shift(),i.length&&a(i[0][0],i[0][1])}}function pn(e){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var t=e[Symbol.asyncIterator],r;return t?t.call(e):(e=typeof Ee=="function"?Ee(e):e[Symbol.iterator](),r={},n("next"),n("throw"),n("return"),r[Symbol.asyncIterator]=function(){return this},r);function n(i){r[i]=e[i]&&function(s){return new Promise(function(a,f){s=e[i](s),o(a,f,s.done,s.value)})}}function o(i,s,a,f){Promise.resolve(f).then(function(c){i({value:c,done:a})},s)}}function C(e){return typeof e=="function"}function at(e){var t=function(n){Error.call(n),n.stack=new Error().stack},r=e(t);return r.prototype=Object.create(Error.prototype),r.prototype.constructor=r,r}var It=at(function(e){return function(r){e(this),this.message=r?r.length+` errors occurred during unsubscription: +`+r.map(function(n,o){return o+1+") "+n.toString()}).join(` + 
`):"",this.name="UnsubscriptionError",this.errors=r}});function Ve(e,t){if(e){var r=e.indexOf(t);0<=r&&e.splice(r,1)}}var Ie=function(){function e(t){this.initialTeardown=t,this.closed=!1,this._parentage=null,this._finalizers=null}return e.prototype.unsubscribe=function(){var t,r,n,o,i;if(!this.closed){this.closed=!0;var s=this._parentage;if(s)if(this._parentage=null,Array.isArray(s))try{for(var a=Ee(s),f=a.next();!f.done;f=a.next()){var c=f.value;c.remove(this)}}catch(v){t={error:v}}finally{try{f&&!f.done&&(r=a.return)&&r.call(a)}finally{if(t)throw t.error}}else s.remove(this);var u=this.initialTeardown;if(C(u))try{u()}catch(v){i=v instanceof It?v.errors:[v]}var p=this._finalizers;if(p){this._finalizers=null;try{for(var m=Ee(p),d=m.next();!d.done;d=m.next()){var h=d.value;try{ln(h)}catch(v){i=i!=null?i:[],v instanceof It?i=D(D([],W(i)),W(v.errors)):i.push(v)}}}catch(v){n={error:v}}finally{try{d&&!d.done&&(o=m.return)&&o.call(m)}finally{if(n)throw n.error}}}if(i)throw new It(i)}},e.prototype.add=function(t){var r;if(t&&t!==this)if(this.closed)ln(t);else{if(t instanceof e){if(t.closed||t._hasParent(this))return;t._addParent(this)}(this._finalizers=(r=this._finalizers)!==null&&r!==void 0?r:[]).push(t)}},e.prototype._hasParent=function(t){var r=this._parentage;return r===t||Array.isArray(r)&&r.includes(t)},e.prototype._addParent=function(t){var r=this._parentage;this._parentage=Array.isArray(r)?(r.push(t),r):r?[r,t]:t},e.prototype._removeParent=function(t){var r=this._parentage;r===t?this._parentage=null:Array.isArray(r)&&Ve(r,t)},e.prototype.remove=function(t){var r=this._finalizers;r&&Ve(r,t),t instanceof e&&t._removeParent(this)},e.EMPTY=function(){var t=new e;return t.closed=!0,t}(),e}();var Sr=Ie.EMPTY;function jt(e){return e instanceof Ie||e&&"closed"in e&&C(e.remove)&&C(e.add)&&C(e.unsubscribe)}function ln(e){C(e)?e():e.unsubscribe()}var Le={onUnhandledError:null,onStoppedNotification:null,Promise:void 
0,useDeprecatedSynchronousErrorHandling:!1,useDeprecatedNextContext:!1};var st={setTimeout:function(e,t){for(var r=[],n=2;n0},enumerable:!1,configurable:!0}),t.prototype._trySubscribe=function(r){return this._throwIfClosed(),e.prototype._trySubscribe.call(this,r)},t.prototype._subscribe=function(r){return this._throwIfClosed(),this._checkFinalizedStatuses(r),this._innerSubscribe(r)},t.prototype._innerSubscribe=function(r){var n=this,o=this,i=o.hasError,s=o.isStopped,a=o.observers;return i||s?Sr:(this.currentObservers=null,a.push(r),new Ie(function(){n.currentObservers=null,Ve(a,r)}))},t.prototype._checkFinalizedStatuses=function(r){var n=this,o=n.hasError,i=n.thrownError,s=n.isStopped;o?r.error(i):s&&r.complete()},t.prototype.asObservable=function(){var r=new F;return r.source=this,r},t.create=function(r,n){return new xn(r,n)},t}(F);var xn=function(e){ie(t,e);function t(r,n){var o=e.call(this)||this;return o.destination=r,o.source=n,o}return t.prototype.next=function(r){var n,o;(o=(n=this.destination)===null||n===void 0?void 0:n.next)===null||o===void 0||o.call(n,r)},t.prototype.error=function(r){var n,o;(o=(n=this.destination)===null||n===void 0?void 0:n.error)===null||o===void 0||o.call(n,r)},t.prototype.complete=function(){var r,n;(n=(r=this.destination)===null||r===void 0?void 0:r.complete)===null||n===void 0||n.call(r)},t.prototype._subscribe=function(r){var n,o;return(o=(n=this.source)===null||n===void 0?void 0:n.subscribe(r))!==null&&o!==void 0?o:Sr},t}(x);var Et={now:function(){return(Et.delegate||Date).now()},delegate:void 0};var wt=function(e){ie(t,e);function t(r,n,o){r===void 0&&(r=1/0),n===void 0&&(n=1/0),o===void 0&&(o=Et);var i=e.call(this)||this;return i._bufferSize=r,i._windowTime=n,i._timestampProvider=o,i._buffer=[],i._infiniteTimeWindow=!0,i._infiniteTimeWindow=n===1/0,i._bufferSize=Math.max(1,r),i._windowTime=Math.max(1,n),i}return t.prototype.next=function(r){var 
n=this,o=n.isStopped,i=n._buffer,s=n._infiniteTimeWindow,a=n._timestampProvider,f=n._windowTime;o||(i.push(r),!s&&i.push(a.now()+f)),this._trimBuffer(),e.prototype.next.call(this,r)},t.prototype._subscribe=function(r){this._throwIfClosed(),this._trimBuffer();for(var n=this._innerSubscribe(r),o=this,i=o._infiniteTimeWindow,s=o._buffer,a=s.slice(),f=0;f0?e.prototype.requestAsyncId.call(this,r,n,o):(r.actions.push(this),r._scheduled||(r._scheduled=ut.requestAnimationFrame(function(){return r.flush(void 0)})))},t.prototype.recycleAsyncId=function(r,n,o){var i;if(o===void 0&&(o=0),o!=null?o>0:this.delay>0)return e.prototype.recycleAsyncId.call(this,r,n,o);var s=r.actions;n!=null&&((i=s[s.length-1])===null||i===void 0?void 0:i.id)!==n&&(ut.cancelAnimationFrame(n),r._scheduled=void 0)},t}(Wt);var Sn=function(e){ie(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t.prototype.flush=function(r){this._active=!0;var n=this._scheduled;this._scheduled=void 0;var o=this.actions,i;r=r||o.shift();do if(i=r.execute(r.state,r.delay))break;while((r=o[0])&&r.id===n&&o.shift());if(this._active=!1,i){for(;(r=o[0])&&r.id===n&&o.shift();)r.unsubscribe();throw i}},t}(Dt);var Oe=new Sn(wn);var M=new F(function(e){return e.complete()});function Vt(e){return e&&C(e.schedule)}function Cr(e){return e[e.length-1]}function Ye(e){return C(Cr(e))?e.pop():void 0}function Te(e){return Vt(Cr(e))?e.pop():void 0}function zt(e,t){return typeof Cr(e)=="number"?e.pop():t}var pt=function(e){return e&&typeof e.length=="number"&&typeof e!="function"};function Nt(e){return C(e==null?void 0:e.then)}function qt(e){return C(e[ft])}function Kt(e){return Symbol.asyncIterator&&C(e==null?void 0:e[Symbol.asyncIterator])}function Qt(e){return new TypeError("You provided "+(e!==null&&typeof e=="object"?"an invalid object":"'"+e+"'")+" where a stream was expected. 
You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.")}function zi(){return typeof Symbol!="function"||!Symbol.iterator?"@@iterator":Symbol.iterator}var Yt=zi();function Gt(e){return C(e==null?void 0:e[Yt])}function Bt(e){return un(this,arguments,function(){var r,n,o,i;return $t(this,function(s){switch(s.label){case 0:r=e.getReader(),s.label=1;case 1:s.trys.push([1,,9,10]),s.label=2;case 2:return[4,et(r.read())];case 3:return n=s.sent(),o=n.value,i=n.done,i?[4,et(void 0)]:[3,5];case 4:return[2,s.sent()];case 5:return[4,et(o)];case 6:return[4,s.sent()];case 7:return s.sent(),[3,2];case 8:return[3,10];case 9:return r.releaseLock(),[7];case 10:return[2]}})})}function Jt(e){return C(e==null?void 0:e.getReader)}function U(e){if(e instanceof F)return e;if(e!=null){if(qt(e))return Ni(e);if(pt(e))return qi(e);if(Nt(e))return Ki(e);if(Kt(e))return On(e);if(Gt(e))return Qi(e);if(Jt(e))return Yi(e)}throw Qt(e)}function Ni(e){return new F(function(t){var r=e[ft]();if(C(r.subscribe))return r.subscribe(t);throw new TypeError("Provided object does not correctly implement Symbol.observable")})}function qi(e){return new F(function(t){for(var r=0;r=2;return function(n){return n.pipe(e?A(function(o,i){return e(o,i,n)}):de,ge(1),r?He(t):Dn(function(){return new Zt}))}}function Vn(){for(var e=[],t=0;t=2,!0))}function pe(e){e===void 0&&(e={});var t=e.connector,r=t===void 0?function(){return new x}:t,n=e.resetOnError,o=n===void 0?!0:n,i=e.resetOnComplete,s=i===void 0?!0:i,a=e.resetOnRefCountZero,f=a===void 0?!0:a;return function(c){var u,p,m,d=0,h=!1,v=!1,Y=function(){p==null||p.unsubscribe(),p=void 0},B=function(){Y(),u=m=void 0,h=v=!1},N=function(){var O=u;B(),O==null||O.unsubscribe()};return y(function(O,Qe){d++,!v&&!h&&Y();var De=m=m!=null?m:r();Qe.add(function(){d--,d===0&&!v&&!h&&(p=$r(N,f))}),De.subscribe(Qe),!u&&d>0&&(u=new rt({next:function($e){return 
De.next($e)},error:function($e){v=!0,Y(),p=$r(B,o,$e),De.error($e)},complete:function(){h=!0,Y(),p=$r(B,s),De.complete()}}),U(O).subscribe(u))})(c)}}function $r(e,t){for(var r=[],n=2;ne.next(document)),e}function K(e,t=document){return Array.from(t.querySelectorAll(e))}function z(e,t=document){let r=ce(e,t);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${e}" to be present`);return r}function ce(e,t=document){return t.querySelector(e)||void 0}function _e(){return document.activeElement instanceof HTMLElement&&document.activeElement||void 0}function tr(e){return L(b(document.body,"focusin"),b(document.body,"focusout")).pipe(ke(1),l(()=>{let t=_e();return typeof t!="undefined"?e.contains(t):!1}),V(e===_e()),J())}function Xe(e){return{x:e.offsetLeft,y:e.offsetTop}}function Kn(e){return L(b(window,"load"),b(window,"resize")).pipe(Ce(0,Oe),l(()=>Xe(e)),V(Xe(e)))}function rr(e){return{x:e.scrollLeft,y:e.scrollTop}}function dt(e){return L(b(e,"scroll"),b(window,"resize")).pipe(Ce(0,Oe),l(()=>rr(e)),V(rr(e)))}var Yn=function(){if(typeof Map!="undefined")return Map;function e(t,r){var n=-1;return t.some(function(o,i){return o[0]===r?(n=i,!0):!1}),n}return function(){function t(){this.__entries__=[]}return Object.defineProperty(t.prototype,"size",{get:function(){return this.__entries__.length},enumerable:!0,configurable:!0}),t.prototype.get=function(r){var n=e(this.__entries__,r),o=this.__entries__[n];return o&&o[1]},t.prototype.set=function(r,n){var o=e(this.__entries__,r);~o?this.__entries__[o][1]=n:this.__entries__.push([r,n])},t.prototype.delete=function(r){var n=this.__entries__,o=e(n,r);~o&&n.splice(o,1)},t.prototype.has=function(r){return!!~e(this.__entries__,r)},t.prototype.clear=function(){this.__entries__.splice(0)},t.prototype.forEach=function(r,n){n===void 0&&(n=null);for(var 
o=0,i=this.__entries__;o0},e.prototype.connect_=function(){!Wr||this.connected_||(document.addEventListener("transitionend",this.onTransitionEnd_),window.addEventListener("resize",this.refresh),va?(this.mutationsObserver_=new MutationObserver(this.refresh),this.mutationsObserver_.observe(document,{attributes:!0,childList:!0,characterData:!0,subtree:!0})):(document.addEventListener("DOMSubtreeModified",this.refresh),this.mutationEventsAdded_=!0),this.connected_=!0)},e.prototype.disconnect_=function(){!Wr||!this.connected_||(document.removeEventListener("transitionend",this.onTransitionEnd_),window.removeEventListener("resize",this.refresh),this.mutationsObserver_&&this.mutationsObserver_.disconnect(),this.mutationEventsAdded_&&document.removeEventListener("DOMSubtreeModified",this.refresh),this.mutationsObserver_=null,this.mutationEventsAdded_=!1,this.connected_=!1)},e.prototype.onTransitionEnd_=function(t){var r=t.propertyName,n=r===void 0?"":r,o=ba.some(function(i){return!!~n.indexOf(i)});o&&this.refresh()},e.getInstance=function(){return this.instance_||(this.instance_=new e),this.instance_},e.instance_=null,e}(),Gn=function(e,t){for(var r=0,n=Object.keys(t);r0},e}(),Jn=typeof WeakMap!="undefined"?new WeakMap:new Yn,Xn=function(){function e(t){if(!(this instanceof e))throw new TypeError("Cannot call a class as a function.");if(!arguments.length)throw new TypeError("1 argument required, but only 0 present.");var r=ga.getInstance(),n=new La(t,r,this);Jn.set(this,n)}return e}();["observe","unobserve","disconnect"].forEach(function(e){Xn.prototype[e]=function(){var t;return(t=Jn.get(this))[e].apply(t,arguments)}});var Aa=function(){return typeof nr.ResizeObserver!="undefined"?nr.ResizeObserver:Xn}(),Zn=Aa;var eo=new x,Ca=$(()=>k(new Zn(e=>{for(let t of e)eo.next(t)}))).pipe(g(e=>L(ze,k(e)).pipe(R(()=>e.disconnect()))),X(1));function he(e){return{width:e.offsetWidth,height:e.offsetHeight}}function ye(e){return 
Ca.pipe(S(t=>t.observe(e)),g(t=>eo.pipe(A(({target:r})=>r===e),R(()=>t.unobserve(e)),l(()=>he(e)))),V(he(e)))}function bt(e){return{width:e.scrollWidth,height:e.scrollHeight}}function ar(e){let t=e.parentElement;for(;t&&(e.scrollWidth<=t.scrollWidth&&e.scrollHeight<=t.scrollHeight);)t=(e=t).parentElement;return t?e:void 0}var to=new x,Ra=$(()=>k(new IntersectionObserver(e=>{for(let t of e)to.next(t)},{threshold:0}))).pipe(g(e=>L(ze,k(e)).pipe(R(()=>e.disconnect()))),X(1));function sr(e){return Ra.pipe(S(t=>t.observe(e)),g(t=>to.pipe(A(({target:r})=>r===e),R(()=>t.unobserve(e)),l(({isIntersecting:r})=>r))))}function ro(e,t=16){return dt(e).pipe(l(({y:r})=>{let n=he(e),o=bt(e);return r>=o.height-n.height-t}),J())}var cr={drawer:z("[data-md-toggle=drawer]"),search:z("[data-md-toggle=search]")};function no(e){return cr[e].checked}function Ke(e,t){cr[e].checked!==t&&cr[e].click()}function Ue(e){let t=cr[e];return b(t,"change").pipe(l(()=>t.checked),V(t.checked))}function ka(e,t){switch(e.constructor){case HTMLInputElement:return e.type==="radio"?/^Arrow/.test(t):!0;case HTMLSelectElement:case HTMLTextAreaElement:return!0;default:return e.isContentEditable}}function Ha(){return L(b(window,"compositionstart").pipe(l(()=>!0)),b(window,"compositionend").pipe(l(()=>!1))).pipe(V(!1))}function oo(){let e=b(window,"keydown").pipe(A(t=>!(t.metaKey||t.ctrlKey)),l(t=>({mode:no("search")?"search":"global",type:t.key,claim(){t.preventDefault(),t.stopPropagation()}})),A(({mode:t,type:r})=>{if(t==="global"){let n=_e();if(typeof n!="undefined")return!ka(n,r)}return!0}),pe());return Ha().pipe(g(t=>t?M:e))}function le(){return new URL(location.href)}function ot(e){location.href=e.href}function io(){return new x}function ao(e,t){if(typeof t=="string"||typeof t=="number")e.innerHTML+=t.toString();else if(t instanceof Node)e.appendChild(t);else if(Array.isArray(t))for(let r of t)ao(e,r)}function _(e,t,...r){let n=document.createElement(e);if(t)for(let o of Object.keys(t))typeof 
t[o]!="undefined"&&(typeof t[o]!="boolean"?n.setAttribute(o,t[o]):n.setAttribute(o,""));for(let o of r)ao(n,o);return n}function fr(e){if(e>999){let t=+((e-950)%1e3>99);return`${((e+1e-6)/1e3).toFixed(t)}k`}else return e.toString()}function so(){return location.hash.substring(1)}function Dr(e){let t=_("a",{href:e});t.addEventListener("click",r=>r.stopPropagation()),t.click()}function Pa(e){return L(b(window,"hashchange"),e).pipe(l(so),V(so()),A(t=>t.length>0),X(1))}function co(e){return Pa(e).pipe(l(t=>ce(`[id="${t}"]`)),A(t=>typeof t!="undefined"))}function Vr(e){let t=matchMedia(e);return er(r=>t.addListener(()=>r(t.matches))).pipe(V(t.matches))}function fo(){let e=matchMedia("print");return L(b(window,"beforeprint").pipe(l(()=>!0)),b(window,"afterprint").pipe(l(()=>!1))).pipe(V(e.matches))}function zr(e,t){return e.pipe(g(r=>r?t():M))}function ur(e,t={credentials:"same-origin"}){return ue(fetch(`${e}`,t)).pipe(fe(()=>M),g(r=>r.status!==200?Ot(()=>new Error(r.statusText)):k(r)))}function We(e,t){return ur(e,t).pipe(g(r=>r.json()),X(1))}function uo(e,t){let r=new DOMParser;return ur(e,t).pipe(g(n=>n.text()),l(n=>r.parseFromString(n,"text/xml")),X(1))}function pr(e){let t=_("script",{src:e});return $(()=>(document.head.appendChild(t),L(b(t,"load"),b(t,"error").pipe(g(()=>Ot(()=>new ReferenceError(`Invalid script: ${e}`))))).pipe(l(()=>{}),R(()=>document.head.removeChild(t)),ge(1))))}function po(){return{x:Math.max(0,scrollX),y:Math.max(0,scrollY)}}function lo(){return L(b(window,"scroll",{passive:!0}),b(window,"resize",{passive:!0})).pipe(l(po),V(po()))}function mo(){return{width:innerWidth,height:innerHeight}}function ho(){return b(window,"resize",{passive:!0}).pipe(l(mo),V(mo()))}function bo(){return G([lo(),ho()]).pipe(l(([e,t])=>({offset:e,size:t})),X(1))}function lr(e,{viewport$:t,header$:r}){let n=t.pipe(ee("size")),o=G([n,r]).pipe(l(()=>Xe(e)));return 
G([r,t,o]).pipe(l(([{height:i},{offset:s,size:a},{x:f,y:c}])=>({offset:{x:s.x-f,y:s.y-c+i},size:a})))}(()=>{function e(n,o){parent.postMessage(n,o||"*")}function t(...n){return n.reduce((o,i)=>o.then(()=>new Promise(s=>{let a=document.createElement("script");a.src=i,a.onload=s,document.body.appendChild(a)})),Promise.resolve())}var r=class extends EventTarget{constructor(n){super(),this.url=n,this.m=i=>{i.source===this.w&&(this.dispatchEvent(new MessageEvent("message",{data:i.data})),this.onmessage&&this.onmessage(i))},this.e=(i,s,a,f,c)=>{if(s===`${this.url}`){let u=new ErrorEvent("error",{message:i,filename:s,lineno:a,colno:f,error:c});this.dispatchEvent(u),this.onerror&&this.onerror(u)}};let o=document.createElement("iframe");o.hidden=!0,document.body.appendChild(this.iframe=o),this.w.document.open(),this.w.document.write(` + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Changelog

+

All notable changes to this project will be documented in this file.

+

The format is based on Keep a Changelog, +and this project adheres to Semantic Versioning.

+

Unreleased

+

Added

+
    +
  • Add new transforms: AddColorNoise, Aliasing and BitCrush
  • +
+

0.34.1 - 2023-11-24

+

Changed

+
    +
  • Bump min numpy version from 1.18 to 1.21
  • +
  • Use numpy.typing in type hints
  • +
  • Optimize max abs calculations in terms of memory and speed. This makes Normalize, Mp3Compression and Limiter slightly faster.
  • +
+

0.33.0 - 2023-08-30

+

Changed

+
    +
  • Bump min numpy version from 1.16 to 1.18
  • +
  • Bump min scipy version from 1.3 to 1.4
  • +
  • Bump min python version from 3.7 to 3.8, because 3.7 is beyond end-of-life already
  • +
  • Change some AssertionError exceptions to ValueError
  • +
+

⚠ The Shift transform has been changed:

+
    +
  • Removed fade parameter. fade_duration=0.0 now denotes disabled fading.
  • +
  • Rename min_fraction to min_shift and max_fraction to max_shift
  • +
  • Add shift_unit parameter
  • +
  • Fading is enabled by default
  • +
  • Smoother fade curve
  • +
+

These are breaking changes. The following example shows how you can adapt your code when upgrading from <=v0.32.0 to >=v0.33.0:

+ + + + + + + + + + + + + + + + + +
<= 0.32.0>= 0.33.0
Shift(min_fraction=-0.5, max_fraction=0.5, fade=True, fade_duration=0.01)Shift(min_shift=-0.5, max_shift=0.5, shift_unit="fraction", fade_duration=0.01)
Shift()Shift(fade_duration=0.0)
+

Fixed

+
    +
  • Correct some wrong type hints
  • +
+

0.32.0 - 2023-08-15

+

Added

+
    +
  • Add new RepeatPart transform
  • +
+

Changed

+
    +
  • Bump min version of numpy dependency from 1.13 to 1.16
  • +
  • If a transform is in "frozen parameters" mode, but has no parameters yet, the transform will randomize/set parameters when it gets called for the first time
  • +
  • Increase the threshold for raising WrongMultichannelAudioShape. This allows some rare use cases where the number of channels slightly exceeds the number of samples.
  • +
+

Fixed

+
    +
  • Fix some type hints that were np.array instead of np.ndarray
  • +
+

0.31.0 - 2023-06-21

+

Changed

+
    +
  • Raise exception instead of warning when the given multichannel ndarray has wrong shape
  • +
  • Add support for the latest librosa 0.10 version
  • +
  • Switch to a faster default resampler internally in PitchShift, leading to much faster execution. This requires soxr.
  • +
  • Bump min scipy requirement from 1.0 to 1.3
  • +
  • Rename "_in_db" to "_db" in args and parameters. Passing args with the old names still works, but is deprecated and will stop working in a future version.
  • +
+

0.30.0 - 2023-05-02

+

Added

+
    +
  • Add new AdjustDuration transform
  • +
+

Fixed

+
    +
  • Fix a bug where too loud inputs got wrap distortion when running them through Mp3Compression
  • +
+

0.29.0 - 2023-03-15

+

Added

+
    +
  • Add apply_to parameter that can be set to "only_too_loud_sounds" in Normalize
  • +
+

Changed

+
    +
  • Change default value of noise_rms from "relative" to "relative_to_whole_input" in AddShortNoises
  • +
  • Change default values of min_snr_in_db (from 0.0 to -6.0), max_snr_in_db (from 24.0 to 18.0), min_time_between_sounds (from 4.0 to 2.0) and max_time_between_sounds (from 16.0 to 8.0) in AddShortNoises
  • +
+

Fixed

+
    +
  • Fix a bug where Limiter raised an exception when it got digital silence as input
  • +
+

0.28.0 - 2023-01-12

+

Added

+
    +
  • Add/improve type hints
  • +
  • Add/improve documentation
  • +
+

Fixed

+
    +
  • Fix a bug in RoomSimulator where the value of max_order was not respected
  • +
+

Removed

+
    +
  • Remove FrequencyMask that had been deprecated since version 0.22.0. BandStopFilter is a good alternative.
  • +
+

0.27.0 - 2022-09-13

+

Changed

+
    +
  • Speed up Limiter by ~8x
  • +
  • Fix/improve some docstrings and type hints
  • +
  • Change default values in Trim and ApplyImpulseResponse according to the warnings that were added in v0.23.0
  • +
  • Emit a FutureWarning when noise_rms in AddShortNoises is not specified - the + default value will change from "relative" to "relative_to_whole_input" in a future version.
  • +
+

0.26.0 - 2022-08-19

+

Added

+
    +
  • Add new transform Lambda. Thanks to Thanatoz-1.
  • +
  • Add new transform Limiter. Thanks to pzelasko.
  • +
+

Fixed

+
    +
  • Fix incorrect type hints in RoomSimulator
  • +
  • Make Shift robust to different sample rate inputs when parameters are frozen
  • +
+

0.25.1 - 2022-06-15

+

Fixed

+
    +
  • Fix a bug where RoomSimulator would treat an x value as if it was y, and vice versa
  • +
+

0.25.0 - 2022-05-30

+

Added

+
    +
  • Add AirAbsorption transform
  • +
  • Add mp4 to the list of recognized audio filename extensions
  • +
+

Changed

+
    +
  • Guard against invalid params in TimeMask
  • +
  • Emit FutureWarning instead of UserWarning in Trim and ApplyImpulseResponse
  • +
  • Allow specifying a file path, a folder path, a list of files or a list of folders to + ApplyImpulseResponse, AddBackgroundNoise and AddShortNoises. Previously only a path to a folder was allowed.
  • +
+

Fixed

+
    +
  • Fix a bug with noise_transform in AddBackgroundNoise where some + SNR calculations were done before the noise_transform was applied. This has sometimes + led to incorrect SNR in the output. This changes the behavior of + AddBackgroundNoise (when noise_transform is used).
  • +
+

Removed

+
    +
  • Remove support for Python 3.6, as it is past its end of life already. RIP.
  • +
+

0.24.0 - 2022-03-18

+

Added

+
    +
  • Add SevenBandParametricEQ transform
  • +
  • Add optional noise_transform in AddShortNoises
  • +
  • Add .aac and .aif to the list of recognized audio filename endings
  • +
+

Changed

+
    +
  • Show warning if top_db and/or p in Trim are not specified because their default + values will change in a future version
  • +
+

Fixed

+
    +
  • Fix filter instability bug related to center freq above nyquist freq in LowShelfFilter and HighShelfFilter
  • +
+

0.23.0 - 2022-03-07

+

Added

+
    +
  • Add Padding transform
  • +
  • Add RoomSimulator transform for simulating shoebox rooms using pyroomacoustics
  • +
  • Add parameter signal_gain_in_db_during_noise in AddShortNoises
  • +
+

Changed

+
    +
  • Not specifying a value for leave_length_unchanged in AddImpulseResponse now emits + a warning, as the default value will change from False to True in a future version.
  • +
+

Removed

+
    +
  • Remove the deprecated AddImpulseResponse alias. Use ApplyImpulseResponse instead.
  • +
  • Remove support for the legacy parameters min_SNR and max_SNR in AddGaussianSNR
  • +
  • Remove useless default path value in AddBackgroundNoise, AddShortNoises and ApplyImpulseResponse
  • +
+

0.22.0 - 2022-02-18

+

Added

+
    +
  • Implement GainTransition
  • +
  • Add support for librosa 0.9
  • +
  • Add support for stereo audio in Mp3Compression, Resample and Trim
  • +
  • Add "relative_to_whole_input" option for noise_rms parameter in AddShortNoises
  • +
  • Add optional noise_transform in AddBackgroundNoise
  • +
+

Changed

+
    +
  • Improve speed of PitchShift by 6-18% when the input audio is stereo
  • +
+

Deprecated

+
    +
  • Deprecate FrequencyMask in favor of BandStopFilter
  • +
+

Removed

+
    +
  • Remove support for librosa<=0.7.2
  • +
+

0.21.0 - 2022-02-10

+

Added

+
    +
  • Add support for multichannel audio in ApplyImpulseResponse, BandPassFilter, HighPassFilter and LowPassFilter
  • +
  • Add BandStopFilter (similar to FrequencyMask, but with overhauled defaults and parameter randomization behavior), PeakingFilter, LowShelfFilter and HighShelfFilter
  • +
  • Add parameter add_all_noises_with_same_level in AddShortNoises
  • +
+

Changed

+
    +
  • Change BandPassFilter, LowPassFilter and HighPassFilter to use scipy's Butterworth + filters instead of pydub. Now they have parametrized roll-off. Filters are now steeper + than before by default - set min_rolloff=6, max_rolloff=6 to get the old behavior. + They also support zero-phase filtering now. And they're at least ~25x faster than before!
  • +
+

Removed

+
    +
  • Remove optional wavio dependency for audio loading
  • +
+

0.20.0 - 2021-11-18

+

Added

+
    +
  • Implement OneOf and SomeOf for applying one of or some of many transforms. Transforms are randomly + chosen every call. Inspired by augly. Thanks to Cangonin and iver56.
  • +
  • Add a new argument apply_to_children (bool) in randomize_parameters, + freeze_parameters and unfreeze_parameters in Compose and SpecCompose.
  • +
+

Changed

+
    +
  • Insert three new parameters in AddBackgroundNoise: noise_rms (defaults to "relative", which is + the old behavior), min_absolute_rms_in_db and max_absolute_rms_in_db. This may be a breaking + change if you used AddBackgroundNoise with positional arguments in earlier versions of audiomentations! + Please use keyword arguments to be on the safe side - it should be backwards compatible then.
  • +
+

Fixed

+
    +
  • Remove global pydub import which was accidentally introduced in v0.18.0. pydub is + considered an optional dependency and is imported only on demand now.
  • +
+

0.19.0 - 2021-10-18

+

Added

+
    +
  • Implement TanhDistortion. Thanks to atamazian and iver56.
  • +
  • Add a noise_rms parameter to AddShortNoises. It defaults to relative, which + is the old behavior. absolute allows for adding loud noises to parts that are + relatively silent in the input.
  • +
+

0.18.0 - 2021-08-05

+

Added

+
    +
  • Implement BandPassFilter, HighPassFilter, LowPassFilter and Reverse. Thanks to atamazian.
  • +
+

0.17.0 - 2021-06-25

+

Added

+
    +
  • Add a fade option in Shift for eliminating unwanted clicks
  • +
  • Add support for 32-bit int wav loading with scipy>=1.6
  • +
  • Add support for float64 wav files. However, the use of this format is discouraged, + since float32 is more than enough for audio in most cases.
  • +
  • Implement Clip. Thanks to atamazian.
  • +
  • Add some parameter sanity checks in AddGaussianNoise
  • +
  • Officially support librosa 0.8.1
  • +
+

Changed

+
    +
  • Rename AddImpulseResponse to ApplyImpulseResponse. The former will still work for + now, but give a warning.
  • +
  • When looking for audio files in AddImpulseResponse, AddBackgroundNoise + and AddShortNoises, follow symlinks by default.
  • +
  • When using the new parameters min_snr_in_db and max_snr_in_db in AddGaussianSNR, + SNRs will be picked uniformly in the decibel scale instead of in the linear amplitude + ratio scale. The new behavior aligns more with human hearing, which is not linear.
  • +
+

Fixed

+
    +
  • Avoid division by zero in AddImpulseResponse when input is digital silence (all zeros)
  • +
  • Fix inverse SNR characteristics in AddGaussianSNR. It will continue working as before + unless you switch to the new parameters min_snr_in_db and max_snr_in_db. If you + use the old parameters, you'll get a warning.
  • +
+

0.16.0 - 2021-02-11

+

Added

+
    +
  • Implement SpecCompose for applying a pipeline of spectrogram transforms. Thanks to omerferhatt.
  • +
+

Fixed

+
    +
  • Fix a bug in SpecChannelShuffle where it did not support more than 3 audio channels. Thanks to omerferhatt.
  • +
  • Limit scipy version range to >=1.0,<1.6 to avoid issues with loading 24-bit wav files. + Support for scipy>=1.6 will be added later.
  • +
+

0.15.0 - 2020-12-10

+

Added

+
    +
  • Add an option leave_length_unchanged to AddImpulseResponse
  • +
+

Fixed

+
    +
  • Fix picklability of instances of AddImpulseResponse, AddBackgroundNoise + and AddShortNoises
  • +
+

0.14.0 - 2020-12-06

+

Added

+
    +
  • Implement LoudnessNormalization
  • +
  • Implement randomize_parameters in Compose. Thanks to SolomidHero.
  • +
  • Add multichannel support to AddGaussianNoise, AddGaussianSNR, ClippingDistortion, + FrequencyMask, PitchShift, Shift, TimeMask and TimeStretch
  • +
+

0.13.0 - 2020-11-10

+

Added

+
    +
  • Lay the foundation for spectrogram transforms. Implement SpecChannelShuffle and + SpecFrequencyMask.
  • +
  • Configurable LRU cache for transforms that use external sound files. Thanks to alumae.
  • +
  • Officially add multichannel support to Normalize
  • +
+

Changed

+
    +
  • Show a warning if a waveform had to be resampled after loading it. This is because resampling + is slow. Ideally, files on disk should already have the desired sample rate.
  • +
+

Fixed

+
    +
  • Correctly find audio files with upper case filename extensions.
  • +
  • Fix a bug where AddBackgroundNoise crashed when trying to add digital silence to an input. Thanks to juheeuu.
  • +
+

0.12.1 - 2020-09-28

+

Changed

+
    +
  • Speed up AddBackgroundNoise, AddShortNoises and AddImpulseResponse by loading wav files with scipy or wavio instead of librosa.
  • +
+

0.12.0 - 2020-09-23

+

Added

+
    +
  • Implement Mp3Compression
  • +
  • Officially support multichannel audio in Gain and PolarityInversion
  • +
  • Add m4a and opus to the list of recognized audio filename extensions
  • +
+

Changed

+
    +
  • Expand range of supported librosa versions
  • +
+

Removed

+
    +
  • Python <= 3.5 is no longer officially supported, since Python 3.5 has reached end-of-life
  • +
  • Breaking change: Internal util functions are no longer exposed directly. If you were doing + e.g. from audiomentations import calculate_rms, now you have to do + from audiomentations.core.utils import calculate_rms
  • +
+

0.11.0 - 2020-08-27

+

Added

+
    +
  • Implement Gain and PolarityInversion. Thanks to Spijkervet for the inspiration.
  • +
+

0.10.1 - 2020-07-27

+

Changed

+
    +
  • Improve the performance of AddBackgroundNoise and AddShortNoises by optimizing the implementation of calculate_rms.
  • +
+

Fixed

+
    +
  • Improve compatibility of output files written by the demo script. Thanks to xwJohn.
  • +
  • Fix division by zero bug in Normalize. Thanks to ZFTurbo.
  • +
+

0.10.0 - 2020-05-05

+

Added

+
    +
  • AddImpulseResponse, AddBackgroundNoise and AddShortNoises now support aiff files in addition to flac, mp3, ogg and wav
  • +
+

Changed

+
    +
  • Breaking change: AddImpulseResponse, AddBackgroundNoise and AddShortNoises now include subfolders when searching for files. This is useful when your sound files are organized in subfolders.
  • +
+

Fixed

+
    +
  • Fix filter instability bug in FrequencyMask. Thanks to kvilouras.
  • +
+

0.9.0 - 2020-02-20

+

Added

+
    +
  • Remember randomized/chosen effect parameters. This allows for freezing the parameters and applying the same effect to multiple sounds. Use transform.freeze_parameters() and transform.unfreeze_parameters() for this.
  • +
  • Implement transform.serialize_parameters(). Useful for when you want to store metadata on how a sound was perturbed.
  • +
  • Add a rollover parameter to Shift. This allows for introducing silence instead of a wrapped part of the sound.
  • +
  • Add support for flac in AddImpulseResponse
  • +
  • Implement AddBackgroundNoise transform. Useful for when you want to add background noise to all of your sound. You need to give it a folder of background noises to choose from.
  • +
  • Implement AddShortNoises. Useful for when you want to add (bursts of) short noise sounds to your input audio.
  • +
+

Changed

+
    +
  • Disregard non-audio files when looking for impulse response files
  • +
  • Switch to a faster convolve implementation. This makes AddImpulseResponse significantly faster.
  • +
  • Expand supported range of librosa versions
  • +
+

Fixed

+
    +
  • Fix a bug in ClippingDistortion where the min_percentile_threshold was not respected as expected.
  • +
  • Improve handling of empty input
  • +
+

0.8.0 - 2020-01-28

+

Added

+
    +
  • Add shuffle parameter in Compose
  • +
  • Add Resample transformation
  • +
  • Add ClippingDistortion transformation
  • +
  • Add fade parameter to TimeMask
  • +
+

Thanks to askskro

+

0.7.0 - 2020-01-14

+

Added

+
    +
  • AddGaussianSNR
  • +
  • AddImpulseResponse
  • +
  • FrequencyMask
  • +
  • TimeMask
  • +
  • Trim
  • +
+

Thanks to karpnv

+

0.6.0 - 2019-05-27

+

Added

+
    +
  • Implement peak normalization
  • +
+

0.5.0 - 2019-02-23

+

Added

+
    +
  • Implement Shift transform
  • +
+

Changed

+
    +
  • Ensure p is within bounds
  • +
+

0.4.0 - 2019-02-19

+

Added

+
    +
  • Implement PitchShift transform
  • +
+

Fixed

+
    +
  • Fix output dtype of AddGaussianNoise
  • +
+

0.3.0 - 2019-02-19

+

Added

+
    +
  • Implement leave_length_unchanged in TimeStretch
  • +
+

0.2.0 - 2019-02-18

+

Added

+
    +
  • Add TimeStretch transform
  • +
  • Parametrize AddGaussianNoise
  • +
+

0.1.0 - 2019-02-15

+

Added

+
    +
  • Initial release. Includes only one transform: AddGaussianNoise
  • +
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/google874768f12a0e923e.html b/google874768f12a0e923e.html new file mode 100644 index 00000000..7c72d287 --- /dev/null +++ b/google874768f12a0e923e.html @@ -0,0 +1 @@ +google-site-verification: google874768f12a0e923e.html \ No newline at end of file diff --git a/guides/cpu_vs_gpu/index.html b/guides/cpu_vs_gpu/index.html new file mode 100644 index 00000000..14266eec --- /dev/null +++ b/guides/cpu_vs_gpu/index.html @@ -0,0 +1,1095 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + CPU vs. GPU - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+ +
+ + + +
+
+ + + + + + + +

CPU vs. GPU: Which to use for online data augmentation when training audio ML models?

+

When training an audio machine learning model that includes online data augmentation as part of the training pipeline, you can choose to run the transforms on CPU or GPU. While some libraries, such as torch-audiomentations, support GPU, audiomentations is CPU-only. So, which one is better? The answer is: it depends.

+

Pros of using CPU-only libraries like audiomentations

+

There are several advantages to using CPU-only data augmentation libraries like audiomentations:

+
    +
  • Easy to get started: Audiomentations is straightforward to install and use, which makes it a good choice for beginners or for those who want to quickly prototype an idea.
  • +
  • No VRAM usage: These libraries don't use valuable VRAM, which you might want to allocate to your model with large batch sizes.
  • +
  • Often fast enough to keep GPU(s) busy: Running augmentations on CPU on multiple threads in a data loader can be fast enough to keep your GPU(s) busy, which means that data loading doesn't become a bottleneck if the model's GPU utilization is already high. This can speed up model training.
  • +
  • Larger selection of transforms: Some types of transforms, such as Mp3Compression, only have CPU implementations that can't run on GPU. This means that audiomentations provides a more extensive selection of transforms than torch-audiomentations.
  • +
  • Independent of specific tensor processing libraries: Audiomentations is CPU-only, which means it is not tied to a specific tensor processing library like TensorFlow or PyTorch.
  • +
+

Pros of running audio augmentation transforms on GPU(s)

+

There are also advantages to running audio augmentation transforms on GPU, for example, with the help of torch-audiomentations:

+
    +
  • Faster processing: When your model is not big enough to utilize your GPU fully (in terms of processing capabilities and VRAM), running transforms on GPU can make sense, especially when the transforms are much faster on GPU than on CPU. An example of this is convolution, which can be used for applying room reverb or various filters.
  • +
  • Can speed up training: If running the data loader becomes a bottleneck when running the transforms on CPU, running transforms on GPU(s) instead can speed up the training.
  • +
+

In summary, whether to use CPU-only libraries like audiomentations or GPU-accelerated libraries like torch-audiomentations depends on the specific requirements of your model and the available hardware. If your model training pipeline doesn't utilize your GPU(s) fully, running transforms on GPU might be the best choice. However, if your model's GPU utilization is already very high, running the transforms on multiple CPU threads might be the best option. It boils down to checking where your bottleneck is.

+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/guides/multichannel_audio_array_shapes/index.html b/guides/multichannel_audio_array_shapes/index.html new file mode 100644 index 00000000..a1a534cb --- /dev/null +++ b/guides/multichannel_audio_array_shapes/index.html @@ -0,0 +1,1142 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Multichannel audio array shapes - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Multichannel audio array shapes

+

When working with audio files in Python, you may encounter two main formats for representing the data, especially when you are dealing with stereo (or multichannel) audio. These formats correspond to the shape of the numpy ndarray that holds the audio data.

+

1. Channels-first format

+

This format has the shape (channels, samples). In the context of a stereo audio file, the number of channels would be 2 (for left and right), and samples are the individual data points in the audio file. For example, a stereo audio file with a duration of 1 second sampled at 44100 Hz would have a shape of (2, 44100).

+

This is the format expected by audiomentations when dealing with multichannel audio. If you provide multichannel audio data in a different format, a WrongMultichannelAudioShape exception will be raised.

+

Note that audiomentations also supports mono audio, i.e. shape like (1, samples) or (samples,).

+

2. Channels-last format

+

This format has the shape (samples, channels). Using the same stereo file example as above, the shape would be (44100, 2). This format is commonly returned by the soundfile library when loading a stereo wav file, because channels last is the inherent data layout of a stereo wav file. This layout is the default in stereo wav files because it facilitates streaming audio, where data must be read and played back sequentially.

+

Loading audio with different libraries

+

Different libraries in Python may return audio data in different formats. For instance, librosa by default returns a mono ndarray, whereas soundfile will return a multichannel ndarray in channels-last format when loading a stereo wav file.

+

Here is an example of how to load a file with each:

+
import librosa
+import soundfile as sf
+
+# Librosa, mono
+y, sr = librosa.load("stereo_audio_example.wav", sr=None, mono=True)
+print(y.shape)  # (117833,)
+
+# Librosa, multichannel
+y, sr = librosa.load("stereo_audio_example.wav", sr=None, mono=False)
+print(y.shape)  # (2, 117833)
+
+# Soundfile
+y, sr = sf.read("stereo_audio_example.wav")
+print(y.shape)  # (117833, 2)
+
+

Converting between formats

+

If you have audio data in the channels-last format but need it in channels-first format, you can easily convert it using the transpose operation of numpy ndarrays:

+
import numpy as np
+
+# Assuming y is your audio data in channels-last format
+y_transposed = np.transpose(y)
+
+# Alternative, shorter syntax:
+y_transposed = y.T
+
+

Now, y_transposed will be in channels-first format and can be used with audiomentations.

+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/guides/transform_parameters/index.html b/guides/transform_parameters/index.html new file mode 100644 index 00000000..2af817ab --- /dev/null +++ b/guides/transform_parameters/index.html @@ -0,0 +1,1127 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Transform parameters - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + + + + + +
+
+ + + + + + + +

Transform parameters

+

How to obtain the chosen parameters after calling a transform

+

You can access the parameters property of a transform. Code example:

+
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
+import numpy as np
+
+augment = Compose([
+    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
+    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
+    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
+    Shift(min_shift=-0.5, max_shift=0.5, shift_unit="fraction", p=0.5),
+])
+
+# Generate 2 seconds of dummy audio for the sake of example
+samples = np.random.uniform(low=-0.2, high=0.2, size=(32000,)).astype(np.float32)
+
+# Augment/transform/perturb the audio data
+augmented_samples = augment(samples=samples, sample_rate=16000)
+
+for transform in augment.transforms:
+    print(f"{transform.__class__.__name__}: {transform.parameters}")
+
+

When running the example code above, it may print something like this: +

AddGaussianNoise: {'should_apply': True, 'amplitude': 0.0027702725003923272}
+TimeStretch: {'should_apply': True, 'rate': 1.158377360016495}
+PitchShift: {'should_apply': False}
+Shift: {'should_apply': False}
+

+

How to apply a transform with the same parameters to multiple inputs

+

This technique can be useful if you want to transform e.g. a target sound in the same way as an input sound. Code example:

+
from audiomentations import Gain
+import numpy as np
+
+augment = Gain(p=1.0)
+
+samples = np.random.uniform(low=-0.2, high=0.2, size=(32000,)).astype(np.float32)
+samples2 = np.random.uniform(low=-0.2, high=0.2, size=(32000,)).astype(np.float32)
+
+augmented_samples = augment(samples=samples, sample_rate=16000)
+augment.freeze_parameters()
+print(augment.parameters)
+augmented_samples2 = augment(samples=samples2, sample_rate=16000)
+print(augment.parameters)
+augment.unfreeze_parameters()
+
+

When running the example code above, it may print something like this:

+
{'should_apply': True, 'amplitude_ratio': 0.9688148624484364}
+{'should_apply': True, 'amplitude_ratio': 0.9688148624484364}
+
+

In other words, the same gain was applied to both sounds (samples and samples2).

+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..13c793c6 --- /dev/null +++ b/index.html @@ -0,0 +1,1125 @@ + + + + + + + + + + + + + + + + + + + + + + + + audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Audiomentations documentation

+

Build status +Code coverage +Code Style: Black +Licence: MIT +DOI

+

A Python library for audio data augmentation. Inspired by +albumentations. Useful for deep learning. Runs on +CPU. Supports mono audio and multichannel audio. Can be +integrated in training pipelines in e.g. TensorFlow/Keras or PyTorch. Has helped people get +world-class results in Kaggle competitions. Is used by companies making next-generation audio +products.

+

Need a PyTorch-specific alternative with GPU support? Check out torch-audiomentations!

+

Setup

+

Python version support +PyPI version +Number of downloads from PyPI per month

+

pip install audiomentations

+

Optional requirements

+

Some features have extra dependencies. Extra python package dependencies can be installed by running

+

pip install audiomentations[extras]

+ + + + + + + + + + + + + + + + + + + + + + + + + +
FeatureExtra dependencies
Limitercylimiter
LoudnessNormalizationpyloudnorm
Mp3Compressionffmpeg and [pydub or lameenc]
RoomSimulatorpyroomacoustics
+

Note: ffmpeg can be installed via e.g. conda or from the official ffmpeg download page.

+

Usage example

+

Waveform

+
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
+import numpy as np
+
+augment = Compose([
+    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
+    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
+    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
+    Shift(min_shift=-0.5, max_shift=0.5, shift_unit="fraction", p=0.5),
+])
+
+# Generate 2 seconds of dummy audio for the sake of example
+samples = np.random.uniform(low=-0.2, high=0.2, size=(32000,)).astype(np.float32)
+
+# Augment/transform/perturb the audio data
+augmented_samples = augment(samples=samples, sample_rate=16000)
+
+

Spectrogram

+
from audiomentations import SpecCompose, SpecChannelShuffle, SpecFrequencyMask
+import numpy as np
+
+augment = SpecCompose(
+    [
+        SpecChannelShuffle(p=0.5),
+        SpecFrequencyMask(p=0.5),
+    ]
+)
+
+# Example spectrogram with 1025 frequency bins, 256 time steps and 2 audio channels
+spectrogram = np.random.random((1025, 256, 2))
+
+# Augment/transform/perturb the spectrogram
+augmented_spectrogram = augment(spectrogram)
+
+

Waveform transforms

+

For a list and explanation of all waveform transforms, see Waveform transforms in the menu.

+

Waveform transforms can be visualized (for understanding) by the audio-transformation-visualization GUI (made by phrasenmaeher), where you can upload your own input wav file

+

Spectrogram transforms

+

For a list and brief explanation of all spectrogram transforms, see Spectrogram transforms

+

Composition classes

+

Compose

+

Compose applies the given sequence of transforms when called, optionally shuffling the sequence for every call.

+

SpecCompose

+

Same as Compose, but for spectrogram transforms

+

OneOf

+

OneOf randomly picks one of the given transforms when called, and applies that transform.

+

SomeOf

+

SomeOf randomly picks several of the given transforms when called, and applies those transforms.

+

Known limitations

+
    +
  • A few transforms do not support multichannel audio yet. See Multichannel audio
  • +
  • Expects the input dtype to be float32, and have values between -1 and 1.
  • +
  • The code runs on CPU, not GPU. For a GPU-compatible version, check out torch-audiomentations
  • +
  • Multiprocessing probably works but is not officially supported yet
  • +
+

Contributions are welcome!

+

Multichannel audio

+

As of v0.22.0, all transforms except AddBackgroundNoise and AddShortNoises support not only mono audio (1-dimensional numpy arrays), but also stereo audio, i.e. 2D arrays with shape like (num_channels, num_samples). See also the guide on multichannel audio array shapes.

+

Acknowledgements

+

Thanks to Nomono for backing audiomentations.

+

Thanks to all contributors who help improve audiomentations.

+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..8d9f9fa9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +mkdocs==1.5.2 +mkdocs-material==9.1.21 diff --git a/search/search_index.json b/search/search_index.json new file mode 100644 index 00000000..cf8549bd --- /dev/null +++ b/search/search_index.json @@ -0,0 +1 @@ +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Audiomentations documentation","text":"

A Python library for audio data augmentation. Inspired by albumentations. Useful for deep learning. Runs on CPU. Supports mono audio and multichannel audio. Can be integrated in training pipelines in e.g. Tensorflow/Keras or Pytorch. Has helped people get world-class results in Kaggle competitions. Is used by companies making next-generation audio products.

Need a Pytorch-specific alternative with GPU support? Check out torch-audiomentations!

"},{"location":"#setup","title":"Setup","text":"

pip install audiomentations

"},{"location":"#optional-requirements","title":"Optional requirements","text":"

Some features have extra dependencies. Extra python package dependencies can be installed by running

pip install audiomentations[extras]

Feature Extra dependencies Limiter cylimiter LoudnessNormalization pyloudnorm Mp3Compression ffmpeg and [pydub or lameenc] RoomSimulator pyroomacoustics

Note: ffmpeg can be installed via e.g. conda or from the official ffmpeg download page.

"},{"location":"#usage-example","title":"Usage example","text":""},{"location":"#waveform","title":"Waveform","text":"
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift\nimport numpy as np\n\naugment = Compose([\n    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n    Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),\n])\n\n# Generate 2 seconds of dummy audio for the sake of example\nsamples = np.random.uniform(low=-0.2, high=0.2, size=(32000,)).astype(np.float32)\n\n# Augment/transform/perturb the audio data\naugmented_samples = augment(samples=samples, sample_rate=16000)\n
"},{"location":"#spectrogram","title":"Spectrogram","text":"
from audiomentations import SpecCompose, SpecChannelShuffle, SpecFrequencyMask\nimport numpy as np\n\naugment = SpecCompose(\n    [\n        SpecChannelShuffle(p=0.5),\n        SpecFrequencyMask(p=0.5),\n    ]\n)\n\n# Example spectrogram with 1025 frequency bins, 256 time steps and 2 audio channels\nspectrogram = np.random.random((1025, 256, 2))\n\n# Augment/transform/perturb the spectrogram\naugmented_spectrogram = augment(spectrogram)\n
"},{"location":"#waveform-transforms","title":"Waveform transforms","text":"

For a list and explanation of all waveform transforms, see Waveform transforms in the menu.

Waveform transforms can be visualized (for understanding) by the audio-transformation-visualization GUI (made by phrasenmaeher), where you can upload your own input wav file

"},{"location":"#spectrogram-transforms","title":"Spectrogram transforms","text":"

For a list and brief explanation of all spectrogram transforms, see Spectrogram transforms

"},{"location":"#composition-classes","title":"Composition classes","text":""},{"location":"#compose","title":"Compose","text":"

Compose applies the given sequence of transforms when called, optionally shuffling the sequence for every call.

"},{"location":"#speccompose","title":"SpecCompose","text":"

Same as Compose, but for spectrogram transforms

"},{"location":"#oneof","title":"OneOf","text":"

OneOf randomly picks one of the given transforms when called, and applies that transform.

"},{"location":"#someof","title":"SomeOf","text":"

SomeOf randomly picks several of the given transforms when called, and applies those transforms.

"},{"location":"#known-limitations","title":"Known limitations","text":"
  • A few transforms do not support multichannel audio yet. See Multichannel audio
  • Expects the input dtype to be float32, and have values between -1 and 1.
  • The code runs on CPU, not GPU. For a GPU-compatible version, check out torch-audiomentations
  • Multiprocessing probably works but is not officially supported yet

Contributions are welcome!

"},{"location":"#multichannel-audio","title":"Multichannel audio","text":"

As of v0.22.0, all transforms except AddBackgroundNoise and AddShortNoises support not only mono audio (1-dimensional numpy arrays), but also stereo audio, i.e. 2D arrays with shape like (num_channels, num_samples). See also the guide on multichannel audio array shapes.

"},{"location":"#acknowledgements","title":"Acknowledgements","text":"

Thanks to Nomono for backing audiomentations.

Thanks to all contributors who help improve audiomentations.

"},{"location":"alternatives/","title":"Alternatives","text":"

Audiomentations isn't the only python library that can do various types of audio data augmentation/degradation! Here's an overview:

Name Github stars License Last commit GPU support? audio-degradation-toolbox audio_degrader audiomentations audiotools auglib AugLy fast-audiomentations kapre muda nlpaug pedalboard pydiogment python-audio-effects SpecAugment spec_augment teal torch-audiomentations torchaudio-augmentations WavAugment"},{"location":"changelog/","title":"Changelog","text":"

All notable changes to this project will be documented in this file.

The format is based on Keep a Changelog, and this project adheres to Semantic Versioning.

"},{"location":"changelog/#unreleased","title":"Unreleased","text":""},{"location":"changelog/#added","title":"Added","text":"
  • Add new transforms: AddColorNoise, Aliasing and BitCrush
"},{"location":"changelog/#0341-2023-11-24","title":"0.34.1 - 2023-11-24","text":""},{"location":"changelog/#changed","title":"Changed","text":"
  • Bump min numpy version from 1.18 to 1.21
  • Use numpy.typing in type hints
  • Optimize max abs calculations in terms of memory and speed. This makes Normalize, Mp3Compression and Limiter slightly faster.
"},{"location":"changelog/#0330-2023-08-30","title":"0.33.0 - 2023-08-30","text":""},{"location":"changelog/#changed_1","title":"Changed","text":"
  • Bump min numpy version from 1.16 to 1.18
  • Bump min scipy version from 1.3 to 1.4
  • Bump min python version from 3.7 to 3.8, because 3.7 is beyond end-of-life already
  • Change some AssertionError exceptions to ValueError
"},{"location":"changelog/#the-shift-transform-has-been-changed","title":"The Shift transform has been changed:","text":"
  • Removed fade parameter. fade_duration=0.0 now denotes disabled fading.
  • Rename min_fraction to min_shift and max_fraction to max_shift
  • Add shift_unit parameter
  • Fading is enabled by default
  • Smoother fade curve

These are breaking changes. The following example shows how you can adapt your code when upgrading from <=v0.32.0 to >=v0.33.0:

<= 0.32.0 >= 0.33.0 Shift(min_fraction=-0.5, max_fraction=0.5, fade=True, fade_duration=0.01) Shift(min_shift=-0.5, max_shift=0.5, shift_unit=\"fraction\", fade_duration=0.01) Shift() Shift(fade_duration=0.0)"},{"location":"changelog/#fixed","title":"Fixed","text":"
  • Correct some wrong type hints
"},{"location":"changelog/#0320-2023-08-15","title":"0.32.0 - 2023-08-15","text":""},{"location":"changelog/#added_1","title":"Added","text":"
  • Add new RepeatPart transform
"},{"location":"changelog/#changed_2","title":"Changed","text":"
  • Bump min version of numpy dependency from 1.13 to 1.16
  • If a transform is in \"frozen parameters\" mode, but has no parameters yet, the transform will randomize/set parameters when it gets called for the first time
  • Increase the threshold for raising WrongMultichannelAudioShape. This allows some rare use cases where the number of channels slightly exceeds the number of samples.
"},{"location":"changelog/#fixed_1","title":"Fixed","text":"
  • Fix some type hints that were np.array instead of np.ndarray
"},{"location":"changelog/#0310-2023-06-21","title":"0.31.0 - 2023-06-21","text":""},{"location":"changelog/#changed_3","title":"Changed","text":"
  • Raise exception instead of warning when the given multichannel ndarray has wrong shape
  • Add support for the latest librosa 0.10 version
  • Switch to a faster default resampler internally in PitchShift, leading to much faster execution. This requires soxr.
  • Bump min scipy requirement from 1.0 to 1.3
  • Rename \"_in_db\" to \"_db\" in args and parameters. Passing args with the old names still works, but is deprecated and will stop working in a future version.
"},{"location":"changelog/#0300-2023-05-02","title":"0.30.0 - 2023-05-02","text":""},{"location":"changelog/#added_2","title":"Added","text":"
  • Add new AdjustDuration transform
"},{"location":"changelog/#fixed_2","title":"Fixed","text":"
  • Fix a bug where too loud inputs got wrap distortion when running them through Mp3Compression
"},{"location":"changelog/#0290-2023-03-15","title":"0.29.0 - 2023-03-15","text":""},{"location":"changelog/#added_3","title":"Added","text":"
  • Add apply_to parameter that can be set to \"only_too_loud_sounds\" in Normalize
"},{"location":"changelog/#changed_4","title":"Changed","text":"
  • Change default value of noise_rms from \"relative\" to \"relative_to_whole_input\" in AddShortNoises
  • Change default values of min_snr_in_db (from 0.0 to -6.0), max_snr_in_db (from 24.0 to 18.0), min_time_between_sounds (from 4.0 to 2.0) and max_time_between_sounds (from 16.0 to 8.0) in AddShortNoises
"},{"location":"changelog/#fixed_3","title":"Fixed","text":"
  • Fix a bug where Limiter raised an exception when it got digital silence as input
"},{"location":"changelog/#0280-2023-01-12","title":"0.28.0 - 2023-01-12","text":""},{"location":"changelog/#added_4","title":"Added","text":"
  • Add/improve type hints
  • Add/improve documentation
"},{"location":"changelog/#fixed_4","title":"Fixed","text":"
  • Fix a bug in RoomSimulator where the value of max_order was not respected
"},{"location":"changelog/#removed","title":"Removed","text":"
  • Remove FrequencyMask that had been deprecated since version 0.22.0. BandStopFilter is a good alternative.
"},{"location":"changelog/#0270-2022-09-13","title":"0.27.0 - 2022-09-13","text":""},{"location":"changelog/#changed_5","title":"Changed","text":"
  • Speed up Limiter by ~8x
  • Fix/improve some docstrings and type hints
  • Change default values in Trim and ApplyImpulseResponse according to the warnings that were added in v0.23.0
  • Emit a FutureWarning when noise_rms in AddShortNoises is not specified - the default value will change from \"relative\" to \"relative_to_whole_input\" in a future version.
"},{"location":"changelog/#0260-2022-08-19","title":"0.26.0 - 2022-08-19","text":""},{"location":"changelog/#added_5","title":"Added","text":"
  • Add new transform Lambda. Thanks to Thanatoz-1.
  • Add new transform Limiter. Thanks to pzelasko.
"},{"location":"changelog/#fixed_5","title":"Fixed","text":"
  • Fix incorrect type hints in RoomSimulator
  • Make Shift robust to different sample rate inputs when parameters are frozen
"},{"location":"changelog/#0251-2022-06-15","title":"0.25.1 - 2022-06-15","text":""},{"location":"changelog/#fixed_6","title":"Fixed","text":"
  • Fix a bug where RoomSimulator would treat an x value as if it was y, and vice versa
"},{"location":"changelog/#0250-2022-05-30","title":"0.25.0 - 2022-05-30","text":""},{"location":"changelog/#added_6","title":"Added","text":"
  • Add AirAbsorption transform
  • Add mp4 to the list of recognized audio filename extensions
"},{"location":"changelog/#changed_6","title":"Changed","text":"
  • Guard against invalid params in TimeMask
  • Emit FutureWarning instead of UserWarning in Trim and ApplyImpulseResponse
  • Allow specifying a file path, a folder path, a list of files or a list of folders to ApplyImpulseResponse, AddBackgroundNoise and AddShortNoises. Previously only a path to a folder was allowed.
"},{"location":"changelog/#fixed_7","title":"Fixed","text":"
  • Fix a bug with noise_transform in AddBackgroundNoise where some SNR calculations were done before the noise_transform was applied. This has sometimes led to incorrect SNR in the output. This changes the behavior of AddBackgroundNoise (when noise_transform is used).
"},{"location":"changelog/#removed_1","title":"Removed","text":"
  • Remove support for Python 3.6, as it is past its end of life already. RIP.
"},{"location":"changelog/#0240-2022-03-18","title":"0.24.0 - 2022-03-18","text":""},{"location":"changelog/#added_7","title":"Added","text":"
  • Add SevenBandParametricEQ transform
  • Add optional noise_transform in AddShortNoises
  • Add .aac and .aif to the list of recognized audio filename endings
"},{"location":"changelog/#changed_7","title":"Changed","text":"
  • Show warning if top_db and/or p in Trim are not specified because their default values will change in a future version
"},{"location":"changelog/#fixed_8","title":"Fixed","text":"
  • Fix filter instability bug related to center freq above nyquist freq in LowShelfFilter and HighShelfFilter
"},{"location":"changelog/#0230-2022-03-07","title":"0.23.0 - 2022-03-07","text":""},{"location":"changelog/#added_8","title":"Added","text":"
  • Add Padding transform
  • Add RoomSimulator transform for simulating shoebox rooms using pyroomacoustics
  • Add parameter signal_gain_in_db_during_noise in AddShortNoises
"},{"location":"changelog/#changed_8","title":"Changed","text":"
  • Not specifying a value for leave_length_unchanged in AddImpulseResponse now emits a warning, as the default value will change from False to True in a future version.
"},{"location":"changelog/#removed_2","title":"Removed","text":"
  • Remove the deprecated AddImpulseResponse alias. Use ApplyImpulseResponse instead.
  • Remove support for the legacy parameters min_SNR and max_SNR in AddGaussianSNR
  • Remove useless default path value in AddBackgroundNoise, AddShortNoises and ApplyImpulseResponse
"},{"location":"changelog/#0220-2022-02-18","title":"0.22.0 - 2022-02-18","text":""},{"location":"changelog/#added_9","title":"Added","text":"
  • Implement GainTransition
  • Add support for librosa 0.9
  • Add support for stereo audio in Mp3Compression, Resample and Trim
  • Add \"relative_to_whole_input\" option for noise_rms parameter in AddShortNoises
  • Add optional noise_transform in AddBackgroundNoise
"},{"location":"changelog/#changed_9","title":"Changed","text":"
  • Improve speed of PitchShift by 6-18% when the input audio is stereo
"},{"location":"changelog/#deprecated","title":"Deprecated","text":"
  • Deprecate FrequencyMask in favor of BandStopFilter
"},{"location":"changelog/#removed_3","title":"Removed","text":"
  • Remove support for librosa<=0.7.2
"},{"location":"changelog/#0210-2022-02-10","title":"0.21.0 - 2022-02-10","text":""},{"location":"changelog/#added_10","title":"Added","text":"
  • Add support for multichannel audio in ApplyImpulseResponse, BandPassFilter, HighPassFilter and LowPassFilter
  • Add BandStopFilter (similar to FrequencyMask, but with overhauled defaults and parameter randomization behavior), PeakingFilter, LowShelfFilter and HighShelfFilter
  • Add parameter add_all_noises_with_same_level in AddShortNoises
"},{"location":"changelog/#changed_10","title":"Changed","text":"
  • Change BandPassFilter, LowPassFilter and HighPassFilter to use scipy's Butterworth filters instead of pydub. Now they have parametrized roll-off. Filters are now steeper than before by default - set min_rolloff=6, max_rolloff=6 to get the old behavior. They also support zero-phase filtering now. And they're at least ~25x faster than before!
"},{"location":"changelog/#removed_4","title":"Removed","text":"
  • Remove optional wavio dependency for audio loading
"},{"location":"changelog/#0200-2021-11-18","title":"0.20.0 - 2021-11-18","text":""},{"location":"changelog/#added_11","title":"Added","text":"
  • Implement OneOf and SomeOf for applying one of or some of many transforms. Transforms are randomly chosen every call. Inspired by augly. Thanks to Cangonin and iver56.
  • Add a new argument apply_to_children (bool) in randomize_parameters, freeze_parameters and unfreeze_parameters in Compose and SpecCompose.
"},{"location":"changelog/#changed_11","title":"Changed","text":"
  • Insert three new parameters in AddBackgroundNoise: noise_rms (defaults to \"relative\", which is the old behavior), min_absolute_rms_in_db and max_absolute_rms_in_db. This may be a breaking change if you used AddBackgroundNoise with positional arguments in earlier versions of audiomentations! Please use keyword arguments to be on the safe side - it should be backwards compatible then.
"},{"location":"changelog/#fixed_9","title":"Fixed","text":"
  • Remove global pydub import which was accidentally introduced in v0.18.0. pydub is considered an optional dependency and is imported only on demand now.
"},{"location":"changelog/#0190-2021-10-18","title":"0.19.0 - 2021-10-18","text":""},{"location":"changelog/#added_12","title":"Added","text":"
  • Implement TanhDistortion. Thanks to atamazian and iver56.
  • Add a noise_rms parameter to AddShortNoises. It defaults to relative, which is the old behavior. absolute allows for adding loud noises to parts that are relatively silent in the input.
"},{"location":"changelog/#0180-2021-08-05","title":"0.18.0 - 2021-08-05","text":""},{"location":"changelog/#added_13","title":"Added","text":"
  • Implement BandPassFilter, HighPassFilter, LowPassFilter and Reverse. Thanks to atamazian.
"},{"location":"changelog/#0170-2021-06-25","title":"0.17.0 - 2021-06-25","text":""},{"location":"changelog/#added_14","title":"Added","text":"
  • Add a fade option in Shift for eliminating unwanted clicks
  • Add support for 32-bit int wav loading with scipy>=1.6
  • Add support for float64 wav files. However, the use of this format is discouraged, since float32 is more than enough for audio in most cases.
  • Implement Clip. Thanks to atamazian.
  • Add some parameter sanity checks in AddGaussianNoise
  • Officially support librosa 0.8.1
"},{"location":"changelog/#changed_12","title":"Changed","text":"
  • Rename AddImpulseResponse to ApplyImpulseResponse. The former will still work for now, but give a warning.
  • When looking for audio files in AddImpulseResponse, AddBackgroundNoise and AddShortNoises, follow symlinks by default.
  • When using the new parameters min_snr_in_db and max_snr_in_db in AddGaussianSNR, SNRs will be picked uniformly in the decibel scale instead of in the linear amplitude ratio scale. The new behavior aligns more with human hearing, which is not linear.
"},{"location":"changelog/#fixed_10","title":"Fixed","text":"
  • Avoid division by zero in AddImpulseResponse when input is digital silence (all zeros)
  • Fix inverse SNR characteristics in AddGaussianSNR. It will continue working as before unless you switch to the new parameters min_snr_in_db and max_snr_in_db. If you use the old parameters, you'll get a warning.
"},{"location":"changelog/#0160-2021-02-11","title":"0.16.0 - 2021-02-11","text":""},{"location":"changelog/#added_15","title":"Added","text":"
  • Implement SpecCompose for applying a pipeline of spectrogram transforms. Thanks to omerferhatt.
"},{"location":"changelog/#fixed_11","title":"Fixed","text":"
  • Fix a bug in SpecChannelShuffle where it did not support more than 3 audio channels. Thanks to omerferhatt.
  • Limit scipy version range to >=1.0,<1.6 to avoid issues with loading 24-bit wav files. Support for scipy>=1.6 will be added later.
"},{"location":"changelog/#0150-2020-12-10","title":"0.15.0 - 2020-12-10","text":""},{"location":"changelog/#added_16","title":"Added","text":"
  • Add an option leave_length_unchanged to AddImpulseResponse
"},{"location":"changelog/#fixed_12","title":"Fixed","text":"
  • Fix picklability of instances of AddImpulseResponse, AddBackgroundNoise and AddShortNoises
"},{"location":"changelog/#0140-2020-12-06","title":"0.14.0 - 2020-12-06","text":""},{"location":"changelog/#added_17","title":"Added","text":"
  • Implement LoudnessNormalization
  • Implement randomize_parameters in Compose. Thanks to SolomidHero.
  • Add multichannel support to AddGaussianNoise, AddGaussianSNR, ClippingDistortion, FrequencyMask, PitchShift, Shift, TimeMask and TimeStretch
"},{"location":"changelog/#0130-2020-11-10","title":"0.13.0 - 2020-11-10","text":""},{"location":"changelog/#added_18","title":"Added","text":"
  • Lay the foundation for spectrogram transforms. Implement SpecChannelShuffle and SpecFrequencyMask.
  • Configurable LRU cache for transforms that use external sound files. Thanks to alumae.
  • Officially add multichannel support to Normalize
"},{"location":"changelog/#changed_13","title":"Changed","text":"
  • Show a warning if a waveform had to be resampled after loading it. This is because resampling is slow. Ideally, files on disk should already have the desired sample rate.
"},{"location":"changelog/#fixed_13","title":"Fixed","text":"
  • Correctly find audio files with upper case filename extensions.
  • Fix a bug where AddBackgroundNoise crashed when trying to add digital silence to an input. Thanks to juheeuu.
"},{"location":"changelog/#0121-2020-09-28","title":"0.12.1 - 2020-09-28","text":""},{"location":"changelog/#changed_14","title":"Changed","text":"
  • Speed up AddBackgroundNoise, AddShortNoises and AddImpulseResponse by loading wav files with scipy or wavio instead of librosa.
"},{"location":"changelog/#0120-2020-09-23","title":"0.12.0 - 2020-09-23","text":""},{"location":"changelog/#added_19","title":"Added","text":"
  • Implement Mp3Compression
  • Officially support multichannel audio in Gain and PolarityInversion
  • Add m4a and opus to the list of recognized audio filename extensions
"},{"location":"changelog/#changed_15","title":"Changed","text":"
  • Expand range of supported librosa versions
"},{"location":"changelog/#removed_5","title":"Removed","text":"
  • Python <= 3.5 is no longer officially supported, since Python 3.5 has reached end-of-life
  • Breaking change: Internal util functions are no longer exposed directly. If you were doing e.g. from audiomentations import calculate_rms, now you have to do from audiomentations.core.utils import calculate_rms
"},{"location":"changelog/#0110-2020-08-27","title":"0.11.0 - 2020-08-27","text":""},{"location":"changelog/#added_20","title":"Added","text":"
  • Implement Gain and PolarityInversion. Thanks to Spijkervet for the inspiration.
"},{"location":"changelog/#0101-2020-07-27","title":"0.10.1 - 2020-07-27","text":""},{"location":"changelog/#changed_16","title":"Changed","text":"
  • Improve the performance of AddBackgroundNoise and AddShortNoises by optimizing the implementation of calculate_rms.
"},{"location":"changelog/#fixed_14","title":"Fixed","text":"
  • Improve compatibility of output files written by the demo script. Thanks to xwJohn.
  • Fix division by zero bug in Normalize. Thanks to ZFTurbo.
"},{"location":"changelog/#0100-2020-05-05","title":"0.10.0 - 2020-05-05","text":""},{"location":"changelog/#added_21","title":"Added","text":"
  • AddImpulseResponse, AddBackgroundNoise and AddShortNoises now support aiff files in addition to flac, mp3, ogg and wav
"},{"location":"changelog/#changed_17","title":"Changed","text":"
  • Breaking change: AddImpulseResponse, AddBackgroundNoise and AddShortNoises now include subfolders when searching for files. This is useful when your sound files are organized in subfolders.
"},{"location":"changelog/#fixed_15","title":"Fixed","text":"
  • Fix filter instability bug in FrequencyMask. Thanks to kvilouras.
"},{"location":"changelog/#090-2020-02-20","title":"0.9.0 - 2020-02-20","text":""},{"location":"changelog/#added_22","title":"Added","text":"
  • Remember randomized/chosen effect parameters. This allows for freezing the parameters and applying the same effect to multiple sounds. Use transform.freeze_parameters() and transform.unfreeze_parameters() for this.
  • Implement transform.serialize_parameters(). Useful for when you want to store metadata on how a sound was perturbed.
  • Add a rollover parameter to Shift. This allows for introducing silence instead of a wrapped part of the sound.
  • Add support for flac in AddImpulseResponse
  • Implement AddBackgroundNoise transform. Useful for when you want to add background noise to all of your sound. You need to give it a folder of background noises to choose from.
  • Implement AddShortNoises. Useful for when you want to add (bursts of) short noise sounds to your input audio.
"},{"location":"changelog/#changed_18","title":"Changed","text":"
  • Disregard non-audio files when looking for impulse response files
  • Switch to a faster convolve implementation. This makes AddImpulseResponse significantly faster.
  • Expand supported range of librosa versions
"},{"location":"changelog/#fixed_16","title":"Fixed","text":"
  • Fix a bug in ClippingDistortion where the min_percentile_threshold was not respected as expected.
  • Improve handling of empty input
"},{"location":"changelog/#080-2020-01-28","title":"0.8.0 - 2020-01-28","text":""},{"location":"changelog/#added_23","title":"Added","text":"
  • Add shuffle parameter in Compose
  • Add Resample transformation
  • Add ClippingDistortion transformation
  • Add fade parameter to TimeMask

Thanks to askskro

"},{"location":"changelog/#070-2020-01-14","title":"0.7.0 - 2020-01-14","text":""},{"location":"changelog/#added_24","title":"Added","text":"
  • AddGaussianSNR
  • AddImpulseResponse
  • FrequencyMask
  • TimeMask
  • Trim

Thanks to karpnv

"},{"location":"changelog/#060-2019-05-27","title":"0.6.0 - 2019-05-27","text":""},{"location":"changelog/#added_25","title":"Added","text":"
  • Implement peak normalization
"},{"location":"changelog/#050-2019-02-23","title":"0.5.0 - 2019-02-23","text":""},{"location":"changelog/#added_26","title":"Added","text":"
  • Implement Shift transform
"},{"location":"changelog/#changed_19","title":"Changed","text":"
  • Ensure p is within bounds
"},{"location":"changelog/#040-2019-02-19","title":"0.4.0 - 2019-02-19","text":""},{"location":"changelog/#added_27","title":"Added","text":"
  • Implement PitchShift transform
"},{"location":"changelog/#fixed_17","title":"Fixed","text":"
  • Fix output dtype of AddGaussianNoise
"},{"location":"changelog/#030-2019-02-19","title":"0.3.0 - 2019-02-19","text":""},{"location":"changelog/#added_28","title":"Added","text":"
  • Implement leave_length_unchanged in TimeStretch
"},{"location":"changelog/#020-2019-02-18","title":"0.2.0 - 2019-02-18","text":""},{"location":"changelog/#added_29","title":"Added","text":"
  • Add TimeStretch transform
  • Parametrize AddGaussianNoise
"},{"location":"changelog/#010-2019-02-15","title":"0.1.0 - 2019-02-15","text":""},{"location":"changelog/#added_30","title":"Added","text":"
  • Initial release. Includes only one transform: AddGaussianNoise
"},{"location":"spectrogram_transforms/","title":"Spectrogram transforms","text":"

audiomentations is in a very early (read: not very useful yet) stage when it comes to spectrogram transforms. Consider applying waveform transforms before converting your waveforms to spectrograms, or check out alternative libraries

"},{"location":"spectrogram_transforms/#specchannelshuffle","title":"SpecChannelShuffle","text":"

Added in v0.13.0

Shuffle the channels of a multichannel spectrogram. This can help combat positional bias.

"},{"location":"spectrogram_transforms/#specfrequencymask","title":"SpecFrequencyMask","text":"

Added in v0.13.0

Mask a set of frequencies in a spectrogram, \u00e0 la Google AI SpecAugment. This type of data augmentation has proved to make speech recognition models more robust.

The masked frequencies can be replaced with either the mean of the original values or a given constant (e.g. zero).

"},{"location":"guides/cpu_vs_gpu/","title":"CPU vs. GPU: Which to use for online data augmentation when training audio ML models?","text":"

When training an audio machine learning model that includes online data augmentation as part of the training pipeline, you can choose to run the transforms on CPU or GPU. While some libraries, such as torch-audiomentations, support GPU, audiomentations is CPU-only. So, which one is better? The answer is: it depends.

"},{"location":"guides/cpu_vs_gpu/#pros-of-using-cpu-only-libraries-like-audiomentations","title":"Pros of using CPU-only libraries like audiomentations","text":"

There are several advantages to using CPU-only data augmentation libraries like audiomentations:

  • Easy to get started: Audiomentations is straightforward to install and use, which makes it a good choice for beginners or for those who want to quickly prototype an idea.
  • No VRAM usage: These libraries don't use valuable VRAM, which you might want to allocate to your model with large batch sizes.
  • Often fast enough to keep GPU(s) busy: Running augmentations on CPU on multiple threads in a data loader can be fast enough to keep your GPU(s) busy, which means that data loading doesn't become a bottleneck if the model's GPU utilization is already high. This can speed up model training.
  • Larger selection of transforms: Some types of transforms, such as Mp3Compression, only have CPU implementations that can't run on GPU. This means that audiomentations provides a more extensive selection of transforms than torch-audiomentations.
  • Independent of specific tensor processing libraries: Audiomentations is CPU-only, which means it is not tied to a specific tensor processing library like TensorFlow or PyTorch.
"},{"location":"guides/cpu_vs_gpu/#pros-of-running-audio-augmentation-transforms-on-gpus","title":"Pros of running audio augmentation transforms on GPU(s)","text":"

There are also advantages to running audio augmentation transforms on GPU, for example, with the help of torch-audiomentations:

  • Faster processing: When your model is not big enough to utilize your GPU fully (in terms of processing capabilities and VRAM), running transforms on GPU can make sense, especially when the transforms are much faster on GPU than on CPU. An example of this is convolution, which can be used for applying room reverb or various filters.
  • Can speed up training: If running the data loader becomes a bottleneck when running the transforms on CPU, running transforms on GPU(s) instead can speed up the training.

In summary, whether to use CPU-only libraries like audiomentations or GPU-accelerated libraries like torch-audiomentations depends on the specific requirements of your model and the available hardware. If your model training pipeline doesn't utilize your GPU(s) fully, running transforms on GPU might be the best choice. However, if your model's GPU utilization is already very high, running the transforms on multiple CPU threads might be the best option. It boils down to checking where your bottleneck is.

"},{"location":"guides/multichannel_audio_array_shapes/","title":"Multichannel audio array shapes","text":"

When working with audio files in Python, you may encounter two main formats for representing the data, especially when you are dealing with stereo (or multichannel) audio. These formats correspond to the shape of the numpy ndarray that holds the audio data.

"},{"location":"guides/multichannel_audio_array_shapes/#1-channels-first-format","title":"1. Channels-first format","text":"

This format has the shape (channels, samples). In the context of a stereo audio file, the number of channels would be 2 (for left and right), and samples are the individual data points in the audio file. For example, a stereo audio file with a duration of 1 second sampled at 44100 Hz would have a shape of (2, 44100).

This is the format expected by audiomentations when dealing with multichannel audio. If you provide multichannel audio data in a different format, a WrongMultichannelAudioShape exception will be raised.

Note that audiomentations also supports mono audio, i.e. shape like (1, samples) or (samples,)

"},{"location":"guides/multichannel_audio_array_shapes/#2-channels-last-format","title":"2. Channels-last format","text":"

This format has the shape (samples, channels). Using the same stereo file example as above, the shape would be (44100, 2). This format is commonly returned by the soundfile library when loading a stereo wav file, because channels last is the inherent data layout of a stereo wav file. This layout is the default in stereo wav files because it facilitates streaming audio, where data must be read and played back sequentially.

"},{"location":"guides/multichannel_audio_array_shapes/#loading-audio-with-different-libraries","title":"Loading audio with different libraries","text":"

Different libraries in Python may return audio data in different formats. For instance, librosa by default returns a mono ndarray, whereas soundfile will return a multichannel ndarray in channels-last format when loading a stereo wav file.

Here is an example of how to load a file with each:

import librosa\nimport soundfile as sf\n\n# Librosa, mono\ny, sr = librosa.load(\"stereo_audio_example.wav\", sr=None, mono=True)\nprint(y.shape)  # (117833,)\n\n# Librosa, multichannel\ny, sr = librosa.load(\"stereo_audio_example.wav\", sr=None, mono=False)\nprint(y.shape)  # (2, 117833)\n\n# Soundfile\ny, sr = sf.read(\"stereo_audio_example.wav\")\nprint(y.shape)  # (117833, 2)\n
"},{"location":"guides/multichannel_audio_array_shapes/#converting-between-formats","title":"Converting between formats","text":"

If you have audio data in the channels-last format but need it in channels-first format, you can easily convert it using the transpose operation of numpy ndarrays:

import numpy as np\n\n# Assuming y is your audio data in channels-last format\ny_transposed = np.transpose(y)\n\n# Alternative, shorter syntax:\ny_transposed = y.T\n

Now, y_transposed will be in channels-first format and can be used with audiomentations.

"},{"location":"guides/transform_parameters/","title":"Transform parameters","text":""},{"location":"guides/transform_parameters/#how-to-obtain-the-chosen-parameters-after-calling-a-transform","title":"How to obtain the chosen parameters after calling a transform","text":"

You can access the parameters property of a transform. Code example:

from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift\nimport numpy as np\n\naugment = Compose([\n    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n    Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),\n])\n\n# Generate 2 seconds of dummy audio for the sake of example\nsamples = np.random.uniform(low=-0.2, high=0.2, size=(32000,)).astype(np.float32)\n\n# Augment/transform/perturb the audio data\naugmented_samples = augment(samples=samples, sample_rate=16000)\n\nfor transform in augment.transforms:\n    print(f\"{transform.__class__.__name__}: {transform.parameters}\")\n

When running the example code above, it may print something like this:

AddGaussianNoise: {'should_apply': True, 'amplitude': 0.0027702725003923272}\nTimeStretch: {'should_apply': True, 'rate': 1.158377360016495}\nPitchShift: {'should_apply': False}\nShift: {'should_apply': False}\n

"},{"location":"guides/transform_parameters/#how-to-use-apply-a-transform-with-the-same-parameters-to-multiple-inputs","title":"How to use apply a transform with the same parameters to multiple inputs","text":"

This technique can be useful if you want to transform e.g. a target sound in the same way as an input sound. Code example:

from audiomentations import Gain\nimport numpy as np\n\naugment = Gain(p=1.0)\n\nsamples = np.random.uniform(low=-0.2, high=0.2, size=(32000,)).astype(np.float32)\nsamples2 = np.random.uniform(low=-0.2, high=0.2, size=(32000,)).astype(np.float32)\n\naugmented_samples = augment(samples=samples, sample_rate=16000)\naugment.freeze_parameters()\nprint(augment.parameters)\naugmented_samples2 = augment(samples=samples2, sample_rate=16000)\nprint(augment.parameters)\naugment.unfreeze_parameters()\n

When running the example code above, it may print something like this:

{'should_apply': True, 'amplitude_ratio': 0.9688148624484364}\n{'should_apply': True, 'amplitude_ratio': 0.9688148624484364}\n

In other words, this means that both sounds (samples and samples2) were gained by the same amount.

"},{"location":"waveform_transforms/add_background_noise/","title":"AddBackgroundNoise","text":"

Added in v0.9.0

Mix in another sound, e.g. a background noise. Useful if your original sound is clean and you want to simulate an environment where background noise is present.

Can also be used for mixup when training classification/annotation models.

A path to a file/folder with sound(s), or a list of file/folder paths, must be specified. These sounds should ideally be at least as long as the input sounds to be transformed. Otherwise, the background sound will be repeated, which may sound unnatural.

Note that in the default case (noise_rms=\"relative\") the gain of the added noise is relative to the amount of signal in the input. This implies that if the input is completely silent, no noise will be added.

Optionally, the added noise sound can be transformed (with noise_transform) before it gets mixed in.

Here are some examples of datasets that can be downloaded and used as background noise:

  • https://github.com/karolpiczak/ESC-50#download
  • https://github.com/microsoft/DNS-Challenge/
"},{"location":"waveform_transforms/add_background_noise/#input-output-example","title":"Input-output example","text":"

Here we add some music to a speech recording, targeting a signal-to-noise ratio (SNR) of 5 decibels (dB), which means that the speech (signal) is 5 dB louder than the music (noise).

Input sound Transformed sound"},{"location":"waveform_transforms/add_background_noise/#usage-examples","title":"Usage examples","text":"Relative RMSAbsolute RMS
from audiomentations import AddBackgroundNoise, PolarityInversion\n\ntransform = AddBackgroundNoise(\n    sounds_path=\"/path/to/folder_with_sound_files\",\n    min_snr_db=3.0,\n    max_snr_db=30.0,\n    noise_transform=PolarityInversion(),\n    p=1.0\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
from audiomentations import AddBackgroundNoise, PolarityInversion\n\ntransform = AddBackgroundNoise(\n    sounds_path=\"/path/to/folder_with_sound_files\",\n    noise_rms=\"absolute\",\n    min_absolute_rms_db=-45.0,\n    max_absolute_rms_db=-15.0,\n    noise_transform=PolarityInversion(),\n    p=1.0\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
"},{"location":"waveform_transforms/add_background_noise/#addbackgroundnoise-api","title":"AddBackgroundNoise API","text":"sounds_path: Union[List[Path], List[str], Path, str] A path or list of paths to audio file(s) and/or folder(s) with audio files. Can be str or Path instance(s). The audio files given here are supposed to be background noises. min_snr_db: float \u2022 unit: Decibel Default: 3.0. Minimum signal-to-noise ratio in dB. Is only used if noise_rms is set to \"relative\" max_snr_db: float \u2022 unit: Decibel Default: 30.0. Maximum signal-to-noise ratio in dB. Is only used if noise_rms is set to \"relative\" min_snr_in_db: float \u2022 unit: Decibel Deprecated as of v0.31.0. Use min_snr_db instead max_snr_in_db: float \u2022 unit: Decibel Deprecated as of v0.31.0. Use max_snr_db instead noise_rms: str \u2022 choices: \"absolute\", \"relative\" Default: \"relative\". Defines how the background noise will be added to the audio input. If the chosen option is \"relative\", the root mean square (RMS) of the added noise will be proportional to the RMS of the input sound. If the chosen option is \"absolute\", the background noise will have an RMS independent of the RMS of the input audio file min_absolute_rms_db: float \u2022 unit: Decibel Default: -45.0. Is only used if noise_rms is set to \"absolute\". It is the minimum RMS value in dB that the added noise can take. The lower the RMS is, the lower the added sound will be. max_absolute_rms_db: float \u2022 unit: Decibel Default: -15.0. Is only used if noise_rms is set to \"absolute\". It is the maximum RMS value in dB that the added noise can take. Note that this value can not exceed 0. min_absolute_rms_in_db: float \u2022 unit: Decibel Deprecated as of v0.31.0. Use min_absolute_rms_db instead max_absolute_rms_in_db: float \u2022 unit: Decibel Deprecated as of v0.31.0. Use max_absolute_rms_db instead noise_transform: Optional[Callable[[NDArray[np.float32], int], NDArray[np.float32]]] Default: None. 
A callable waveform transform (or composition of transforms) that gets applied to the noise before it gets mixed in. The callable is expected to input audio waveform (numpy array) and sample rate (int). p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform. lru_cache_size: int Default: 2. Maximum size of the LRU cache for storing noise files in memory"},{"location":"waveform_transforms/add_color_noise/","title":"AddColorNoise","text":"

To be added in v0.35.0

Mix in noise with color, optionally weighted by an A-weighting curve. When f_decay=0, this is equivalent to AddGaussianNoise. Otherwise, see: Colors of Noise .

"},{"location":"waveform_transforms/add_color_noise/#addcolornoise-api","title":"AddColorNoise API","text":"min_snr_db: float \u2022 unit: Decibel Default: 5.0. Minimum signal-to-noise ratio in dB. A lower number means more noise. max_snr_db: float \u2022 unit: Decibel Default: 40.0. Maximum signal-to-noise ratio in dB. A greater number means less noise. min_f_decay: float \u2022 unit: Decibels/octave Default: -6.0. Minimum noise decay in dB per octave. max_f_decay: float \u2022 unit: Decibels/octave Default: 6.0. Maximum noise decay in dB per octave.

Those values can be chosen from the following table:

Colour f_decay (dB/octave) pink -3.01 brown/brownian -6.02 red -6.02 blue 3.01 azure 3.01 violet 6.02 white 0.0

See Colors of noise on Wikipedia about those values.

p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform. p_apply_a_weighting: float \u2022 range: [0.0, 1.0] Default: 0.0. The probability of additionally weighting the transform using an A-weighting curve. n_fft: int Default: 128. The number of points at which the decay curve is computed (for coloring white noise)."},{"location":"waveform_transforms/add_gaussian_noise/","title":"AddGaussianNoise","text":"

Added in v0.1.0

Add Gaussian noise to the samples.

"},{"location":"waveform_transforms/add_gaussian_noise/#input-output-example","title":"Input-output example","text":"

Here we add some gaussian noise (with amplitude 0.01) to a speech recording.

Input sound Transformed sound"},{"location":"waveform_transforms/add_gaussian_noise/#usage-example","title":"Usage example","text":"
from audiomentations import AddGaussianNoise\n\ntransform = AddGaussianNoise(\n    min_amplitude=0.001,\n    max_amplitude=0.015,\n    p=1.0\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
"},{"location":"waveform_transforms/add_gaussian_noise/#addgaussiannoise-api","title":"AddGaussianNoise API","text":"min_amplitude: float \u2022 unit: linear amplitude Default: 0.001. Minimum noise amplification factor. max_amplitude: float \u2022 unit: linear amplitude Default: 0.015. Maximum noise amplification factor. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/add_gaussian_snr/","title":"AddGaussianSNR","text":"

Added in v0.7.0

The AddGaussianSNR transform injects Gaussian noise into an audio signal. It applies a Signal-to-Noise Ratio (SNR) that is chosen randomly from a uniform distribution on the decibel scale. This choice is consistent with the nature of human hearing, which is logarithmic rather than linear.

SNR is a common measure used in science and engineering to compare the level of a desired signal to the level of noise. In the context of audio, the signal is the meaningful sound that you're interested in, like a person's voice, music, or other audio content, while the noise is unwanted sound that can interfere with the signal.

The SNR quantifies the ratio of the power of the signal to the power of the noise. The higher the SNR, the less the noise is present in relation to the signal.

Gaussian noise, a kind of white noise, is a type of statistical noise where the amplitude of the noise signal follows a Gaussian distribution. This means that most of the samples are close to the mean (zero), and fewer of them are farther away. It's called Gaussian noise due to its characteristic bell-shaped Gaussian distribution.

Gaussian noise is similar to the sound of a radio or TV tuned to a nonexistent station: a kind of constant, uniform hiss or static.

"},{"location":"waveform_transforms/add_gaussian_snr/#input-output-example","title":"Input-output example","text":"

Here we add some gaussian noise (with SNR = 16 dB) to a speech recording.

Input sound Transformed sound"},{"location":"waveform_transforms/add_gaussian_snr/#usage-example","title":"Usage example","text":"
from audiomentations import AddGaussianSNR\n\ntransform = AddGaussianSNR(\n    min_snr_db=5.0,\n    max_snr_db=40.0,\n    p=1.0\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
"},{"location":"waveform_transforms/add_gaussian_snr/#addgaussiansnr-api","title":"AddGaussianSNR API","text":"min_snr_db: float \u2022 unit: Decibel Default: 5.0. Minimum signal-to-noise ratio in dB. A lower number means more noise. max_snr_db: float \u2022 unit: decibel Default: 40.0. Maximum signal-to-noise ratio in dB. A greater number means less noise. min_snr_in_db: float \u2022 unit: Decibel Deprecated as of v0.31.0. Use min_snr_db instead max_snr_in_db: float \u2022 unit: decibel Deprecated as of v0.31.0. Use max_snr_db instead p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/add_short_noises/","title":"AddShortNoises","text":"

Added in v0.9.0

Mix in various (bursts of overlapping) sounds with random pauses between. Useful if your original sound is clean and you want to simulate an environment where short noises sometimes occur.

A folder of (noise) sounds to be mixed in must be specified.

"},{"location":"waveform_transforms/add_short_noises/#input-output-example","title":"Input-output example","text":"

Here we add some short noise sounds to a voice recording.

Input sound Transformed sound"},{"location":"waveform_transforms/add_short_noises/#usage-examples","title":"Usage examples","text":"Noise RMS relative to whole inputAbsolute RMS
from audiomentations import AddShortNoises, PolarityInversion\n\ntransform = AddShortNoises(\n    sounds_path=\"/path/to/folder_with_sound_files\",\n    min_snr_db=3.0,\n    max_snr_db=30.0,\n    noise_rms=\"relative_to_whole_input\",\n    min_time_between_sounds=2.0,\n    max_time_between_sounds=8.0,\n    noise_transform=PolarityInversion(),\n    p=1.0\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
from audiomentations import AddShortNoises, PolarityInversion\n\ntransform = AddShortNoises(\n    sounds_path=\"/path/to/folder_with_sound_files\",\n    min_absolute_noise_rms_db=-50.0,\n    max_absolute_noise_rms_db=-20.0,        \n    noise_rms=\"absolute\",\n    min_time_between_sounds=2.0,\n    max_time_between_sounds=8.0,\n    noise_transform=PolarityInversion(),\n    p=1.0\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
"},{"location":"waveform_transforms/add_short_noises/#addshortnoises-api","title":"AddShortNoises API","text":"sounds_path: Union[List[Path], List[str], Path, str] A path or list of paths to audio file(s) and/or folder(s) with audio files. Can be str or Path instance(s). The audio files given here are supposed to be (short) noises. min_snr_in_db: float \u2022 unit: Decibel Deprecated as of v0.31.0. Use min_snr_db instead max_snr_in_db: float \u2022 unit: Decibel Deprecated as of v0.31.0. Use max_snr_db instead min_snr_db: float \u2022 unit: Decibel Default: -6.0. Minimum signal-to-noise ratio in dB. A lower value means the added sounds/noises will be louder. This gets ignored if noise_rms is set to \"absolute\". max_snr_db: float \u2022 unit: Decibel Default: 18.0. Maximum signal-to-noise ratio in dB. A lower value means the added sounds/noises will be louder. This gets ignored if noise_rms is set to \"absolute\". min_time_between_sounds: float \u2022 unit: seconds Default: 2.0. Minimum pause time (in seconds) between the added sounds/noises max_time_between_sounds: float \u2022 unit: seconds Default: 8.0. Maximum pause time (in seconds) between the added sounds/noises noise_rms: str \u2022 choices: \"absolute\", \"relative\", \"relative_to_whole_input\"

Default: \"relative\" (<=v0.27), but will be changed to \"relative_to_whole_input\" in a future version.

This parameter defines how the noises will be added to the audio input.

  • \"relative\": the RMS value of the added noise will be proportional to the RMS value of the input sound calculated only for the region where the noise is added.
  • \"absolute\": the added noises will have an RMS independent of the RMS of the input audio file.
  • \"relative_to_whole_input\": the RMS of the added noises will be proportional to the RMS of the whole input sound.
min_absolute_noise_rms_db: float \u2022 unit: Decibel Default: -50.0. Is only used if noise_rms is set to \"absolute\". It is the minimum RMS value in dB that the added noise can take. The lower the RMS is, the lower the added sound will be. max_absolute_noise_rms_db: float \u2022 unit: Decibel Default: -20.0. Is only used if noise_rms is set to \"absolute\". It is the maximum RMS value in dB that the added noise can take. Note that this value can not exceed 0. add_all_noises_with_same_level: bool Default: False. Whether to add all the short noises (within one audio snippet) with the same SNR. If noise_rms is set to \"absolute\", the RMS is used instead of SNR. The target SNR (or RMS) will change every time the parameters of the transform are randomized. include_silence_in_noise_rms_estimation: bool Default: True. It chooses how the RMS of the noises to be added will be calculated. If this option is set to False, the silence in the noise files will be disregarded in the RMS calculation. It is useful for non-stationary noises where silent periods occur. burst_probability: float Default: 0.22. For every noise that gets added, there is a probability of adding an extra burst noise that overlaps with the noise. This parameter controls that probability. min_pause_factor_during_burst and max_pause_factor_during_burst control the amount of overlap. min_pause_factor_during_burst: float Default: 0.1. Min value of how far into the current sound (as fraction) the burst sound should start playing. The value must be greater than 0. max_pause_factor_during_burst: float Default: 1.1. Max value of how far into the current sound (as fraction) the burst sound should start playing. The value must be greater than 0. min_fade_in_time: float \u2022 unit: seconds Default: 0.005. Min noise fade in time in seconds. Use a value larger than 0 to avoid a \"click\" at the start of the noise. max_fade_in_time: float \u2022 unit: seconds Default: 0.08. Max noise fade in time in seconds. 
Use a value larger than 0 to avoid a \"click\" at the start of the noise. min_fade_out_time: float \u2022 unit: seconds Default: 0.01. Min sound/noise fade out time in seconds. Use a value larger than 0 to avoid a \"click\" at the end of the sound/noise. max_fade_out_time: float \u2022 unit: seconds Default: 0.1. Max sound/noise fade out time in seconds. Use a value larger than 0 to avoid a \"click\" at the end of the sound/noise. signal_gain_in_db_during_noise: float \u2022 unit: Decibel Deprecated as of v0.31.0. Use signal_gain_db_during_noise instead signal_gain_db_during_noise: float \u2022 unit: Decibel

Default: 0.0. Gain applied to the signal during a short noise. When fading the signal to the custom gain, the same fade times are used as for the noise, so it's essentially cross-fading. The default value (0.0) means the signal will not be gained. If set to a very low value, e.g. -100.0, this feature could be used for completely replacing the signal with the noise. This could be relevant in some use cases, for example:

  • replace the signal with another signal of a similar class (e.g. replace some speech with a cough)
  • simulate an ECG off-lead condition (electrodes are temporarily disconnected)
noise_transform: Optional[Callable[[NDArray[np.float32], int], NDArray[np.float32]]] Default: None. A callable waveform transform (or composition of transforms) that gets applied to noises before they get mixed in. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform. lru_cache_size: int Default: 64. Maximum size of the LRU cache for storing noise files in memory"},{"location":"waveform_transforms/adjust_duration/","title":"AdjustDuration","text":"

Added in v0.30.0

Trim or pad the audio to the specified length/duration in samples or seconds. If the input sound is longer than the target duration, pick a random offset and crop the sound to the target duration. If the input sound is shorter than the target duration, pad the sound so the duration matches the target duration.

This transform can be useful if you need audio with constant length, e.g. as input to a machine learning model. The reason for varying audio clip lengths can be e.g.

  • the nature of the audio dataset (different audio clips have different lengths)
  • data augmentation transforms that change the lengths (e.g. time stretching or convolving with impulse responses without cutting the tail)
"},{"location":"waveform_transforms/adjust_duration/#input-output-example","title":"Input-output example","text":"

Here we input an audio clip and remove a part of the start and the end, so the length of the result matches the specified target length.

Input sound Transformed sound"},{"location":"waveform_transforms/adjust_duration/#usage-examples","title":"Usage examples","text":"Target length in samplesTarget duration in seconds
from audiomentations import AdjustDuration\n\ntransform = AdjustDuration(duration_samples=60000, p=1.0)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
from audiomentations import AdjustDuration\n\ntransform = AdjustDuration(duration_seconds=3.75, p=1.0)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
"},{"location":"waveform_transforms/adjust_duration/#adjustduration-api","title":"AdjustDuration API","text":"duration_samples: int \u2022 range: [0, \u221e) Target duration in number of samples. duration_seconds: float \u2022 range: [0.0, \u221e) Target duration in seconds. padding_mode: str \u2022 choices: \"silence\", \"wrap\", \"reflect\" Default: \"silence\". Padding mode. Only used when audio input is shorter than the target duration. padding_position: str \u2022 choices: \"start\", \"end\" Default: \"end\". The position of the inserted/added padding. Only used when audio input is shorter than the target duration. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/air_absorption/","title":"AirAbsorption","text":"

Added in v0.25.0

A lowpass-like filterbank with variable octave attenuation that simulates attenuation of high frequencies due to air absorption. This transform is parametrized by temperature, humidity, and the distance between audio source and microphone.

This is not a scientifically accurate transform but basically applies a uniform filterbank with attenuations given by:

att = exp(- distance * absorption_coefficient)

where distance is the microphone-source assumed distance in meters and absorption_coefficient is adapted from a lookup table by pyroomacoustics. It can also be seen as a lowpass filter with variable octave attenuation.

Note that since this transform mostly affects high frequencies, it is only suitable for audio with sufficiently high sample rate, like 32 kHz and above.

Note also that this transform only \"simulates\" the dampening of high frequencies, and does not attenuate according to the distance law. Gain augmentation needs to be done separately.

"},{"location":"waveform_transforms/air_absorption/#input-output-example","title":"Input-output example","text":"

Here we input a high-quality speech recording and apply AirAbsorption with an air temperature of 20 degrees Celsius, 70% humidity and a distance of 20 meters. One can see clearly in the spectrogram that the highs, especially above ~13 kHz, are rolled off in the output, but it may require a quiet room and some concentration to hear it clearly in the audio comparison.

Input sound Transformed sound"},{"location":"waveform_transforms/air_absorption/#usage-example","title":"Usage example","text":"
from audiomentations import AirAbsorption\n\ntransform = AirAbsorption(\n    min_distance=10.0,\n    max_distance=50.0,\n    p=1.0,\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=48000)\n
"},{"location":"waveform_transforms/air_absorption/#airabsorption-api","title":"AirAbsorption API","text":"min_temperature: float \u2022 unit: Celsius \u2022 choices: [10.0, 20.0] Default: 10.0. Minimum temperature in Celsius (can take a value of either 10.0 or 20.0) max_temperature: float \u2022 unit: Celsius \u2022 choices: [10.0, 20.0] Default: 20.0. Maximum temperature in Celsius (can take a value of either 10.0 or 20.0) min_humidity: float \u2022 unit: percent \u2022 range: [30.0, 90.0] Default: 30.0. Minimum humidity in percent (between 30.0 and 90.0) max_humidity: float \u2022 unit: percent \u2022 range: [30.0, 90.0] Default: 90.0. Maximum humidity in percent (between 30.0 and 90.0) min_distance: float \u2022 unit: meters Default: 10.0. Minimum microphone-source distance in meters. max_distance: float \u2022 unit: meters Default: 100.0. Maximum microphone-source distance in meters. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/aliasing/","title":"Aliasing","text":"

To be added in v0.35.0

Downsample the audio to a lower sample rate by linear interpolation, without low-pass filtering it first, resulting in aliasing artifacts. You get aliasing artifacts when there is high-frequency audio in the input audio that falls above the Nyquist frequency of the chosen target sample rate. Audio with frequencies above the Nyquist frequency cannot be reproduced accurately and get \"reflected\"/mirrored to other frequencies. The aliasing artifacts \"replace\" the original high frequency signals. The result can be described as coarse and metallic.

After the downsampling, the signal gets upsampled to the original signal again, so the length of the output becomes the same as the length of the input.

For more information, see

  • Sample rate reduction on Wikipedia
  • Intro to downsampling by NTNU, Department of Music, Music Technology. Note: that article describes a slightly different downsampling technique, called sample-and-hold, while audiomentations implements linear interpolation. However, both methods lead to aliasing artifacts.
"},{"location":"waveform_transforms/aliasing/#input-output-example","title":"Input-output example","text":"

Here we target a sample rate of 12000 Hz. Note the vertical mirroring in the spectrogram in the transformed sound.

Input sound Transformed sound"},{"location":"waveform_transforms/aliasing/#usage-example","title":"Usage example","text":"
from audiomentations import Aliasing\n\ntransform = Aliasing(min_sample_rate=8000, max_sample_rate=30000, p=1.0)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=44100)\n
"},{"location":"waveform_transforms/aliasing/#aliasing-api","title":"Aliasing API","text":"min_sample_rate: int \u2022 unit: Hz \u2022 range: [2, \u221e) Minimum target sample rate to downsample to max_sample_rate: int \u2022 unit: Hz \u2022 range: [2, \u221e) Maximum target sample rate to downsample to p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/apply_impulse_response/","title":"ApplyImpulseResponse","text":"

Added in v0.7.0

This transform convolves the audio with a randomly selected (room) impulse response file.

ApplyImpulseResponse is commonly used as a data augmentation technique that adds realistic-sounding reverb to recordings. This can for example make denoisers and speech recognition systems more robust to different acoustic environments and distances between the sound source and the microphone. It could also be used to generate roomy audio examples for the training of dereverberation models.

Convolution with an impulse response is a powerful technique in signal processing that can be employed to emulate the acoustic characteristics of specific environments or devices. This process can transform a dry recording, giving it the sonic signature of being played in a specific location or through a particular device.

What is an impulse response? An impulse response (IR) captures the unique acoustical signature of a space or object. It's essentially a recording of how a specific environment or system responds to an impulse (a short, sharp sound). By convolving an audio signal with an impulse response, we can simulate how that signal would sound in the captured environment.

Note that some impulse responses, especially those captured in larger spaces or from specific equipment, can introduce a noticeable delay when convolved with an audio signal. In some applications, this delay is a desirable property. However, in some other applications, the convolved audio should not have a delay compared to the original audio. If this is the case for you, you can align the audio afterwards with fast-align-audio , for example.

Impulse responses can be created using e.g. http://tulrich.com/recording/ir_capture/

Some datasets of impulse responses are publicly available:

  • EchoThief containing 115 impulse responses acquired in a wide range of locations.
  • The MIT McDermott dataset containing 271 impulse responses acquired in everyday places.

Impulse responses are represented as audio (ideally wav) files in the given ir_path.

Another thing worth checking is that your IR files have the same sample rate as your audio inputs. Why? Because if they have different sample rates, the internal resampling will slow down execution, and because some high frequencies may get lost.

"},{"location":"waveform_transforms/apply_impulse_response/#input-output-example","title":"Input-output example","text":"

Here we make a dry speech recording quite reverberant by convolving it with a room impulse response.

Input sound Transformed sound"},{"location":"waveform_transforms/apply_impulse_response/#usage-example","title":"Usage example","text":"
from audiomentations import ApplyImpulseResponse\n\ntransform = ApplyImpulseResponse(ir_path=\"/path/to/sound_folder\", p=1.0)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=48000)\n
"},{"location":"waveform_transforms/apply_impulse_response/#applyimpulseresponse-api","title":"ApplyImpulseResponse API","text":"ir_path: Union[List[Path], List[str], str, Path] A path or list of paths to audio file(s) and/or folder(s) with audio files. Can be str or Path instance(s). The audio files given here are supposed to be (room) impulse responses. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform. lru_cache_size: int Default: 128. Maximum size of the LRU cache for storing impulse response files in memory. leave_length_unchanged: bool Default: True. When set to True, the tail of the sound (e.g. reverb at the end) will be chopped off so that the length of the output is equal to the length of the input."},{"location":"waveform_transforms/band_pass_filter/","title":"BandPassFilter","text":"

Added in v0.18.0, updated in v0.21.0

Apply band-pass filtering to the input audio. Filter steepness (6/12/18... dB / octave) is parametrized. Can also be set for zero-phase filtering (will result in a 6 dB drop at cutoffs).

"},{"location":"waveform_transforms/band_pass_filter/#input-output-example","title":"Input-output example","text":"

Here we input a high-quality speech recording and apply BandPassFilter with a center frequency of 2500 Hz and a bandwidth fraction of 0.8, which means that the bandwidth in this example is 2000 Hz, so the low frequency cutoff is 1500 Hz and the high frequency cutoff is 3500 Hz. One can see in the spectrogram that the high and the low frequencies are both attenuated in the output. If you listen to the audio example, you might notice that the transformed output almost sounds like a phone call from the time when phone audio was narrowband and mostly contained frequencies between ~300 and ~3400 Hz.

Input sound Transformed sound"},{"location":"waveform_transforms/band_pass_filter/#usage-example","title":"Usage example","text":"
from audiomentations import BandPassFilter\n\ntransform = BandPassFilter(min_center_freq=100.0, max_center_freq=6000, p=1.0)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=48000)\n
"},{"location":"waveform_transforms/band_pass_filter/#bandpassfilter-api","title":"BandPassFilter API","text":"min_center_freq: float \u2022 unit: hertz Default: 200.0. Minimum center frequency in hertz max_center_freq: float \u2022 unit: hertz Default: 4000.0. Maximum center frequency in hertz min_bandwidth_fraction: float \u2022 range: [0.0, 2.0] Default: 0.5. Minimum bandwidth relative to center frequency max_bandwidth_fraction: float \u2022 range: [0.0, 2.0] Default: 1.99. Maximum bandwidth relative to center frequency min_rolloff: float \u2022 unit: Decibels/octave Default: 12. Minimum filter roll-off (in dB/octave). Must be a multiple of 6 max_rolloff: float \u2022 unit: Decibels/octave Default: 24. Maximum filter roll-off (in dB/octave) Must be a multiple of 6 zero_phase: bool Default: False. Whether filtering should be zero phase. When this is set to True it will not affect the phase of the input signal but will sound 3 dB lower at the cutoff frequency compared to the non-zero phase case (6 dB vs. 3 dB). Additionally, it is 2 times slower than in the non-zero phase case. If you absolutely want no phase distortions (e.g. want to augment an audio file with lots of transients, like a drum track), set this to True. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/band_stop_filter/","title":"BandStopFilter","text":"

Added in v0.21.0

Apply band-stop filtering to the input audio. Also known as notch filter or band reject filter. It relates to the frequency mask idea in the SpecAugment paper. Center frequency gets picked in mel space, so it is somewhat aligned with human hearing, which is not linear. Filter steepness (6/12/18... dB / octave) is parametrized. Can also be set for zero-phase filtering (will result in a 6 dB drop at cutoffs).

Applying band-stop filtering as data augmentation during model training can aid in preventing overfitting to specific frequency relationships, helping to make the model robust to diverse audio environments and scenarios, where frequency losses can occur.

"},{"location":"waveform_transforms/band_stop_filter/#input-output-example","title":"Input-output example","text":"

Here we input a speech recording and apply BandStopFilter with a center frequency of 2500 Hz and a bandwidth fraction of 0.8, which means that the bandwidth in this example is 2000 Hz, so the low frequency cutoff is 1500 Hz and the high frequency cutoff is 3500 Hz. One can see in the spectrogram of the transformed sound that the band stop filter has attenuated this frequency range. If you listen to the audio example, you can hear that the timbre is different in the transformed sound than in the original.

Input sound Transformed sound"},{"location":"waveform_transforms/band_stop_filter/#bandstopfilter-api","title":"BandStopFilter API","text":"min_center_freq: float \u2022 unit: hertz Default: 200.0. Minimum center frequency in hertz max_center_freq: float \u2022 unit: hertz Default: 4000.0. Maximum center frequency in hertz min_bandwidth_fraction: float Default: 0.5. Minimum bandwidth relative to center frequency max_bandwidth_fraction: float Default: 1.99. Maximum bandwidth relative to center frequency min_rolloff: float \u2022 unit: Decibels/octave Default: 12. Minimum filter roll-off (in dB/octave). Must be a multiple of 6 max_rolloff: float \u2022 unit: Decibels/octave Default: 24. Maximum filter roll-off (in dB/octave) Must be a multiple of 6 zero_phase: bool Default: False. Whether filtering should be zero phase. When this is set to True it will not affect the phase of the input signal but will sound 3 dB lower at the cutoff frequency compared to the non-zero phase case (6 dB vs. 3 dB). Additionally, it is 2 times slower than in the non-zero phase case. If you absolutely want no phase distortions (e.g. want to augment an audio file with lots of transients, like a drum track), set this to True. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/bit_crush/","title":"BitCrush","text":"

To be added in v0.35.0

Apply a bit crush effect to the audio by reducing the bit depth. In other words, it reduces the number of bits that can be used for representing each audio sample. This adds quantization noise, and affects dynamic range. This transform does not apply dithering.

For more information, see

  • Resolution reduction on Wikipedia
  • Intro to bit reduction by NTNU, Department of Music, Music Technology
"},{"location":"waveform_transforms/bit_crush/#input-output-example","title":"Input-output example","text":"

Here we reduce the bit depth from 16 to 6 bits per sample

Input sound Transformed sound"},{"location":"waveform_transforms/bit_crush/#usage-example","title":"Usage example","text":"
from audiomentations import BitCrush\n\ntransform = BitCrush(min_bit_depth=5, max_bit_depth=14, p=1.0)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
"},{"location":"waveform_transforms/bit_crush/#bitcrush-api","title":"BitCrush API","text":"min_bit_depth: int \u2022 unit: bits \u2022 range: [1, 32] Minimum bit depth the audio will be \"converted\" to max_bit_depth: int \u2022 unit: bits \u2022 range: [1, 32] Maximum bit depth the audio will be \"converted\" to p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/clip/","title":"Clip","text":"

Added in v0.17.0

Clip audio by specified values. e.g. set a_min=-1.0 and a_max=1.0 to ensure that no samples in the audio exceed that extent. This can be relevant for avoiding integer overflow or underflow (which results in unintended wrap distortion that can sound horrible) when exporting to e.g. 16-bit PCM wav.

Another way of ensuring that all values stay between -1.0 and 1.0 is to apply PeakNormalization.

This transform is different from ClippingDistortion in that it takes fixed values for clipping instead of clipping a random percentile of the samples. Arguably, this transform is not very useful for data augmentation. Instead, think of it as a very cheap and harsh limiter (for samples that exceed the allotted extent) that can sometimes be useful at the end of a data augmentation pipeline.

"},{"location":"waveform_transforms/clip/#clip-api","title":"Clip API","text":"a_min: float Default: -1.0. Minimum value for clipping. a_max: float Default: 1.0. Maximum value for clipping. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/clipping_distortion/","title":"ClippingDistortion","text":"

Added in v0.8.0

Distort signal by clipping a random percentage of points

The percentage of points that will be clipped is drawn from a uniform distribution between the two input parameters min_percentile_threshold and max_percentile_threshold. If for instance 30% is drawn, the samples are clipped if they're below the 15th or above the 85th percentile.

"},{"location":"waveform_transforms/clipping_distortion/#clippingdistortion-api","title":"ClippingDistortion API","text":"min_percentile_threshold: int Default: 0. A lower bound on the total percent of samples that will be clipped max_percentile_threshold: int Default: 40. An upper bound on the total percent of samples that will be clipped p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/gain/","title":"Gain","text":"

Added in v0.11.0

Multiply the audio by a random amplitude factor to reduce or increase the volume. This technique can help a model become somewhat invariant to the overall gain of the input audio.

Warning: This transform can return samples outside the [-1, 1] range, which may lead to clipping or wrap distortion, depending on what you do with the audio in a later stage. See also https://en.wikipedia.org/wiki/Clipping_(audio)#Digital_clipping

"},{"location":"waveform_transforms/gain/#gain-api","title":"Gain API","text":"min_gain_in_db: float \u2022 unit: Decibel Deprecated as of v0.31.0. Use min_gain_db instead max_gain_in_db: float \u2022 unit: Decibel Deprecated as of v0.31.0. Use max_gain_db instead min_gain_db: float \u2022 unit: Decibel Default: -12.0. Minimum gain. max_gain_db: float \u2022 unit: Decibel Default: 12.0. Maximum gain. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/gain_transition/","title":"GainTransition","text":"

Added in v0.22.0

Gradually change the volume up or down over a random time span. Also known as fade in and fade out. The fade works on a logarithmic scale, which is natural to human hearing.

The way this works is that it picks two gains: a first gain and a second gain. Then it picks a time range for the transition between those two gains. Note that this transition can start before the audio starts and/or end after the audio ends, so the output audio can start or end in the middle of a transition. The gain starts at the first gain and is held constant until the transition start. Then it transitions to the second gain. Then that gain is held constant until the end of the sound.

"},{"location":"waveform_transforms/gain_transition/#gaintransition-api","title":"GainTransition API","text":"min_gain_in_db: float \u2022 unit: Decibel Deprecated as of v0.31.0. Use min_gain_db instead max_gain_in_db: float \u2022 unit: Decibel Deprecated as of v0.31.0. Use max_gain_db instead min_gain_db: float \u2022 unit: Decibel Default: -24.0. Minimum gain. max_gain_db: float \u2022 unit: Decibel Default: 6.0. Maximum gain. min_duration: Union[float, int] \u2022 unit: see duration_unit Default: 0.2. Minimum length of transition. max_duration: Union[float, int] \u2022 unit: see duration_unit Default: 6.0. Maximum length of transition. duration_unit: str \u2022 choices: \"fraction\", \"samples\", \"seconds\"

Default: \"seconds\". Defines the unit of the value of min_duration and max_duration.

  • \"fraction\": Fraction of the total sound length
  • \"samples\": Number of audio samples
  • \"seconds\": Number of seconds
p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/high_pass_filter/","title":"HighPassFilter","text":"

Added in v0.18.0, updated in v0.21.0

Apply high-pass filtering to the input audio of parametrized filter steepness (6/12/18... dB / octave). Can also be set for zero-phase filtering (will result in a 6 dB drop at cutoff).

"},{"location":"waveform_transforms/high_pass_filter/#highpassfilter-api","title":"HighPassFilter API","text":"min_cutoff_freq: float \u2022 unit: hertz Default: 20.0. Minimum cutoff frequency max_cutoff_freq: float \u2022 unit: hertz Default: 2400.0. Maximum cutoff frequency min_rolloff: float \u2022 unit: Decibels/octave Default: 12. Minimum filter roll-off (in dB/octave). Must be a multiple of 6 max_rolloff: float \u2022 unit: Decibels/octave Default: 24. Maximum filter roll-off (in dB/octave). Must be a multiple of 6 zero_phase: bool Default: False. Whether filtering should be zero phase. When this is set to True it will not affect the phase of the input signal but will sound 3 dB lower at the cutoff frequency compared to the non-zero phase case (6 dB vs. 3 dB). Additionally, it is 2 times slower than in the non-zero phase case. If you absolutely want no phase distortions (e.g. want to augment an audio file with lots of transients, like a drum track), set this to True. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/high_shelf_filter/","title":"HighShelfFilter","text":"

Added in v0.21.0

A high shelf filter is a filter that either boosts (increases amplitude) or cuts (decreases amplitude) frequencies above a certain center frequency. This transform applies a high-shelf filter at a specific center frequency in hertz. The gain at nyquist frequency is controlled by {min,max}_gain_db (note: can be positive or negative!). Filter coefficients are taken from the W3 Audio EQ Cookbook

"},{"location":"waveform_transforms/high_shelf_filter/#highshelffilter-api","title":"HighShelfFilter API","text":"min_center_freq: float \u2022 unit: hertz Default: 300.0. The minimum center frequency of the shelving filter max_center_freq: float \u2022 unit: hertz Default: 7500.0. The maximum center frequency of the shelving filter min_gain_db: float \u2022 unit: Decibel Default: -18.0. The minimum gain at the nyquist frequency max_gain_db: float \u2022 unit: Decibel Default: 18.0. The maximum gain at the nyquist frequency min_q: float \u2022 range: (0.0, 1.0] Default: 0.1. The minimum quality factor Q. The higher the Q, the steeper the transition band will be. max_q: float \u2022 range: (0.0, 1.0] Default: 0.999. The maximum quality factor Q. The higher the Q, the steeper the transition band will be. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/lambda/","title":"Lambda","text":"

Added in v0.26.0

Apply a user-defined transform (callable) to the signal. The inspiration for this transform comes from albumentations' lambda transform. This allows one to have a little more fine-grained control over the operations in the context of a Compose, OneOf or SomeOf.

"},{"location":"waveform_transforms/lambda/#usage-example","title":"Usage example","text":"
import random\n\nfrom audiomentations import Lambda, OneOf, Gain\n\n\ndef gain_only_left_channel(samples, sample_rate):\n    samples[0, :] *= random.uniform(0.8, 1.25)\n    return samples\n\n\ntransform = OneOf(\n    transforms=[Lambda(transform=gain_only_left_channel, p=1.0), Gain(p=1.0)]\n)\n\naugmented_sound = transform(my_stereo_waveform_ndarray, sample_rate=16000)\n
"},{"location":"waveform_transforms/lambda/#lambda-api","title":"Lambda API","text":"transform: Callable A callable to be applied. It should input samples (ndarray), sample_rate (int) and optionally some user-defined keyword arguments. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform. **kwargs Optional extra parameters passed to the callable transform"},{"location":"waveform_transforms/limiter/","title":"Limiter","text":"

Added in v0.26.0

The Limiter, based on cylimiter, is a straightforward audio transform that applies dynamic range compression. It is capable of limiting the audio signal based on certain parameters. Additionally, please note that this transform introduces a slight delay in the signal, equivalent to a fraction of the attack time.

  • The threshold determines the audio level above which the limiter kicks in.
  • The attack time is how quickly the limiter kicks in once the audio signal starts exceeding the threshold.
  • The release time determines how quickly the limiter stops working after the signal drops below the threshold.
"},{"location":"waveform_transforms/limiter/#input-output-example","title":"Input-output example","text":"

In this example we apply the limiter with a threshold that is 10 dB lower than the signal peak

Input sound Transformed sound"},{"location":"waveform_transforms/limiter/#usage-examples","title":"Usage examples","text":"Threshold relative to signal peakAbsolute threshold
from audiomentations import Limiter\n\ntransform = Limiter(\n    min_threshold_db=-16.0,\n    max_threshold_db=-6.0,\n    threshold_mode=\"relative_to_signal_peak\",\n    p=1.0,\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
from audiomentations import Limiter\n\ntransform = Limiter(\n    min_threshold_db=-16.0,\n    max_threshold_db=-6.0,\n    threshold_mode=\"absolute\",\n    p=1.0,\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
"},{"location":"waveform_transforms/limiter/#limiter-api","title":"Limiter API","text":"min_threshold_db: float \u2022 unit: Decibel Default: -24.0. Minimum threshold max_threshold_db: float \u2022 unit: Decibel Default: -2.0. Maximum threshold min_attack: float \u2022 unit: seconds Default: 0.0005. Minimum attack time max_attack: float \u2022 unit: seconds Default: 0.025. Maximum attack time min_release: float \u2022 unit: seconds Default: 0.05. Minimum release time max_release: float \u2022 unit: seconds Default: 0.7. Maximum release time threshold_mode: str \u2022 choices: \"relative_to_signal_peak\", \"absolute\"

Default: relative_to_signal_peak. Specifies the mode for determining the threshold.

  • \"relative_to_signal_peak\" means the threshold is relative to peak of the signal.
  • \"absolute\" means the threshold is relative to 0 dBFS, so it doesn't depend on the peak of the signal.
p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/loudness_normalization/","title":"LoudnessNormalization","text":"

Added in v0.14.0

Apply a constant amount of gain to match a specific loudness (in LUFS). This is an implementation of ITU-R BS.1770-4.

For an explanation on LUFS, see https://en.wikipedia.org/wiki/LUFS

See also the following web pages for more info on audio loudness normalization:

  • https://github.com/csteinmetz1/pyloudnorm
  • https://en.wikipedia.org/wiki/Audio_normalization

Warning: This transform can return samples outside the [-1, 1] range, which may lead to clipping or wrap distortion, depending on what you do with the audio in a later stage. See also https://en.wikipedia.org/wiki/Clipping_(audio)#Digital_clipping

"},{"location":"waveform_transforms/loudness_normalization/#loudnessnormalization-api","title":"LoudnessNormalization API","text":"min_lufs_in_db: float \u2022 unit: LUFS Deprecated as of v0.31.0. Use min_lufs instead max_lufs_in_db: float \u2022 unit: LUFS Deprecated as of v0.31.0. Use max_lufs instead min_lufs: float \u2022 unit: LUFS Default: -31.0. Minimum loudness target max_lufs: float \u2022 unit: LUFS Default: -13.0. Maximum loudness target p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/low_pass_filter/","title":"LowPassFilter","text":"

Added in v0.18.0, updated in v0.21.0

Apply low-pass filtering to the input audio of parametrized filter steepness (6/12/18... dB / octave). Can also be set for zero-phase filtering (will result in a 6 dB drop at cutoff).

"},{"location":"waveform_transforms/low_pass_filter/#lowpassfilter-api","title":"LowPassFilter API","text":"min_cutoff_freq: float \u2022 unit: hertz Default: 150.0. Minimum cutoff frequency max_cutoff_freq: float \u2022 unit: hertz Default: 7500.0. Maximum cutoff frequency min_rolloff: float \u2022 unit: Decibels/octave Default: 12. Minimum filter roll-off (in dB/octave). Must be a multiple of 6 max_rolloff: float \u2022 unit: Decibels/octave Default: 24. Maximum filter roll-off (in dB/octave) Must be a multiple of 6 zero_phase: bool Default: False. Whether filtering should be zero phase. When this is set to True it will not affect the phase of the input signal but will sound 3 dB lower at the cutoff frequency compared to the non-zero phase case (6 dB vs. 3 dB). Additionally, it is 2 times slower than in the non-zero phase case. If you absolutely want no phase distortions (e.g. want to augment an audio file with lots of transients, like a drum track), set this to True. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/low_shelf_filter/","title":"LowShelfFilter","text":"

Added in v0.21.0

A low shelf filter is a filter that either boosts (increases amplitude) or cuts (decreases amplitude) frequencies below a certain center frequency. This transform applies a low-shelf filter at a specific center frequency in hertz. The gain at DC frequency is controlled by {min,max}_gain_db (note: can be positive or negative!). Filter coefficients are taken from the W3 Audio EQ Cookbook

"},{"location":"waveform_transforms/low_shelf_filter/#lowshelffilter-api","title":"LowShelfFilter API","text":"min_center_freq: float \u2022 unit: hertz Default: 50.0. The minimum center frequency of the shelving filter max_center_freq: float \u2022 unit: hertz Default: 4000.0. The maximum center frequency of the shelving filter min_gain_db: float \u2022 unit: Decibel Default: -18.0. The minimum gain at DC (0 Hz) max_gain_db: float \u2022 unit: Decibel Default: 18.0. The maximum gain at DC (0 Hz) min_q: float \u2022 range: (0.0, 1.0] Default: 0.1. The minimum quality factor Q. The higher the Q, the steeper the transition band will be. max_q: float \u2022 range: (0.0, 1.0] Default: 0.999. The maximum quality factor Q. The higher the Q, the steeper the transition band will be. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/mp3_compression/","title":"Mp3Compression","text":"

Added in v0.12.0

Compress the audio using an MP3 encoder to lower the audio quality. This may help machine learning models deal with compressed, low-quality audio.

This transform depends on either lameenc or pydub/ffmpeg.

Note that bitrates below 32 kbps are only supported for low sample rates (up to 24000 Hz).

Note: When using the \"lameenc\" backend, the output may be slightly longer than the input due to the fact that the LAME encoder inserts some silence at the beginning of the audio.

Warning: This transform writes to disk, so it may be slow.

"},{"location":"waveform_transforms/mp3_compression/#mp3compression-api","title":"Mp3Compression API","text":"min_bitrate: int \u2022 unit: kbps \u2022 range: [8, max_bitrate] Default: 8. Minimum bitrate in kbps max_bitrate: int \u2022 unit: kbps \u2022 range: [min_bitrate, 320] Default: 64. Maximum bitrate in kbps backend: str \u2022 choices: \"pydub\", \"lameenc\"

Default: \"pydub\".

  • \"pydub\": May use ffmpeg under the hood. Pro: Seems to avoid introducing latency in the output. Con: Slightly slower than \"lameenc\".
  • \"lameenc\": Pro: With this backend you can set the quality parameter in addition to the bitrate (although this parameter is not exposed in the audiomentations API yet). Con: Seems to introduce some silence at the start of the audio.
p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/normalize/","title":"Normalize","text":"

Added in v0.6.0

Apply a constant amount of gain, so that the highest signal level present in the sound becomes 0 dBFS, i.e. the loudest level allowed if all samples must be between -1 and 1. Also known as peak normalization.

"},{"location":"waveform_transforms/normalize/#normalize-api","title":"Normalize API","text":"p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/padding/","title":"Padding","text":"

Added in v0.23.0

Apply padding to the audio signal - take a fraction of the end or the start of the audio and replace that part with padding. This can be useful for preparing ML models with constant input length for padded inputs.

"},{"location":"waveform_transforms/padding/#padding-api","title":"Padding API","text":"mode: str \u2022 choices: \"silence\", \"wrap\", \"reflect\" Default: \"silence\". Padding mode. min_fraction: float \u2022 range: [0.0, 1.0] Default: 0.01. Minimum fraction of the signal duration to be padded max_fraction: float \u2022 range: [0.0, 1.0] Default: 0.7. Maximum fraction of the signal duration to be padded pad_section: str \u2022 choices: \"start\", \"end\" Default: \"end\". Which part of the signal should be replaced with padding p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/peaking_filter/","title":"PeakingFilter","text":"

Added in v0.21.0

Apply a biquad peaking filter to the input audio. The filter boosts or cuts frequencies around a center frequency.

"},{"location":"waveform_transforms/peaking_filter/#peakingfilter-api","title":"PeakingFilter API","text":"min_center_freq: float \u2022 unit: hertz \u2022 range: [0.0, \u221e) Default: 50.0. The minimum center frequency of the peaking filter max_center_freq: float \u2022 unit: hertz \u2022 range: [0.0, \u221e) Default: 7500.0. The maximum center frequency of the peaking filter min_gain_db: float \u2022 unit: Decibel Default: -24.0. The minimum gain at center frequency max_gain_db: float \u2022 unit: Decibel Default: 24.0. The maximum gain at center frequency min_q: float \u2022 range: [0.0, \u221e) Default: 0.5. The minimum quality factor Q. The higher the Q, the steeper the transition band will be. max_q: float \u2022 range: [0.0, \u221e) Default: 5.0. The maximum quality factor Q. The higher the Q, the steeper the transition band will be. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/pitch_shift/","title":"PitchShift","text":"

Added in v0.4.0

Pitch shift the sound up or down without changing the tempo.

Under the hood this does time stretching (by phase vocoding) followed by resampling. Note that phase vocoding can degrade audio quality by \"smearing\" transient sounds, altering the timbre of harmonic sounds, and distorting pitch modulations. This may result in a loss of sharpness, clarity, or naturalness in the transformed audio.

If you need a better sounding pitch shifting method, consider the following alternatives:

  • signalsmith-stretch
  • Rubber Band library
  • https://github.com/KAIST-MACLab/PyTSMod
  • https://github.com/vinusankars/ESOLA
"},{"location":"waveform_transforms/pitch_shift/#input-output-example","title":"Input-output example","text":"

Here we pitch down a piano recording by 4 semitones:

Input sound Transformed sound"},{"location":"waveform_transforms/pitch_shift/#usage-example","title":"Usage example","text":"
from audiomentations import PitchShift\n\ntransform = PitchShift(\n    min_semitones=-5.0,\n    max_semitones=5.0,\n    p=1.0\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=44100)\n
"},{"location":"waveform_transforms/pitch_shift/#pitchshift-api","title":"PitchShift API","text":"min_semitones: float \u2022 unit: semitones \u2022 range: [-12.0, 12.0] Default: -4.0. Minimum semitones to shift. Negative number means shift down. max_semitones: float \u2022 unit: semitones \u2022 range: [-12.0, 12.0] Default: 4.0. Maximum semitones to shift. Positive number means shift up. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/polarity_inversion/","title":"PolarityInversion","text":"

Added in v0.11.0

Flip the audio samples upside-down, reversing their polarity. In other words, multiply the waveform by -1, so negative values become positive, and vice versa. The result will sound the same compared to the original when played back in isolation. However, when mixed with other audio sources, the result may be different. This waveform inversion technique is sometimes used for audio cancellation or obtaining the difference between two waveforms. However, in the context of audio data augmentation, this transform can be useful when training phase-aware machine learning models.

"},{"location":"waveform_transforms/polarity_inversion/#polarityinversion-api","title":"PolarityInversion API","text":"p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/post_gain/","title":"PostGain","text":"

Added in v0.31.0

Gain up or down the audio after the given transform (or set of transforms) has processed the audio. There are several methods that determine how the audio should be gained. PostGain can be useful for compensating for any gain differences introduced by a (set of) transform(s), or for preventing clipping in the output.

"},{"location":"waveform_transforms/post_gain/#postgain-api","title":"PostGain API","text":"transform: Callable[[NDArray[np.float32], int], NDArray[np.float32]] A callable to be applied. It should input samples (ndarray), sample_rate (int) and optionally some user-defined keyword arguments. method: str \u2022 choices: \"same_rms\", \"same_lufs\" or \"peak_normalize_always\"

This parameter defines the method for choosing the post gain amount.

  • \"same_rms\": The sound gets post-gained so that the RMS (Root Mean Square) of the output matches the RMS of the input.
  • \"same_lufs\": The sound gets post-gained so that the LUFS (Loudness Units relative to Full Scale) of the output matches the LUFS of the input.
  • \"peak_normalize_always\": The sound gets peak normalized (gained up or down so that the absolute value of the most extreme sample in the output is 1.0)
  • \"peak_normalize_if_too_loud\": The sound gets peak normalized if it is too loud (max absolute value greater than 1.0). This option can be useful for avoiding clipping.
"},{"location":"waveform_transforms/repeat_part/","title":"RepeatPart","text":"

Added in v0.32.0

Select a subsection (or \"part\") of the audio and repeat that part a number of times. This can be useful when simulating scenarios where a short audio snippet gets repeated, for example:

  • Repetitions of some musical note or sound in a rhythmical way
  • A person stutters or says the same word (with variations) multiple times in a row
  • A mechanical noise with periodic repetitions
  • A \"skip in the record\" or a \"stuck needle\" effect, reminiscent of vinyl records or CDs when they repeatedly play a short section due to a scratch or other imperfection.
  • Digital audio glitches, such as a buffer underrun in video games, where the current audio frame gets looped continuously due to system overloads or a software crash.

Note that the length of inputs you give it must be compatible with the part duration range and crossfade duration. If you give it an input audio array that is too short, a UserWarning will be raised and no operation is applied to the signal.

"},{"location":"waveform_transforms/repeat_part/#input-output-example","title":"Input-output example","text":"

In this speech example, the audio was transformed with

  • a part duration of approximately 0.4 seconds
  • \"insert\" mode. In this mode, the output becomes longer than the input.
  • a SevenBandParametricEQ part transform. This is why each repeat in the output has a different timbre.

Input sound Transformed sound"},{"location":"waveform_transforms/repeat_part/#usage-examples","title":"Usage examples","text":"Insert modeReplace mode
from audiomentations import RepeatPart\n\ntransform = RepeatPart(mode=\"insert\", p=1.0)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
from audiomentations import RepeatPart\n\ntransform = RepeatPart(mode=\"replace\", p=1.0)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
"},{"location":"waveform_transforms/repeat_part/#repeatpart-api","title":"RepeatPart API","text":"min_repeats: int \u2022 range: [1, max_repeats] Default: 1. Minimum number of times a selected audio segment should be repeated in addition to the original. For instance, if the selected number of repeats is 1, the selected segment will be followed by one repeat. max_repeats: int \u2022 range: [min_repeats, \u221e) Default: 3. Maximum number of times a selected audio segment can be repeated in addition to the original min_part_duration: float \u2022 unit: seconds \u2022 range: [0.00025, max_part_duration] Default: 0.25. Minimum duration (in seconds) of the audio segment that can be selected for repetition. max_part_duration: float \u2022 unit: seconds \u2022 range: [min_part_duration, \u221e) Default: 1.2. Maximum duration (in seconds) of the audio segment that can be selected for repetition. mode: str \u2022 choices: \"insert\", \"replace\"

Default: \"insert\". This parameter has two options:

  • \"insert\": Insert the repeat(s), making the array longer. After the last repeat there will be the last part of the original audio, offset in time compared to the input array.
  • \"replace\": Have the repeats replace (as in overwrite) the original audio. Any remaining part at the end (if not overwritten by repeats) will be left untouched without offset. The length of the output array is the same as the input array.
crossfade_duration: float \u2022 unit: seconds \u2022 range: 0.0 or [0.00025, \u221e) Default: 0.005. Duration for crossfading between repeated parts as well as potentially from the original audio to the repeats and back. The crossfades will be equal-energy or equal-gain depending on the audio and/or the chosen parameters of the transform. The crossfading feature can be used to smooth transitions and avoid abrupt changes, which can lead to impulses/clicks in the audio. If you know what you're doing, and impulses/clicks are desired for your use case, you can disable the crossfading by setting this value to 0.0. part_transform: Optional[Callable[[NDArray[np.float32], int], NDArray[np.float32]]] An optional callable (audiomentations transform) that gets applied individually to each repeat. This can be used to make each repeat slightly different from the previous one. Note that a part_transform that makes the part shorter is only supported if the transformed part is at least two times the crossfade duration. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/resample/","title":"Resample","text":"

Added in v0.8.0

Resample signal using librosa.core.resample

To do downsampling only, set both the minimum and maximum sampling rate lower than the original sampling rate, and vice versa to do upsampling only.

"},{"location":"waveform_transforms/resample/#resample-api","title":"Resample API","text":"min_sample_rate: int \u2022 unit: Hz Default: 8000. Minimum sample rate max_sample_rate: int \u2022 unit: Hz Default: 44100. Maximum sample rate p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/reverse/","title":"Reverse","text":"

Added in v0.18.0

Reverse the audio. Also known as time inversion. Inversion of an audio track along its time axis relates to the random flip of an image, which is an augmentation technique that is widely used in the visual domain. This can be relevant in the context of audio classification. It was successfully applied in the paper AudioCLIP: Extending CLIP to Image, Text and Audio .

"},{"location":"waveform_transforms/reverse/#input-output-example","title":"Input-output example","text":"

In this example, we reverse a speech recording

Input sound Transformed sound"},{"location":"waveform_transforms/reverse/#usage-example","title":"Usage example","text":"
from audiomentations import Reverse\n\ntransform = Reverse(p=1.0)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=44100)\n
"},{"location":"waveform_transforms/reverse/#reverse-api","title":"Reverse API","text":"p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/room_simulator/","title":"RoomSimulator","text":"

Added in v0.23.0

A ShoeBox Room Simulator. Simulates a cuboid of parametrized size and average surface absorption coefficient. It also includes a source and microphones in parametrized locations.

Use it when you want a ton of synthetic room impulse responses with specific configuration characteristics, or simply to quickly add reverb for augmentation purposes.

"},{"location":"waveform_transforms/room_simulator/#roomsimulator-api","title":"RoomSimulator API","text":"min_size_x: float \u2022 unit: meters Default: 3.6. Minimum width (x coordinate) of the room in meters max_size_x: float \u2022 unit: meters Default: 5.6. Maximum width of the room in meters min_size_y: float \u2022 unit: meters Default: 3.6. Minimum depth (y coordinate) of the room in meters max_size_y: float \u2022 unit: meters Default: 3.9. Maximum depth of the room in meters min_size_z: float \u2022 unit: meters Default: 2.4. Minimum height (z coordinate) of the room in meters max_size_z: float \u2022 unit: meters Default: 3.0. Maximum height of the room in meters min_absorption_value: float

Default: 0.075. Minimum absorption coefficient value. When calculation_mode is \"absorption\" it will set the given coefficient value for the surfaces of the room (walls, ceilings, and floor). This coefficient takes values between 0 (fully reflective surface) and 1 (fully absorbing surface).

Example values (may differ!):

Environment Coefficient value Studio with acoustic panels > 0.40 Office / Library ~ 0.15 Factory ~ 0.05 max_absorption_value: float Default: 0.4. Maximum absorption coefficient value. See min_absorption_value for more info. min_target_rt60: float \u2022 unit: seconds

Default: 0.15. Minimum target RT60. RT60 is defined as the measure of the time after the sound source ceases that it takes for the sound pressure level to reduce by 60 dB. When calculation_mode is \"rt60\", it tries to set the absorption value of the surfaces of the room to achieve a target RT60 (in seconds). Note that this parameter changes only the materials (absorption coefficients) of the surfaces, not the dimension of the rooms.

Example values (may differ!):

Environment RT60 Recording studio 0.3 s Office 0.5 s Concert hall 1.5 s max_target_rt60: float \u2022 unit: seconds Default: 0.8. Maximum target RT60. See min_target_rt60 for more info. min_source_x: float \u2022 unit: meters Default: 0.1. Minimum x location of the source max_source_x: float \u2022 unit: meters Default: 3.5. Maximum x location of the source min_source_y: float \u2022 unit: meters Default: 0.1. Minimum y location of the source max_source_y: float \u2022 unit: meters Default: 2.7. Maximum y location of the source min_source_z: float \u2022 unit: meters Default: 1.0. Minimum z location of the source max_source_z: float \u2022 unit: meters Default: 2.1. Maximum z location of the source min_mic_distance: float \u2022 unit: meters Default: 0.15. Minimum distance of the microphone from the source in meters max_mic_distance: float \u2022 unit: meters Default: 0.35. Maximum distance of the microphone from the source in meters min_mic_azimuth: float \u2022 unit: radians Default: -math.pi. Minimum azimuth (angle around z axis) of the microphone relative to the source. max_mic_azimuth: float \u2022 unit: radians Default: math.pi. Maximum azimuth (angle around z axis) of the microphone relative to the source. min_mic_elevation: float \u2022 unit: radians Default: -math.pi. Minimum elevation of the microphone relative to the source, in radians. max_mic_elevation: float \u2022 unit: radians Default: math.pi. Maximum elevation of the microphone relative to the source, in radians. calculation_mode: str \u2022 choices: \"rt60\", \"absorption\" Default: \"absorption\". When set to \"absorption\", it will create the room with surfaces based on min_absorption_value and max_absorption_value. If set to \"rt60\" it will try to assign surface materials that lead to a room impulse response with target rt60 given by min_target_rt60 and max_target_rt60 use_ray_tracing: bool Default: True. Whether to use ray_tracing or not (slower but much more accurate). 
Disable this if you need speed and can tolerate less accurate results. max_order: int \u2022 range: [1, \u221e)

Default: 1. Maximum order of reflections for the Image Source Model. E.g. a value of 1 will only add first order reflections while a value of 12 will add a diffuse reverberation tail.

Warning

Placing this higher than 11-12 will result in a very slow augmentation process when calculation_mode=\"rt60\".

Tip

When using calculation_mode=\"rt60\", keep it around 3-4.

leave_length_unchanged: bool Default: False. When set to True, the tail of the sound (e.g. reverb at the end) will be chopped off so that the length of the output is equal to the length of the input. padding: float \u2022 unit: meters Default: 0.1. Minimum distance in meters between source or mic and the room walls, floor or ceiling. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform. ray_tracing_options: Optional[Dict] Default: None. Options for the ray tracer. See set_ray_tracing here: https://github.com/LCAV/pyroomacoustics/blob/master/pyroomacoustics/room.py"},{"location":"waveform_transforms/seven_band_parametric_eq/","title":"SevenBandParametricEQ","text":"

Added in v0.24.0

Adjust the volume of different frequency bands. This transform is a 7-band parametric equalizer - a combination of one low shelf filter, five peaking filters and one high shelf filter, all with randomized gains, Q values and center frequencies.

Because this transform changes the timbre, but keeps the overall \"class\" of the sound the same (depending on application), it can be used for data augmentation to make ML models more robust to various frequency spectrums. Many things can affect the spectrum, for example:

  • the nature and quality of the sound source
  • room acoustics
  • any objects between the microphone and the sound source
  • microphone type/model
  • the distance between the sound source and the microphone

The seven bands have center frequencies picked in the following ranges (min-max):

  • 42-95 Hz
  • 91-204 Hz
  • 196-441 Hz
  • 421-948 Hz
  • 909-2045 Hz
  • 1957-4404 Hz
  • 4216-9486 Hz
"},{"location":"waveform_transforms/seven_band_parametric_eq/#sevenbandparametriceq-api","title":"SevenBandParametricEQ API","text":"min_gain_db: float \u2022 unit: Decibel Default: -12.0. Minimum number of dB to cut or boost a band max_gain_db: float \u2022 unit: decibel Default: 12.0. Maximum number of dB to cut or boost a band p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/shift/","title":"Shift","text":"

Added in v0.5.0

Shift the samples forwards or backwards, with or without rollover

"},{"location":"waveform_transforms/shift/#shift-api","title":"Shift API","text":"

This only applies to version 0.33.0 and newer. If you are using an older version, you should consider upgrading. Or if you really want to keep using the old version, you can check the \"Old Shift API (<=v0.32.0)\" section below

min_shift: float | int Default: -0.5. Minimum amount of shifting in time. See also shift_unit. max_shift: float | int Default: 0.5. Maximum amount of shifting in time. See also shift_unit. shift_unit: str \u2022 choices: \"fraction\", \"samples\", \"seconds\"

Default: \"fraction\" Defines the unit of the value of min_shift and max_shift.

  • \"fraction\": Fraction of the total sound length
  • \"samples\": Number of audio samples
  • \"seconds\": Number of seconds
rollover: bool Default: True. When set to True, samples that roll beyond the first or last position are re-introduced at the last or first. When set to False, samples that roll beyond the first or last position are discarded. In other words, rollover=False results in an empty space (with zeroes). fade_duration: float \u2022 unit: seconds \u2022 range: 0.0 or [0.00025, \u221e) Default: 0.005. If you set this to a positive number, there will be a fade in and/or out at the \"stitch\" (that was the start or the end of the audio before the shift). This can smooth out an unwanted abrupt change between two consecutive samples (which sounds like a transient/click/pop). This parameter denotes the duration of the fade in seconds. To disable the fading feature, set this parameter to 0.0. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/shift/#old-shift-api-v0320","title":"Old Shift API (<=v0.32.0)","text":"

This only applies to version 0.32.0 and older

min_fraction: float \u2022 range: [-1, 1] Default: -0.5. Minimum fraction of total sound length to shift. max_fraction: float \u2022 range: [-1, 1] Default: 0.5. Maximum fraction of total sound length to shift. rollover: bool Default: True. When set to True, samples that roll beyond the first or last position are re-introduced at the last or first. When set to False, samples that roll beyond the first or last position are discarded. In other words, rollover=False results in an empty space (with zeroes). fade: bool Default: False. When set to True, there will be a short fade in and/or out at the \"stitch\" (that was the start or the end of the audio before the shift). This can smooth out an unwanted abrupt change between two consecutive samples (which sounds like a transient/click/pop). fade_duration: float \u2022 unit: seconds Default: 0.01. If fade=True, then this is the duration of the fade in seconds. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/tanh_distortion/","title":"TanhDistortion","text":"

Added in v0.19.0

Apply tanh (hyperbolic tangent) distortion to the audio. This technique is sometimes used for adding distortion to guitar recordings. The tanh() function can give a rounded \"soft clipping\" kind of distortion, and the distortion amount is proportional to the loudness of the input and the pre-gain. Tanh is symmetric, so the positive and negative parts of the signal are squashed in the same way. This transform can be useful as data augmentation because it adds harmonics. In other words, it changes the timbre of the sound.

See this page for examples: http://gdsp.hf.ntnu.no/lessons/3/17/

"},{"location":"waveform_transforms/tanh_distortion/#input-output-example","title":"Input-output example","text":"

In this example we apply tanh distortion with the \"distortion amount\" (think of it as a knob that goes from 0 to 1) set to 0.25

Input sound Transformed sound"},{"location":"waveform_transforms/tanh_distortion/#usage-example","title":"Usage example","text":"
from audiomentations import TanhDistortion\n\ntransform = TanhDistortion(\n    min_distortion=0.01,\n    max_distortion=0.7,\n    p=1.0\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
"},{"location":"waveform_transforms/tanh_distortion/#tanhdistortion-api","title":"TanhDistortion API","text":"min_distortion: float \u2022 range: [0.0, 1.0] Default: 0.01. Minimum \"amount\" of distortion to apply to the signal. max_distortion: float \u2022 range: [0.0, 1.0] Default: 0.7. Maximum \"amount\" of distortion to apply to the signal. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/time_mask/","title":"TimeMask","text":"

Added in v0.7.0

Make a randomly chosen part of the audio silent. Inspired by https://arxiv.org/pdf/1904.08779.pdf

"},{"location":"waveform_transforms/time_mask/#input-output-example","title":"Input-output example","text":"

Here we silence a part of a speech recording.

Input sound Transformed sound"},{"location":"waveform_transforms/time_mask/#usage-example","title":"Usage example","text":"
from audiomentations import TimeMask\n\ntransform = TimeMask(\n    min_band_part=0.1,\n    max_band_part=0.15,\n    fade=True,\n    p=1.0,\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
"},{"location":"waveform_transforms/time_mask/#timemask-api","title":"TimeMask API","text":"min_band_part: float \u2022 range: [0.0, 1.0] Default: 0.0. Minimum length of the silent part as a fraction of the total sound length. max_band_part: float \u2022 range: [0.0, 1.0] Default: 0.5. Maximum length of the silent part as a fraction of the total sound length. fade: bool Default: False. When set to True, add a linear fade in and fade out of the silent part. This can smooth out an unwanted abrupt change between two consecutive samples (which sounds like a transient/click/pop). p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/time_stretch/","title":"TimeStretch","text":"

Added in v0.2.0

Change the speed or duration of the signal without changing the pitch. This transform employs librosa.effects.time_stretch under the hood to achieve the effect.

Under the hood this uses phase vocoding. Note that phase vocoding can degrade audio quality by \"smearing\" transient sounds, altering the timbre of harmonic sounds, and distorting pitch modulations. This may result in a loss of sharpness, clarity, or naturalness in the transformed audio, especially when the rate is set to an extreme value.

If you need a better sounding time stretch method, consider the following alternatives:

  • atempo in ffmpeg
  • Rubber Band library
  • https://github.com/KAIST-MACLab/PyTSMod
  • https://github.com/vinusankars/ESOLA
"},{"location":"waveform_transforms/time_stretch/#input-output-example","title":"Input-output example","text":"

In this example we speed up a sound by 25%. This corresponds to a rate of 1.25.

Input sound Transformed sound"},{"location":"waveform_transforms/time_stretch/#usage-example","title":"Usage example","text":"
from audiomentations import TimeStretch\n\ntransform = TimeStretch(\n    min_rate=0.8,\n    max_rate=1.25,\n    leave_length_unchanged=True,\n    p=1.0\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
"},{"location":"waveform_transforms/time_stretch/#timestretch-api","title":"TimeStretch API","text":"min_rate: float \u2022 range: [0.1, 10.0] Default: 0.8. Minimum rate of change of total duration of the signal. A rate below 1 means the audio is slowed down. max_rate: float \u2022 range: [0.1, 10.0] Default: 1.25. Maximum rate of change of total duration of the signal. A rate greater than 1 means the audio is sped up. leave_length_unchanged: bool Default: True. The rate changes the duration and affects the samples. This flag is used to keep the total length of the generated output the same as that of the input signal. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."},{"location":"waveform_transforms/trim/","title":"Trim","text":"

Added in v0.7.0

Trim leading and trailing silence from an audio signal using librosa.effects.trim. Anything below the threshold (in decibels, relative to the reference) defined by the top_db parameter is considered silence.

"},{"location":"waveform_transforms/trim/#input-output-example","title":"Input-output example","text":"

In this example we remove silence from the start and end, using the default top_db parameter value

Input sound Transformed sound"},{"location":"waveform_transforms/trim/#usage-example","title":"Usage example","text":"
from audiomentations import Trim\n\ntransform = Trim(\n    top_db=30.0,\n    p=1.0\n)\n\naugmented_sound = transform(my_waveform_ndarray, sample_rate=16000)\n
"},{"location":"waveform_transforms/trim/#trim-api","title":"Trim API","text":"top_db: float \u2022 unit: Decibel Default: 30.0. The threshold value (in decibels) below which to consider silence and trim. p: float \u2022 range: [0.0, 1.0] Default: 0.5. The probability of applying this transform."}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 00000000..aabc46f2 --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,238 @@ + + + + https://iver56.github.io/audiomentations/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/alternatives/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/changelog/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/spectrogram_transforms/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/guides/cpu_vs_gpu/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/guides/multichannel_audio_array_shapes/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/guides/transform_parameters/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/add_background_noise/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/add_color_noise/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/add_gaussian_noise/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/add_gaussian_snr/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/add_short_noises/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/adjust_duration/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/air_absorption/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/aliasing/ + 2024-03-15 + daily + + + 
https://iver56.github.io/audiomentations/waveform_transforms/apply_impulse_response/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/band_pass_filter/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/band_stop_filter/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/bit_crush/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/clip/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/clipping_distortion/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/gain/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/gain_transition/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/high_pass_filter/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/high_shelf_filter/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/lambda/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/limiter/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/loudness_normalization/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/low_pass_filter/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/low_shelf_filter/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/mp3_compression/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/normalize/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/padding/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/peaking_filter/ + 2024-03-15 + daily + + + 
https://iver56.github.io/audiomentations/waveform_transforms/pitch_shift/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/polarity_inversion/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/post_gain/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/repeat_part/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/resample/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/reverse/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/room_simulator/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/seven_band_parametric_eq/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/shift/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/tanh_distortion/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/time_mask/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/time_stretch/ + 2024-03-15 + daily + + + https://iver56.github.io/audiomentations/waveform_transforms/trim/ + 2024-03-15 + daily + + \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz new file mode 100644 index 00000000..648a8135 Binary files /dev/null and b/sitemap.xml.gz differ diff --git a/spectrogram_transforms/index.html b/spectrogram_transforms/index.html new file mode 100644 index 00000000..fe6f8b74 --- /dev/null +++ b/spectrogram_transforms/index.html @@ -0,0 +1,1027 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Spectrogram transforms - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

audiomentations is in a very early (read: not very useful yet) stage when it comes to spectrogram transforms. Consider applying waveform transforms before converting your waveforms to spectrograms, or check out alternative libraries

+

SpecChannelShuffle

+

Added in v0.13.0

+

Shuffle the channels of a multichannel spectrogram. This can help combat positional bias.

+

SpecFrequencyMask

+

Added in v0.13.0

+

Mask a set of frequencies in a spectrogram, à la Google AI SpecAugment. This type of data +augmentation has proved to make speech recognition models more robust.

+

The masked frequencies can be replaced with either the mean of the original values or a +given constant (e.g. zero).

+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/AddBackgroundNoise.webp b/waveform_transforms/AddBackgroundNoise.webp new file mode 100644 index 00000000..d46db812 Binary files /dev/null and b/waveform_transforms/AddBackgroundNoise.webp differ diff --git a/waveform_transforms/AddBackgroundNoise_input.flac b/waveform_transforms/AddBackgroundNoise_input.flac new file mode 100644 index 00000000..89b6734f Binary files /dev/null and b/waveform_transforms/AddBackgroundNoise_input.flac differ diff --git a/waveform_transforms/AddBackgroundNoise_transformed.flac b/waveform_transforms/AddBackgroundNoise_transformed.flac new file mode 100644 index 00000000..4695ba34 Binary files /dev/null and b/waveform_transforms/AddBackgroundNoise_transformed.flac differ diff --git a/waveform_transforms/AddGaussianNoise.webp b/waveform_transforms/AddGaussianNoise.webp new file mode 100644 index 00000000..e2aa48df Binary files /dev/null and b/waveform_transforms/AddGaussianNoise.webp differ diff --git a/waveform_transforms/AddGaussianNoise_input.flac b/waveform_transforms/AddGaussianNoise_input.flac new file mode 100644 index 00000000..89b6734f Binary files /dev/null and b/waveform_transforms/AddGaussianNoise_input.flac differ diff --git a/waveform_transforms/AddGaussianNoise_transformed.flac b/waveform_transforms/AddGaussianNoise_transformed.flac new file mode 100644 index 00000000..e6581b9f Binary files /dev/null and b/waveform_transforms/AddGaussianNoise_transformed.flac differ diff --git a/waveform_transforms/AddGaussianSNR.webp b/waveform_transforms/AddGaussianSNR.webp new file mode 100644 index 00000000..7beddeb7 Binary files /dev/null and b/waveform_transforms/AddGaussianSNR.webp differ diff --git a/waveform_transforms/AddGaussianSNR_input.flac b/waveform_transforms/AddGaussianSNR_input.flac new file mode 100644 index 00000000..89b6734f Binary files /dev/null and b/waveform_transforms/AddGaussianSNR_input.flac differ diff --git 
a/waveform_transforms/AddGaussianSNR_transformed.flac b/waveform_transforms/AddGaussianSNR_transformed.flac new file mode 100644 index 00000000..1f1dd76d Binary files /dev/null and b/waveform_transforms/AddGaussianSNR_transformed.flac differ diff --git a/waveform_transforms/AddShortNoises.webp b/waveform_transforms/AddShortNoises.webp new file mode 100644 index 00000000..a3d52a9d Binary files /dev/null and b/waveform_transforms/AddShortNoises.webp differ diff --git a/waveform_transforms/AddShortNoises_input.flac b/waveform_transforms/AddShortNoises_input.flac new file mode 100644 index 00000000..9a274e00 Binary files /dev/null and b/waveform_transforms/AddShortNoises_input.flac differ diff --git a/waveform_transforms/AddShortNoises_transformed.flac b/waveform_transforms/AddShortNoises_transformed.flac new file mode 100644 index 00000000..55830941 Binary files /dev/null and b/waveform_transforms/AddShortNoises_transformed.flac differ diff --git a/waveform_transforms/AdjustDuration.webp b/waveform_transforms/AdjustDuration.webp new file mode 100644 index 00000000..65b2e955 Binary files /dev/null and b/waveform_transforms/AdjustDuration.webp differ diff --git a/waveform_transforms/AdjustDuration_input.flac b/waveform_transforms/AdjustDuration_input.flac new file mode 100644 index 00000000..9a274e00 Binary files /dev/null and b/waveform_transforms/AdjustDuration_input.flac differ diff --git a/waveform_transforms/AdjustDuration_transformed.flac b/waveform_transforms/AdjustDuration_transformed.flac new file mode 100644 index 00000000..32de3df7 Binary files /dev/null and b/waveform_transforms/AdjustDuration_transformed.flac differ diff --git a/waveform_transforms/AirAbsorption.webp b/waveform_transforms/AirAbsorption.webp new file mode 100644 index 00000000..84baaa39 Binary files /dev/null and b/waveform_transforms/AirAbsorption.webp differ diff --git a/waveform_transforms/AirAbsorption_input.flac b/waveform_transforms/AirAbsorption_input.flac new file mode 100644 index 
00000000..0390a382 Binary files /dev/null and b/waveform_transforms/AirAbsorption_input.flac differ diff --git a/waveform_transforms/AirAbsorption_transformed.flac b/waveform_transforms/AirAbsorption_transformed.flac new file mode 100644 index 00000000..2f5df076 Binary files /dev/null and b/waveform_transforms/AirAbsorption_transformed.flac differ diff --git a/waveform_transforms/Aliasing.webp b/waveform_transforms/Aliasing.webp new file mode 100644 index 00000000..21d59b4e Binary files /dev/null and b/waveform_transforms/Aliasing.webp differ diff --git a/waveform_transforms/Aliasing_input.flac b/waveform_transforms/Aliasing_input.flac new file mode 100644 index 00000000..0390a382 Binary files /dev/null and b/waveform_transforms/Aliasing_input.flac differ diff --git a/waveform_transforms/Aliasing_transformed.flac b/waveform_transforms/Aliasing_transformed.flac new file mode 100644 index 00000000..4f19c1dc Binary files /dev/null and b/waveform_transforms/Aliasing_transformed.flac differ diff --git a/waveform_transforms/ApplyImpulseResponse.webp b/waveform_transforms/ApplyImpulseResponse.webp new file mode 100644 index 00000000..44201c95 Binary files /dev/null and b/waveform_transforms/ApplyImpulseResponse.webp differ diff --git a/waveform_transforms/ApplyImpulseResponse_input.flac b/waveform_transforms/ApplyImpulseResponse_input.flac new file mode 100644 index 00000000..0390a382 Binary files /dev/null and b/waveform_transforms/ApplyImpulseResponse_input.flac differ diff --git a/waveform_transforms/ApplyImpulseResponse_transformed.flac b/waveform_transforms/ApplyImpulseResponse_transformed.flac new file mode 100644 index 00000000..060dff3d Binary files /dev/null and b/waveform_transforms/ApplyImpulseResponse_transformed.flac differ diff --git a/waveform_transforms/BandPassFilter.webp b/waveform_transforms/BandPassFilter.webp new file mode 100644 index 00000000..eed1da8a Binary files /dev/null and b/waveform_transforms/BandPassFilter.webp differ diff --git 
a/waveform_transforms/BandPassFilter_input.flac b/waveform_transforms/BandPassFilter_input.flac new file mode 100644 index 00000000..0390a382 Binary files /dev/null and b/waveform_transforms/BandPassFilter_input.flac differ diff --git a/waveform_transforms/BandPassFilter_transformed.flac b/waveform_transforms/BandPassFilter_transformed.flac new file mode 100644 index 00000000..e3a9c829 Binary files /dev/null and b/waveform_transforms/BandPassFilter_transformed.flac differ diff --git a/waveform_transforms/BandStopFilter.webp b/waveform_transforms/BandStopFilter.webp new file mode 100644 index 00000000..608769c4 Binary files /dev/null and b/waveform_transforms/BandStopFilter.webp differ diff --git a/waveform_transforms/BandStopFilter_input.flac b/waveform_transforms/BandStopFilter_input.flac new file mode 100644 index 00000000..0390a382 Binary files /dev/null and b/waveform_transforms/BandStopFilter_input.flac differ diff --git a/waveform_transforms/BandStopFilter_transformed.flac b/waveform_transforms/BandStopFilter_transformed.flac new file mode 100644 index 00000000..ccd3ae37 Binary files /dev/null and b/waveform_transforms/BandStopFilter_transformed.flac differ diff --git a/waveform_transforms/BitCrush.webp b/waveform_transforms/BitCrush.webp new file mode 100644 index 00000000..f7a9e64d Binary files /dev/null and b/waveform_transforms/BitCrush.webp differ diff --git a/waveform_transforms/BitCrush_input.flac b/waveform_transforms/BitCrush_input.flac new file mode 100644 index 00000000..0390a382 Binary files /dev/null and b/waveform_transforms/BitCrush_input.flac differ diff --git a/waveform_transforms/BitCrush_transformed.flac b/waveform_transforms/BitCrush_transformed.flac new file mode 100644 index 00000000..8b4b4bef Binary files /dev/null and b/waveform_transforms/BitCrush_transformed.flac differ diff --git a/waveform_transforms/Limiter.webp b/waveform_transforms/Limiter.webp new file mode 100644 index 00000000..5926c91d Binary files /dev/null and 
b/waveform_transforms/Limiter.webp differ diff --git a/waveform_transforms/Limiter_input.flac b/waveform_transforms/Limiter_input.flac new file mode 100644 index 00000000..89b6734f Binary files /dev/null and b/waveform_transforms/Limiter_input.flac differ diff --git a/waveform_transforms/Limiter_transformed.flac b/waveform_transforms/Limiter_transformed.flac new file mode 100644 index 00000000..b1a96ff1 Binary files /dev/null and b/waveform_transforms/Limiter_transformed.flac differ diff --git a/waveform_transforms/PitchShift.webp b/waveform_transforms/PitchShift.webp new file mode 100644 index 00000000..1bca168b Binary files /dev/null and b/waveform_transforms/PitchShift.webp differ diff --git a/waveform_transforms/PitchShift_input.flac b/waveform_transforms/PitchShift_input.flac new file mode 100644 index 00000000..55afcd91 Binary files /dev/null and b/waveform_transforms/PitchShift_input.flac differ diff --git a/waveform_transforms/PitchShift_transformed.flac b/waveform_transforms/PitchShift_transformed.flac new file mode 100644 index 00000000..04f80139 Binary files /dev/null and b/waveform_transforms/PitchShift_transformed.flac differ diff --git a/waveform_transforms/RepeatPart.webp b/waveform_transforms/RepeatPart.webp new file mode 100644 index 00000000..7098d459 Binary files /dev/null and b/waveform_transforms/RepeatPart.webp differ diff --git a/waveform_transforms/RepeatPart_input.flac b/waveform_transforms/RepeatPart_input.flac new file mode 100644 index 00000000..0390a382 Binary files /dev/null and b/waveform_transforms/RepeatPart_input.flac differ diff --git a/waveform_transforms/RepeatPart_transformed.flac b/waveform_transforms/RepeatPart_transformed.flac new file mode 100644 index 00000000..863e88fd Binary files /dev/null and b/waveform_transforms/RepeatPart_transformed.flac differ diff --git a/waveform_transforms/Reverse.webp b/waveform_transforms/Reverse.webp new file mode 100644 index 00000000..3d0541fb Binary files /dev/null and 
b/waveform_transforms/Reverse.webp differ diff --git a/waveform_transforms/Reverse_input.flac b/waveform_transforms/Reverse_input.flac new file mode 100644 index 00000000..0390a382 Binary files /dev/null and b/waveform_transforms/Reverse_input.flac differ diff --git a/waveform_transforms/Reverse_transformed.flac b/waveform_transforms/Reverse_transformed.flac new file mode 100644 index 00000000..9468a88f Binary files /dev/null and b/waveform_transforms/Reverse_transformed.flac differ diff --git a/waveform_transforms/TanhDistortion.webp b/waveform_transforms/TanhDistortion.webp new file mode 100644 index 00000000..bd726886 Binary files /dev/null and b/waveform_transforms/TanhDistortion.webp differ diff --git a/waveform_transforms/TanhDistortion_input.flac b/waveform_transforms/TanhDistortion_input.flac new file mode 100644 index 00000000..c9ac9fc0 Binary files /dev/null and b/waveform_transforms/TanhDistortion_input.flac differ diff --git a/waveform_transforms/TanhDistortion_transformed.flac b/waveform_transforms/TanhDistortion_transformed.flac new file mode 100644 index 00000000..c757f116 Binary files /dev/null and b/waveform_transforms/TanhDistortion_transformed.flac differ diff --git a/waveform_transforms/TimeMask.webp b/waveform_transforms/TimeMask.webp new file mode 100644 index 00000000..5a1d9487 Binary files /dev/null and b/waveform_transforms/TimeMask.webp differ diff --git a/waveform_transforms/TimeMask_input.flac b/waveform_transforms/TimeMask_input.flac new file mode 100644 index 00000000..89b6734f Binary files /dev/null and b/waveform_transforms/TimeMask_input.flac differ diff --git a/waveform_transforms/TimeMask_transformed.flac b/waveform_transforms/TimeMask_transformed.flac new file mode 100644 index 00000000..a04afb97 Binary files /dev/null and b/waveform_transforms/TimeMask_transformed.flac differ diff --git a/waveform_transforms/TimeStretch.webp b/waveform_transforms/TimeStretch.webp new file mode 100644 index 00000000..9f9a4b20 Binary files 
/dev/null and b/waveform_transforms/TimeStretch.webp differ diff --git a/waveform_transforms/TimeStretch_input.flac b/waveform_transforms/TimeStretch_input.flac new file mode 100644 index 00000000..c9ac9fc0 Binary files /dev/null and b/waveform_transforms/TimeStretch_input.flac differ diff --git a/waveform_transforms/TimeStretch_transformed.flac b/waveform_transforms/TimeStretch_transformed.flac new file mode 100644 index 00000000..10b01856 Binary files /dev/null and b/waveform_transforms/TimeStretch_transformed.flac differ diff --git a/waveform_transforms/Trim.webp b/waveform_transforms/Trim.webp new file mode 100644 index 00000000..de909d57 Binary files /dev/null and b/waveform_transforms/Trim.webp differ diff --git a/waveform_transforms/Trim_input.flac b/waveform_transforms/Trim_input.flac new file mode 100644 index 00000000..89b6734f Binary files /dev/null and b/waveform_transforms/Trim_input.flac differ diff --git a/waveform_transforms/Trim_transformed.flac b/waveform_transforms/Trim_transformed.flac new file mode 100644 index 00000000..ce887132 Binary files /dev/null and b/waveform_transforms/Trim_transformed.flac differ diff --git a/waveform_transforms/add_background_noise/index.html b/waveform_transforms/add_background_noise/index.html new file mode 100644 index 00000000..154d47e1 --- /dev/null +++ b/waveform_transforms/add_background_noise/index.html @@ -0,0 +1,1204 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + AddBackgroundNoise - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

AddBackgroundNoise

+

Added in v0.9.0

+

Mix in another sound, e.g. a background noise. Useful if your original sound is clean and +you want to simulate an environment where background noise is present.

+

Can also be used for mixup when training +classification/annotation models.

+

A path to a file/folder with sound(s), or a list of file/folder paths, must be +specified. These sounds should ideally be at least as long as the input sounds to be +transformed. Otherwise, the background sound will be repeated, which may sound unnatural.

+

Note that in the default case (noise_rms="relative") the gain of the added noise is +relative to the amount of signal in the input. This implies that if the input is +completely silent, no noise will be added.

+

Optionally, the added noise sound can be transformed (with noise_transform) before it gets mixed in.

+

Here are some examples of datasets that can be downloaded and used as background noise:

+ +

Input-output example

+

Here we add some music to a speech recording, targeting a signal-to-noise ratio (SNR) of +5 decibels (dB), which means that the speech (signal) is 5 dB louder than the music (noise).

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage examples

+
+
+
+
from audiomentations import AddBackgroundNoise, PolarityInversion
+
+transform = AddBackgroundNoise(
+    sounds_path="/path/to/folder_with_sound_files",
+    min_snr_db=3.0,
+    max_snr_db=30.0,
+    noise_transform=PolarityInversion(),
+    p=1.0
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+
+
+
from audiomentations import AddBackgroundNoise, PolarityInversion
+
+transform = AddBackgroundNoise(
+    sounds_path="/path/to/folder_with_sound_files",
+    noise_rms="absolute",
+    min_absolute_rms_db=-45.0,
+    max_absolute_rms_db=-15.0,
+    noise_transform=PolarityInversion(),
+    p=1.0
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+
+
+
+

AddBackgroundNoise API

+
+
sounds_path: Union[List[Path], List[str], Path, str]
+
A path or list of paths to audio file(s) and/or folder(s) +with audio files. Can be str or Path instance(s). The audio files given here are +supposed to be background noises.
+
min_snr_db: float • unit: Decibel
+
Default: 3.0. Minimum signal-to-noise ratio in dB. Is only +used if noise_rms is set to "relative"
+
max_snr_db: float • unit: Decibel
+
Default: 30.0. Maximum signal-to-noise ratio in dB. Is +only used if noise_rms is set to "relative"
+
min_snr_in_db: float • unit: Decibel
+
⚠ Deprecated as of v0.31.0. Use min_snr_db instead
+
max_snr_in_db: float • unit: Decibel
+
⚠ Deprecated as of v0.31.0. Use max_snr_db instead
+
noise_rms: str • choices: "absolute", "relative"
+
Default: "relative". Defines how the background noise will +be added to the audio input. If the chosen option is "relative", the root mean +square (RMS) of the added noise will be proportional to the RMS of the input sound. +If the chosen option is "absolute", the background noise will have an RMS +independent of the RMS of the input audio file.
+
min_absolute_rms_db: float • unit: Decibel
+
Default: -45.0. Is only used if noise_rms is set to +"absolute". It is the minimum RMS value in dB that the added noise can take. The +lower the RMS is, the quieter the added sound will be.
+
max_absolute_rms_db: float • unit: Decibel
+
Default: -15.0. Is only used if noise_rms is set to +"absolute". It is the maximum RMS value in dB that the added noise can take. Note +that this value can not exceed 0.
+
min_absolute_rms_in_db: float • unit: Decibel
+
⚠ Deprecated as of v0.31.0. Use min_absolute_rms_db instead
+
max_absolute_rms_in_db: float • unit: Decibel
+
⚠ Deprecated as of v0.31.0. Use max_absolute_rms_db instead
+
noise_transform: Optional[Callable[[NDArray[np.float32], int], NDArray[np.float32]]]
+
Default: None. A callable waveform transform (or +composition of transforms) that gets applied to the noise before it gets mixed in. +The callable is expected to accept an audio waveform (numpy array) and a sample rate (int) as input.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
lru_cache_size: int
+
Default: 2. Maximum size of the LRU cache for storing noise files in memory
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/add_color_noise/index.html b/waveform_transforms/add_color_noise/index.html new file mode 100644 index 00000000..a7794f4d --- /dev/null +++ b/waveform_transforms/add_color_noise/index.html @@ -0,0 +1,1128 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + AddColorNoise - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

AddColorNoise

+

To be added in v0.35.0

+

Mix in noise with color, optionally weighted by an A-weighting curve. When +f_decay=0, this is equivalent to AddGaussianNoise. Otherwise, see: Colors of Noise .

+

AddColorNoise API

+
+
min_snr_db: float • unit: Decibel
+
Default: 5.0. Minimum signal-to-noise ratio in dB. A lower +number means more noise.
+
max_snr_db: float • unit: Decibel
+
Default: 40.0. Maximum signal-to-noise ratio in dB. A +greater number means less noise.
+
min_f_decay: float • unit: Decibels/octave
+
Default: -6.0. Minimum noise decay in dB per octave.
+
max_f_decay: float • unit: Decibels/octave
+
Default: 6.0. Maximum noise decay in dB per octave.
+
+

Those values can be chosen from the following table:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Colourf_decay (db/octave)
pink-3.01
brown/brownian-6.02
red-6.02
blue3.01
azure3.01
violet6.02
white0.0
+

See Colors of noise on Wikipedia about those values.

+
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
p_apply_a_weighting: float • range: [0.0, 1.0]
+
Default: 0.0. The probability of additionally weighting the transform using an A-weighting curve.
+
n_fft: int
+
Default: 128. The number of points at which the decay curve is computed (for coloring white noise).
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/add_gaussian_noise/index.html b/waveform_transforms/add_gaussian_noise/index.html new file mode 100644 index 00000000..038a76a1 --- /dev/null +++ b/waveform_transforms/add_gaussian_noise/index.html @@ -0,0 +1,1131 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + AddGaussianNoise - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

AddGaussianNoise

+

Added in v0.1.0

+

Add gaussian noise to the samples

+

Input-output example

+

Here we add some gaussian noise (with amplitude 0.01) to a speech recording.

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage example

+
from audiomentations import AddGaussianNoise
+
+transform = AddGaussianNoise(
+    min_amplitude=0.001,
+    max_amplitude=0.015,
+    p=1.0
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+

AddGaussianNoise API

+
+
min_amplitude: float • unit: linear amplitude
+
Default: 0.001. Minimum noise amplification factor.
+
max_amplitude: float • unit: linear amplitude
+
Default: 0.015. Maximum noise amplification factor.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/add_gaussian_snr/index.html b/waveform_transforms/add_gaussian_snr/index.html new file mode 100644 index 00000000..9af27503 --- /dev/null +++ b/waveform_transforms/add_gaussian_snr/index.html @@ -0,0 +1,1152 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + AddGaussianSNR - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

AddGaussianSNR

+

Added in v0.7.0

+

The AddGaussianSNR transform injects Gaussian noise into an audio signal. It applies +a Signal-to-Noise Ratio (SNR) that is chosen randomly from a uniform distribution on the +decibel scale. This choice is consistent with the nature of human hearing, which is +logarithmic rather than linear.

+

SNR is a common measure used in science and engineering to compare the level of a +desired signal to the level of noise. In the context of audio, the signal is the +meaningful sound that you're interested in, like a person's voice, music, or other +audio content, while the noise is unwanted sound that can interfere with the signal.

+

The SNR quantifies the ratio of the power of the signal to the power of the noise. The +higher the SNR, the less the noise is present in relation to the signal.

+

Gaussian noise, a kind of white noise, is a type of statistical noise where the +amplitude of the noise signal follows a Gaussian distribution. This means that most of +the samples are close to the mean (zero), and fewer of them are farther away. It's +called Gaussian noise due to its characteristic bell-shaped Gaussian distribution.

+

Gaussian noise is similar to the sound of a radio or TV tuned to a nonexistent station: +a kind of constant, uniform hiss or static.

+

Input-output example

+

Here we add some gaussian noise (with SNR = 16 dB) to a speech recording.

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage example

+
from audiomentations import AddGaussianSNR
+
+transform = AddGaussianSNR(
+    min_snr_db=5.0,
+    max_snr_db=40.0,
+    p=1.0
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+

AddGaussianSNR API

+
+
min_snr_db: float • unit: Decibel
+
Default: 5.0. Minimum signal-to-noise ratio in dB. A lower +number means more noise.
+
max_snr_db: float • unit: Decibel
+
Default: 40.0. Maximum signal-to-noise ratio in dB. A +greater number means less noise.
+
min_snr_in_db: float • unit: Decibel
+
⚠ Deprecated as of v0.31.0. Use min_snr_db instead
+
max_snr_in_db: float • unit: Decibel
+
⚠ Deprecated as of v0.31.0. Use max_snr_db instead
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/add_short_noises/index.html b/waveform_transforms/add_short_noises/index.html new file mode 100644 index 00000000..f135493c --- /dev/null +++ b/waveform_transforms/add_short_noises/index.html @@ -0,0 +1,1257 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + AddShortNoises - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

AddShortNoises

+

Added in v0.9.0

+

Mix in various (bursts of overlapping) sounds with random pauses between. Useful if your +original sound is clean and you want to simulate an environment where short noises sometimes +occur.

+

A folder of (noise) sounds to be mixed in must be specified.

+

Input-output example

+

Here we add some short noise sounds to a voice recording.

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage examples

+
+
+
+
from audiomentations import AddShortNoises, PolarityInversion
+
+transform = AddShortNoises(
+    sounds_path="/path/to/folder_with_sound_files",
+    min_snr_db=3.0,
+    max_snr_db=30.0,
+    noise_rms="relative_to_whole_input",
+    min_time_between_sounds=2.0,
+    max_time_between_sounds=8.0,
+    noise_transform=PolarityInversion(),
+    p=1.0
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+
+
+
from audiomentations import AddShortNoises, PolarityInversion
+
+transform = AddShortNoises(
+    sounds_path="/path/to/folder_with_sound_files",
+    min_absolute_noise_rms_db=-50.0,
+    max_absolute_noise_rms_db=-20.0,        
+    noise_rms="absolute",
+    min_time_between_sounds=2.0,
+    max_time_between_sounds=8.0,
+    noise_transform=PolarityInversion(),
+    p=1.0
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+
+
+
+

AddShortNoises API

+
+
sounds_path: Union[List[Path], List[str], Path, str]
+
A path or list of paths to audio file(s) and/or folder(s) +with audio files. Can be str or Path instance(s). The audio files given here are +supposed to be (short) noises.
+
min_snr_in_db: float • unit: Decibel
+
⚠ Deprecated as of v0.31.0. Use min_snr_db instead
+
max_snr_in_db: float • unit: Decibel
+
⚠ Deprecated as of v0.31.0. Use max_snr_db instead
+
min_snr_db: float • unit: Decibel
+
Default: -6.0. Minimum signal-to-noise ratio in dB. A lower +value means the added sounds/noises will be louder. This gets ignored if noise_rms +is set to "absolute".
+
max_snr_db: float • unit: Decibel
+
Default: 18.0. Maximum signal-to-noise ratio in dB. A +lower value means the added sounds/noises will be louder. This gets ignored if +noise_rms is set to "absolute".
+
min_time_between_sounds: float • unit: seconds
+
Default: 2.0. Minimum pause time (in seconds) between the +added sounds/noises
+
max_time_between_sounds: float • unit: seconds
+
Default: 8.0. Maximum pause time (in seconds) between the +added sounds/noises
+
noise_rms: str • choices: "absolute", "relative", "relative_to_whole_input"
+
+

Default: "relative" (<=v0.27), but will be changed to +"relative_to_whole_input" in a future version.

+

This parameter defines how the noises will be added to the audio input.

+
    +
  • "relative": the RMS value of the added noise will be proportional to the RMS value of + the input sound calculated only for the region where the noise is added.
  • +
  • "absolute": the added noises will have an RMS independent of the RMS of the input audio + file.
  • +
  • "relative_to_whole_input": the RMS of the added noises will be + proportional to the RMS of the whole input sound.
  • +
+
+
min_absolute_noise_rms_db: float • unit: Decibel
+
Default: -50.0. Is only used if noise_rms is set to +"absolute". It is the minimum RMS value in dB that the added noise can take. The +lower the RMS is, the quieter the added sound will be.
+
max_absolute_noise_rms_db: float • unit: Decibel
+
Default: -20.0. Is only used if noise_rms is set to +"absolute". It is the maximum RMS value in dB that the added noise can take. Note +that this value can not exceed 0.
+
add_all_noises_with_same_level: bool
+
Default: False. Whether to add all the short noises +(within one audio snippet) with the same SNR. If noise_rms is set to "absolute", +the RMS is used instead of SNR. The target SNR (or RMS) will change every time the +parameters of the transform are randomized.
+
include_silence_in_noise_rms_estimation: bool
+
Default: True. It chooses how the RMS of +the noises to be added will be calculated. If this option is set to False, the silence +in the noise files will be disregarded in the RMS calculation. It is useful for +non-stationary noises where silent periods occur.
+
burst_probability: float
+
Default: 0.22. For every noise that gets added, there +is a probability of adding an extra burst noise that overlaps with the noise. This +parameter controls that probability. min_pause_factor_during_burst and +max_pause_factor_during_burst control the amount of overlap.
+
min_pause_factor_during_burst: float
+
Default: 0.1. Min value of how far into the current sound (as +fraction) the burst sound should start playing. The value must be greater than 0.
+
max_pause_factor_during_burst: float
+
Default: 1.1. Max value of how far into the current sound (as +fraction) the burst sound should start playing. The value must be greater than 0.
+
min_fade_in_time: float • unit: seconds
+
Default: 0.005. Min noise fade in time in seconds. Use a +value larger than 0 to avoid a "click" at the start of the noise.
+
max_fade_in_time: float • unit: seconds
+
Default: 0.08. Max noise fade in time in seconds. Use a +value larger than 0 to avoid a "click" at the start of the noise.
+
min_fade_out_time: float • unit: seconds
+
Default: 0.01. Min sound/noise fade out time in seconds. +Use a value larger than 0 to avoid a "click" at the end of the sound/noise.
+
max_fade_out_time: float • unit: seconds
+
Default: 0.1. Max sound/noise fade out time in seconds. +Use a value larger than 0 to avoid a "click" at the end of the sound/noise.
+
signal_gain_in_db_during_noise: float • unit: Decibel
+
⚠ Deprecated as of v0.31.0. Use signal_gain_db_during_noise instead
+
signal_gain_db_during_noise: float • unit: Decibel
+
+

Default: 0.0. Gain applied to the signal during a short noise. +When fading the signal to the custom gain, the same fade times are used as +for the noise, so it's essentially cross-fading. The default value (0.0) means +the signal will not be gained. If set to a very low value, e.g. -100.0, this +feature could be used for completely replacing the signal with the noise. +This could be relevant in some use cases, for example:

+
    +
  • replace the signal with another signal of a similar class (e.g. replace some + speech with a cough)
  • +
  • simulate an ECG off-lead condition (electrodes are temporarily disconnected)
  • +
+
+
noise_transform: Optional[Callable[[NDArray[np.float32], int], NDArray[np.float32]]]
+
Default: None. A callable waveform transform (or +composition of transforms) that gets applied to noises before they get mixed in.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
lru_cache_size: int
+
Default: 64. Maximum size of the LRU cache for storing +noise files in memory
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/adjust_duration/index.html b/waveform_transforms/adjust_duration/index.html new file mode 100644 index 00000000..276ec9cb --- /dev/null +++ b/waveform_transforms/adjust_duration/index.html @@ -0,0 +1,1141 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + AdjustDuration - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

AdjustDuration

+

Added in v0.30.0

+

Trim or pad the audio to the specified length/duration in samples or seconds. If the +input sound is longer than the target duration, pick a random offset and crop the +sound to the target duration. If the input sound is shorter than the target +duration, pad the sound so the duration matches the target duration.

+

This transform can be useful if you need audio with constant length, e.g. as input to a +machine learning model. The reason for varying audio clip lengths can be e.g.

+
    +
  • the nature of the audio dataset (different audio clips have different lengths)
  • +
  • data augmentation transforms that change the lengths (e.g. time stretching or + convolving with impulse responses without cutting the tail)
  • +
+

Input-output example

+

Here we input an audio clip and remove a part of the start and the end, so the length of the result matches the specified target length.

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage examples

+
+
+
+
from audiomentations import AdjustDuration
+
+transform = AdjustDuration(duration_samples=60000, p=1.0)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+
+
+
from audiomentations import AdjustDuration
+
+transform = AdjustDuration(duration_seconds=3.75, p=1.0)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+
+
+
+

AdjustDuration API

+
+
duration_samples: int • range: [0, ∞)
+
Target duration in number of samples.
+
duration_seconds: float • range: [0.0, ∞)
+
Target duration in seconds.
+
padding_mode: str • choices: "silence", "wrap", "reflect"
+
Default: "silence". Padding mode. Only used when audio input is shorter than the target duration.
+
padding_position: str • choices: "start", "end"
+
Default: "end". The position of the inserted/added padding. Only used when audio input is shorter than the target duration.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/air_absorption/index.html b/waveform_transforms/air_absorption/index.html new file mode 100644 index 00000000..7e529c0a --- /dev/null +++ b/waveform_transforms/air_absorption/index.html @@ -0,0 +1,1156 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + AirAbsorption - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

AirAbsorption

+

Added in v0.25.0

+

A lowpass-like filterbank with variable octave attenuation that simulates attenuation of +high frequencies due to air absorption. This transform is parametrized by temperature, +humidity, and the distance between audio source and microphone.

+

This is not a scientifically accurate transform but basically applies a uniform +filterbank with attenuations given by:

+

att = exp(- distance * absorption_coefficient)

+

where distance is the microphone-source assumed distance in meters and absorption_coefficient +is adapted from a lookup table by pyroomacoustics. +It can also be seen as a lowpass filter with variable octave attenuation.

+

Note that since this transform mostly affects high frequencies, it is only +suitable for audio with sufficiently high sample rate, like 32 kHz and above.

+

Note also that this transform only "simulates" the dampening of high frequencies, and +does not attenuate according to the distance law. Gain augmentation needs to be done +separately.

+

Input-output example

+

Here we input a high-quality speech recording and apply AirAbsorption with an air +temperature of 20 degrees Celsius, 70% humidity and a distance of 20 meters. One can see +clearly in the spectrogram that the highs, especially above ~13 kHz, are rolled off in +the output, but it may require a quiet room and some concentration to +hear it clearly in the audio comparison.

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage example

+
from audiomentations import AirAbsorption
+
+transform = AirAbsorption(
+    min_distance=10.0,
+    max_distance=50.0,
+    p=1.0,
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=48000)
+
+

AirAbsorption API

+
+
min_temperature: float • unit: Celsius • choices: [10.0, 20.0]
+
Default: 10.0. Minimum temperature in Celsius (can take a value of either 10.0 or 20.0)
+
max_temperature: float • unit: Celsius • choices: [10.0, 20.0]
+
Default: 20.0. Maximum temperature in Celsius (can take a value of either 10.0 or 20.0)
+
min_humidity: float • unit: percent • range: [30.0, 90.0]
+
Default: 30.0. Minimum humidity in percent (between 30.0 and 90.0)
+
max_humidity: float • unit: percent • range: [30.0, 90.0]
+
Default: 90.0. Maximum humidity in percent (between 30.0 and 90.0)
+
min_distance: float • unit: meters
+
Default: 10.0. Minimum microphone-source distance in meters.
+
max_distance: float • unit: meters
+
Default: 100.0. Maximum microphone-source distance in meters.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/aliasing/index.html b/waveform_transforms/aliasing/index.html new file mode 100644 index 00000000..402349ca --- /dev/null +++ b/waveform_transforms/aliasing/index.html @@ -0,0 +1,1126 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Aliasing - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Aliasing

+

To be added in v0.35.0

+

Downsample the audio to a lower sample rate by linear interpolation, without low-pass +filtering it first, resulting in aliasing artifacts. You get aliasing artifacts when +there is high-frequency audio in the input audio that falls above the Nyquist frequency +of the chosen target sample rate. Audio with frequencies above the Nyquist frequency +cannot be reproduced accurately and gets "reflected"/mirrored to other frequencies. The +aliasing artifacts "replace" the original high frequency signals. The result can be +described as coarse and metallic.

+

After the downsampling, the signal gets upsampled to the original sample rate again, so the +length of the output becomes the same as the length of the input.

+

For more information, see

+
    +
  • Sample rate reduction on Wikipedia
  • +
  • Intro to downsampling by NTNU, Department of Music, Music Technology. Note: that article describes a slightly different downsampling technique, called sample-and-hold, while audiomentations implements linear interpolation. However, both methods lead to aliasing artifacts.
  • +
+

Input-output example

+

Here we target a sample rate of 12000 Hz. Note the vertical mirroring in the spectrogram in the transformed sound.

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage example

+
from audiomentations import Aliasing
+
+transform = Aliasing(min_sample_rate=8000, max_sample_rate=30000, p=1.0)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=44100)
+
+

Aliasing API

+
+
min_sample_rate: int • unit: Hz • range: [2, ∞)
+
Minimum target sample rate to downsample to
+
max_sample_rate: int • unit: Hz • range: [2, ∞)
+
Maximum target sample rate to downsample to
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/apply_impulse_response/index.html b/waveform_transforms/apply_impulse_response/index.html new file mode 100644 index 00000000..04935ebc --- /dev/null +++ b/waveform_transforms/apply_impulse_response/index.html @@ -0,0 +1,1166 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + ApplyImpulseResponse - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

ApplyImpulseResponse

+

Added in v0.7.0

+

This transform convolves the audio with a randomly selected (room) impulse response file.

+

ApplyImpulseResponse is commonly used as a data augmentation technique that adds +realistic-sounding reverb to recordings. This can for example make denoisers and speech +recognition systems more robust to different acoustic environments and distances between +the sound source and the microphone. It could also be used to generate roomy audio +examples for the training of dereverberation models.

+

Convolution with an impulse response is a powerful technique in signal processing that +can be employed to emulate the acoustic characteristics of specific environments or +devices. This process can transform a dry recording, giving it the sonic signature of +being played in a specific location or through a particular device.

+

What is an impulse response? An impulse response (IR) captures the unique acoustical +signature of a space or object. It's essentially a recording of how a specific +environment or system responds to an impulse (a short, sharp sound). By convolving +an audio signal with an impulse response, we can simulate how that signal would sound in +the captured environment.

+

Note that some impulse responses, especially those captured in larger spaces or from +specific equipment, can introduce a noticeable delay when convolved with an audio +signal. In some applications, this delay is a desirable property. However, in some other +applications, the convolved audio should not have a delay compared to the original +audio. If this is the case for you, you can align the audio afterwards with +fast-align-audio , for example.

+

Impulse responses can be created using e.g. http://tulrich.com/recording/ir_capture/

+

Some datasets of impulse responses are publicly available:

+
    +
  • EchoThief containing 115 impulse responses acquired in a + wide range of locations.
  • +
  • The MIT McDermott dataset + containing 271 impulse responses acquired in everyday places.
  • +
+

Impulse responses are represented as audio (ideally wav) files in the given ir_path.

+

Another thing worth checking is that your IR files have the same sample rate as your +audio inputs. Why? Because if they have different sample rates, the internal resampling +will slow down execution, and because some high frequencies may get lost.

+

Input-output example

+

Here we make a dry speech recording quite reverberant by convolving it with a room impulse response.

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage example

+
from audiomentations import ApplyImpulseResponse
+
+transform = ApplyImpulseResponse(ir_path="/path/to/sound_folder", p=1.0)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=48000)
+
+

ApplyImpulseResponse API

+
+
ir_path: Union[List[Path], List[str], str, Path]
+
A path or list of paths to audio file(s) and/or folder(s) with +audio files. Can be str or Path instance(s). The audio files given here are +supposed to be (room) impulse responses.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
lru_cache_size: int
+
Default: 128. Maximum size of the LRU cache for storing +impulse response files in memory.
+
leave_length_unchanged: bool
+
Default: True. When set to True, the tail of the sound +(e.g. reverb at the end) will be chopped off so that the length of the output is +equal to the length of the input.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/band_pass_filter/index.html b/waveform_transforms/band_pass_filter/index.html new file mode 100644 index 00000000..b58a86cc --- /dev/null +++ b/waveform_transforms/band_pass_filter/index.html @@ -0,0 +1,1138 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + BandPassFilter - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

BandPassFilter

+

Added in v0.18.0, updated in v0.21.0

+

Apply band-pass filtering to the input audio. Filter steepness (6/12/18... dB / octave) +is parametrized. Can also be set for zero-phase filtering (will result in a 6 dB drop at +cutoffs).

+

Input-output example

+

Here we input a high-quality speech recording and apply BandPassFilter with a center +frequency of 2500 Hz and a bandwidth fraction of 0.8, which means that the bandwidth in +this example is 2000 Hz, so the low frequency cutoff is 1500 Hz and the high frequency +cutoff is 3500 Hz. One can see in the spectrogram that the high and the low frequencies +are both attenuated in the output. If you listen to the audio example, you might notice +that the transformed output almost sounds like a phone call from the time when +phone audio was narrowband and mostly contained frequencies between ~300 and ~3400 Hz.

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage example

+
from audiomentations import BandPassFilter
+
+transform = BandPassFilter(min_center_freq=100.0, max_center_freq=6000, p=1.0)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=48000)
+
+

BandPassFilter API

+
+
min_center_freq: float • unit: hertz
+
Default: 200.0. Minimum center frequency in hertz
+
max_center_freq: float • unit: hertz
+
Default: 4000.0. Maximum center frequency in hertz
+
min_bandwidth_fraction: float • range: [0.0, 2.0]
+
Default: 0.5. Minimum bandwidth relative to center frequency
+
max_bandwidth_fraction: float • range: [0.0, 2.0]
+
Default: 1.99. Maximum bandwidth relative to center frequency
+
min_rolloff: float • unit: Decibels/octave
+
Default: 12. Minimum filter roll-off (in dB/octave). +Must be a multiple of 6
+
max_rolloff: float • unit: Decibels/octave
+
Default: 24. Maximum filter roll-off (in dB/octave). +Must be a multiple of 6
+
zero_phase: bool
+
Default: False. Whether filtering should be zero phase. +When this is set to True it will not affect the phase of the input signal but will +sound 3 dB lower at the cutoff frequency compared to the non-zero phase case (6 dB +vs. 3 dB). Additionally, it is 2 times slower than in the non-zero phase case. If +you absolutely want no phase distortions (e.g. want to augment an audio file with +lots of transients, like a drum track), set this to True.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/band_stop_filter/index.html b/waveform_transforms/band_stop_filter/index.html new file mode 100644 index 00000000..ae29399a --- /dev/null +++ b/waveform_transforms/band_stop_filter/index.html @@ -0,0 +1,1121 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + BandStopFilter - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

BandStopFilter

+

Added in v0.21.0

+

Apply band-stop filtering to the input audio. Also known as notch filter or +band reject filter. It relates to the frequency mask idea in the SpecAugment paper . +Center frequency gets picked in mel space, so it is somewhat aligned with human hearing, +which is not linear. Filter steepness (6/12/18... dB / octave) is parametrized. Can also +be set for zero-phase filtering (will result in a 6 dB drop at cutoffs).

+

Applying band-stop filtering as data augmentation during model training can aid in +preventing overfitting to specific frequency relationships, helping to make the model +robust to diverse audio environments and scenarios, where frequency losses can occur.

+

Input-output example

+

Here we input a speech recording and apply BandStopFilter with a center +frequency of 2500 Hz and a bandwidth fraction of 0.8, which means that the bandwidth in +this example is 2000 Hz, so the low frequency cutoff is 1500 Hz and the high frequency +cutoff is 3500 Hz. One can see in the spectrogram of the transformed sound that the band +stop filter has attenuated this frequency range. If you listen to the audio example, you +can hear that the timbre is different in the transformed sound than in the original.

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

BandStopFilter API

+
+
min_center_freq: float • unit: hertz
+
Default: 200.0. Minimum center frequency in hertz
+
max_center_freq: float • unit: hertz
+
Default: 4000.0. Maximum center frequency in hertz
+
min_bandwidth_fraction: float
+
Default: 0.5. Minimum bandwidth relative to center frequency
+
max_bandwidth_fraction: float
+
Default: 1.99. Maximum bandwidth relative to center frequency
+
min_rolloff: float • unit: Decibels/octave
+
Default: 12. Minimum filter roll-off (in dB/octave). +Must be a multiple of 6
+
max_rolloff: float • unit: Decibels/octave
+
Default: 24. Maximum filter roll-off (in dB/octave). +Must be a multiple of 6
+
zero_phase: bool
+
Default: False. Whether filtering should be zero phase. +When this is set to True it will not affect the phase of the input signal but will +sound 3 dB lower at the cutoff frequency compared to the non-zero phase case (6 dB +vs. 3 dB). Additionally, it is 2 times slower than in the non-zero phase case. If +you absolutely want no phase distortions (e.g. want to augment an audio file with +lots of transients, like a drum track), set this to True.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/bit_crush/index.html b/waveform_transforms/bit_crush/index.html new file mode 100644 index 00000000..09808739 --- /dev/null +++ b/waveform_transforms/bit_crush/index.html @@ -0,0 +1,1121 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + BitCrush - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

BitCrush

+

To be added in v0.35.0

+

Apply a bit crush effect to the audio by reducing the bit depth. In other words, it +reduces the number of bits that can be used for representing each audio sample. +This adds quantization noise, and affects dynamic range. This transform does not apply +dithering.

+

For more information, see

+ +

Input-output example

+

Here we reduce the bit depth from 16 to 6 bits per sample

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage example

+
from audiomentations import BitCrush
+
+transform = BitCrush(min_bit_depth=5, max_bit_depth=14, p=1.0)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+

BitCrush API

+
+
min_bit_depth: int • unit: bits • range: [1, 32]
+
Minimum bit depth the audio will be "converted" to
+
max_bit_depth: int • unit: bits • range: [1, 32]
+
Maximum bit depth the audio will be "converted" to
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/clip/index.html b/waveform_transforms/clip/index.html new file mode 100644 index 00000000..3affeb6c --- /dev/null +++ b/waveform_transforms/clip/index.html @@ -0,0 +1,1039 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Clip - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Clip

+

Added in v0.17.0

+

Clip audio by specified values. For example, set a_min=-1.0 and a_max=1.0 to ensure that no +samples in the audio exceed that extent. This can be relevant for avoiding integer +overflow or underflow (which results in unintended wrap distortion that can sound +horrible) when exporting to e.g. 16-bit PCM wav.

+

Another way of ensuring that all values stay between -1.0 and 1.0 is to apply +PeakNormalization.

+

This transform is different from ClippingDistortion in that it takes fixed values +for clipping instead of clipping a random percentile of the samples. Arguably, this +transform is not very useful for data augmentation. Instead, think of it as a very +cheap and harsh limiter (for samples that exceed the allotted extent) that can +sometimes be useful at the end of a data augmentation pipeline.

+

Clip API

+
+
a_min: float
+
Default: -1.0. Minimum value for clipping.
+
a_max: float
+
Default: 1.0. Maximum value for clipping.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/clipping_distortion/index.html b/waveform_transforms/clipping_distortion/index.html new file mode 100644 index 00000000..9d77a523 --- /dev/null +++ b/waveform_transforms/clipping_distortion/index.html @@ -0,0 +1,1034 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + ClippingDistortion - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

ClippingDistortion

+

Added in v0.8.0

+

Distort signal by clipping a random percentage of points

+

The percentage of points that will be clipped is drawn from a uniform distribution between +the two input parameters min_percentile_threshold and max_percentile_threshold. If for instance +30% is drawn, the samples are clipped if they're below the 15th or above the 85th percentile.

+

ClippingDistortion API

+
+
min_percentile_threshold: int
+
Default: 0. A lower bound on the total percent of samples +that will be clipped
+
max_percentile_threshold: int
+
Default: 40. An upper bound on the total percent of +samples that will be clipped
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/gain/index.html b/waveform_transforms/gain/index.html new file mode 100644 index 00000000..14eba551 --- /dev/null +++ b/waveform_transforms/gain/index.html @@ -0,0 +1,1037 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Gain - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Gain

+

Added in v0.11.0

+

Multiply the audio by a random amplitude factor to reduce or increase the volume. This +technique can help a model become somewhat invariant to the overall gain of the input audio.

+

Warning: This transform can return samples outside the [-1, 1] range, which may lead to +clipping or wrap distortion, depending on what you do with the audio in a later stage. +See also https://en.wikipedia.org/wiki/Clipping_(audio)#Digital_clipping

+

Gain API

+
+
min_gain_in_db: float • unit: Decibel
+
⚠ Deprecated as of v0.31.0. Use min_gain_db instead
+
max_gain_in_db: float • unit: Decibel
+
⚠ Deprecated as of v0.31.0. Use max_gain_db instead
+
min_gain_db: float • unit: Decibel
+
Default: -12.0. Minimum gain.
+
max_gain_db: float • unit: Decibel
+
Default: 12.0. Maximum gain.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/gain_transition/index.html b/waveform_transforms/gain_transition/index.html new file mode 100644 index 00000000..fa54cce7 --- /dev/null +++ b/waveform_transforms/gain_transition/index.html @@ -0,0 +1,1055 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + GainTransition - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

GainTransition

+

Added in v0.22.0

+

Gradually change the volume up or down over a random time span. Also known as +fade in and fade out. The fade works on a logarithmic scale, which is natural to +human hearing.

+

The way this works is that it picks two gains: a first gain and a second gain. +Then it picks a time range for the transition between those two gains. +Note that this transition can start before the audio starts and/or end after the +audio ends, so the output audio can start or end in the middle of a transition. +The gain starts at the first gain and is held constant until the transition start. +Then it transitions to the second gain. Then that gain is held constant until the +end of the sound.

+

GainTransition API

+
+
min_gain_in_db: float • unit: Decibel
+
⚠ Deprecated as of v0.31.0. Use min_gain_db instead
+
max_gain_in_db: float • unit: Decibel
+
⚠ Deprecated as of v0.31.0. Use max_gain_db instead
+
min_gain_db: float • unit: Decibel
+
Default: -24.0. Minimum gain.
+
max_gain_db: float • unit: Decibel
+
Default: 6.0. Maximum gain.
+
min_duration: Union[float, int] • unit: see duration_unit
+
Default: 0.2. Minimum length of transition.
+
max_duration: Union[float, int] • unit: see duration_unit
+
Default: 6.0. Maximum length of transition.
+
duration_unit: str • choices: "fraction", "samples", "seconds"
+
+

Default: "seconds". Defines the unit of the value of min_duration and max_duration.

+
    +
  • "fraction": Fraction of the total sound length
  • +
  • "samples": Number of audio samples
  • +
  • "seconds": Number of seconds
  • +
+
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/high_pass_filter/index.html b/waveform_transforms/high_pass_filter/index.html new file mode 100644 index 00000000..6d1862ba --- /dev/null +++ b/waveform_transforms/high_pass_filter/index.html @@ -0,0 +1,1043 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + HighPassFilter - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

HighPassFilter

+

Added in v0.18.0, updated in v0.21.0

+

Apply high-pass filtering to the input audio of parametrized filter steepness (6/12/18... dB / octave). +Can also be set for zero-phase filtering (will result in a 6 dB drop at cutoff).

+

HighPassFilter API

+
+
min_cutoff_freq: float • unit: hertz
+
Default: 20.0. Minimum cutoff frequency
+
max_cutoff_freq: float • unit: hertz
+
Default: 2400.0. Maximum cutoff frequency
+
min_rolloff: float • unit: Decibels/octave
+
Default: 12. Minimum filter roll-off (in dB/octave). +Must be a multiple of 6
+
max_rolloff: float • unit: Decibels/octave
+
Default: 24. Maximum filter roll-off (in dB/octave). +Must be a multiple of 6
+
zero_phase: bool
+
Default: False. Whether filtering should be zero phase. +When this is set to True it will not affect the phase of the input signal but will +sound 3 dB lower at the cutoff frequency compared to the non-zero phase case (6 dB +vs. 3 dB). Additionally, it is 2 times slower than in the non-zero phase case. If +you absolutely want no phase distortions (e.g. want to augment an audio file with +lots of transients, like a drum track), set this to True.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/high_shelf_filter/index.html b/waveform_transforms/high_shelf_filter/index.html new file mode 100644 index 00000000..3d8e67e2 --- /dev/null +++ b/waveform_transforms/high_shelf_filter/index.html @@ -0,0 +1,1043 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + HighShelfFilter - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

HighShelfFilter

+

Added in v0.21.0

+

A high shelf filter is a filter that either boosts (increases amplitude) or cuts +(decreases amplitude) frequencies above a certain center frequency. This transform +applies a high-shelf filter at a specific center frequency in hertz. +The gain at the Nyquist frequency is controlled by {min,max}_gain_db (note: can be positive or negative!). +Filter coefficients are taken from the W3 Audio EQ Cookbook.

+

HighShelfFilter API

+
+
min_center_freq: float • unit: hertz
+
Default: 300.0. The minimum center frequency of the shelving filter
+
max_center_freq: float • unit: hertz
+
Default: 7500.0. The maximum center frequency of the shelving filter
+
min_gain_db: float • unit: Decibel
+
Default: -18.0. The minimum gain at the Nyquist frequency
+
max_gain_db: float • unit: Decibel
+
Default: 18.0. The maximum gain at the Nyquist frequency
+
min_q: float • range: (0.0, 1.0]
+
Default: 0.1. The minimum quality factor Q. The higher +the Q, the steeper the transition band will be.
+
max_q: float • range: (0.0, 1.0]
+
Default: 0.999. The maximum quality factor Q. The higher +the Q, the steeper the transition band will be.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/lambda/index.html b/waveform_transforms/lambda/index.html new file mode 100644 index 00000000..16e3beb9 --- /dev/null +++ b/waveform_transforms/lambda/index.html @@ -0,0 +1,1096 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Lambda - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Lambda

+

Added in v0.26.0

+

Apply a user-defined transform (callable) to the signal. The inspiration for this +transform comes from albumentations' Lambda transform. This allows one to have a little +more fine-grained control over the operations in the context of a Compose, OneOf or SomeOf.

+

Usage example

+
import random
+
+from audiomentations import Lambda, OneOf, Gain
+
+
+def gain_only_left_channel(samples, sample_rate):
+    samples[0, :] *= random.uniform(0.8, 1.25)
+    return samples
+
+
+transform = OneOf(
+    transforms=[Lambda(transform=gain_only_left_channel, p=1.0), Gain(p=1.0)]
+)
+
+augmented_sound = transform(my_stereo_waveform_ndarray, sample_rate=16000)
+
+

Lambda API

+
+
transform: Callable
+
A callable to be applied. It should accept +samples (ndarray), sample_rate (int) and optionally some user-defined +keyword arguments.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
**kwargs
+
Optional extra parameters passed to the callable transform
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/limiter/index.html b/waveform_transforms/limiter/index.html new file mode 100644 index 00000000..81066e9e --- /dev/null +++ b/waveform_transforms/limiter/index.html @@ -0,0 +1,1161 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Limiter - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Limiter

+

Added in v0.26.0

+

The Limiter, based on cylimiter, is a straightforward audio transform that applies dynamic range compression. +It is capable of limiting the audio signal based on certain parameters. +Additionally, please note that this transform introduces a slight delay in the signal, equivalent to a fraction of the attack time.

+
    +
  • The threshold determines the audio level above which the limiter kicks in.
  • +
  • The attack time is how quickly the limiter kicks in once the audio signal starts exceeding the threshold.
  • +
  • The release time determines how quickly the limiter stops working after the signal drops below the threshold.
  • +
+

Input-output example

+

In this example we apply the limiter with a threshold that is 10 dB lower than the signal peak

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage examples

+
+
+
+
from audiomentations import Limiter
+
+transform = Limiter(
+    min_threshold_db=-16.0,
+    max_threshold_db=-6.0,
+    threshold_mode="relative_to_signal_peak",
+    p=1.0,
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+
+
+
from audiomentations import Limiter
+
+transform = Limiter(
+    min_threshold_db=-16.0,
+    max_threshold_db=-6.0,
+    threshold_mode="absolute",
+    p=1.0,
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+
+
+
+

Limiter API

+
+
min_threshold_db: float • unit: Decibel
+
Default: -24.0. Minimum threshold
+
max_threshold_db: float • unit: Decibel
+
Default: -2.0. Maximum threshold
+
min_attack: float • unit: seconds
+
Default: 0.0005. Minimum attack time
+
max_attack: float • unit: seconds
+
Default: 0.025. Maximum attack time
+
min_release: float • unit: seconds
+
Default: 0.05. Minimum release time
+
max_release: float • unit: seconds
+
Default: 0.7. Maximum release time
+
threshold_mode: str • choices: "relative_to_signal_peak", "absolute"
+
+

Default: "relative_to_signal_peak". Specifies the mode for determining the threshold.

+
    +
  • "relative_to_signal_peak" means the threshold is relative to peak of the signal.
  • +
  • "absolute" means the threshold is relative to 0 dBFS, so it doesn't depend + on the peak of the signal.
  • +
+
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/loudness_normalization/index.html b/waveform_transforms/loudness_normalization/index.html new file mode 100644 index 00000000..6c178187 --- /dev/null +++ b/waveform_transforms/loudness_normalization/index.html @@ -0,0 +1,1043 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + LoudnessNormalization - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

LoudnessNormalization

+

Added in v0.14.0

+

Apply a constant amount of gain to match a specific loudness (in LUFS). This is an +implementation of ITU-R BS.1770-4.

+

For an explanation on LUFS, see https://en.wikipedia.org/wiki/LUFS

+

See also the following web pages for more info on audio loudness normalization:

+ +

Warning: This transform can return samples outside the [-1, 1] range, which may lead to +clipping or wrap distortion, depending on what you do with the audio in a later stage. +See also https://en.wikipedia.org/wiki/Clipping_(audio)#Digital_clipping

+

LoudnessNormalization API

+
+
min_lufs_in_db: float • unit: LUFS
+
⚠ Deprecated as of v0.31.0. Use min_lufs instead
+
max_lufs_in_db: float • unit: LUFS
+
⚠ Deprecated as of v0.31.0. Use max_lufs instead
+
min_lufs: float • unit: LUFS
+
Default: -31.0. Minimum loudness target
+
max_lufs: float • unit: LUFS
+
Default: -13.0. Maximum loudness target
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/low_pass_filter/index.html b/waveform_transforms/low_pass_filter/index.html new file mode 100644 index 00000000..f47795a8 --- /dev/null +++ b/waveform_transforms/low_pass_filter/index.html @@ -0,0 +1,1043 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + LowPassFilter - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

LowPassFilter

+

Added in v0.18.0, updated in v0.21.0

+

Apply low-pass filtering to the input audio of parametrized filter steepness (6/12/18... dB / octave). +Can also be set for zero-phase filtering (will result in a 6 dB drop at cutoff).

+

LowPassFilter API

+
+
min_cutoff_freq: float • unit: hertz
+
Default: 150.0. Minimum cutoff frequency
+
max_cutoff_freq: float • unit: hertz
+
Default: 7500.0. Maximum cutoff frequency
+
min_rolloff: float • unit: Decibels/octave
+
Default: 12. Minimum filter roll-off (in dB/octave). +Must be a multiple of 6
+
max_rolloff: float • unit: Decibels/octave
+
Default: 24. Maximum filter roll-off (in dB/octave). +Must be a multiple of 6
+
zero_phase: bool
+
Default: False. Whether filtering should be zero phase. +When this is set to True it will not affect the phase of the input signal but will +sound 3 dB lower at the cutoff frequency compared to the non-zero phase case (6 dB +vs. 3 dB). Additionally, it is 2 times slower than in the non-zero phase case. If +you absolutely want no phase distortions (e.g. want to augment an audio file with +lots of transients, like a drum track), set this to True.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/low_shelf_filter/index.html b/waveform_transforms/low_shelf_filter/index.html new file mode 100644 index 00000000..f917a076 --- /dev/null +++ b/waveform_transforms/low_shelf_filter/index.html @@ -0,0 +1,1043 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + LowShelfFilter - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

LowShelfFilter

+

Added in v0.21.0

+

A low shelf filter is a filter that either boosts (increases amplitude) or cuts +(decreases amplitude) frequencies below a certain center frequency. This transform +applies a low-shelf filter at a specific center frequency in hertz. +The gain at DC frequency is controlled by {min,max}_gain_db (note: can be positive or negative!). +Filter coefficients are taken from the W3 Audio EQ Cookbook

+

LowShelfFilter API

+
+
min_center_freq: float • unit: hertz
+
Default: 50.0. The minimum center frequency of the shelving filter
+
max_center_freq: float • unit: hertz
+
Default: 4000.0. The maximum center frequency of the shelving filter
+
min_gain_db: float • unit: Decibel
+
Default: -18.0. The minimum gain at DC (0 Hz)
+
max_gain_db: float • unit: Decibel
+
Default: 18.0. The maximum gain at DC (0 Hz)
+
min_q: float • range: (0.0, 1.0]
+
Default: 0.1. The minimum quality factor Q. The higher +the Q, the steeper the transition band will be.
+
max_q: float • range: (0.0, 1.0]
+
Default: 0.999. The maximum quality factor Q. The higher +the Q, the steeper the transition band will be.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/mp3_compression/index.html b/waveform_transforms/mp3_compression/index.html new file mode 100644 index 00000000..f2b04ac4 --- /dev/null +++ b/waveform_transforms/mp3_compression/index.html @@ -0,0 +1,1046 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Mp3Compression - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Mp3Compression

+

Added in v0.12.0

+

Compress the audio using an MP3 encoder to lower the audio quality. This may help machine +learning models deal with compressed, low-quality audio.

+

This transform depends on either lameenc or pydub/ffmpeg.

+

Note that bitrates below 32 kbps are only supported for low sample rates (up to 24000 Hz).

+

Note: When using the "lameenc" backend, the output may be slightly longer than the input due +to the fact that the LAME encoder inserts some silence at the beginning of the audio.

+

Warning: This transform writes to disk, so it may be slow.

+

Mp3Compression API

+
+
min_bitrate: int • unit: kbps • range: [8, max_bitrate]
+
Default: 8. Minimum bitrate in kbps
+
max_bitrate: int • unit: kbps • range: [min_bitrate, 320]
+
Default: 64. Maximum bitrate in kbps
+
backend: str • choices: "pydub", "lameenc"
+
+

Default: "pydub".

+
    +
  • "pydub": May use ffmpeg under the hood. Pro: Seems to avoid introducing latency in + the output. Con: Slightly slower than "lameenc".
  • +
  • "lameenc": Pro: With this backend you can set the quality parameter in addition + to the bitrate (although this parameter is not exposed in the audiomentations API + yet). Con: Seems to introduce some silence at the start of the audio.
  • +
+
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/normalize/index.html b/waveform_transforms/normalize/index.html new file mode 100644 index 00000000..14114dbb --- /dev/null +++ b/waveform_transforms/normalize/index.html @@ -0,0 +1,1027 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Normalize - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Normalize

+

Added in v0.6.0

+

Apply a constant amount of gain, so that the highest signal level present in the sound +becomes 0 dBFS, i.e. the loudest level allowed if all samples must be between -1 and 1. +Also known as peak normalization.

+

Normalize API

+
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/padding/index.html b/waveform_transforms/padding/index.html new file mode 100644 index 00000000..8eab6ff4 --- /dev/null +++ b/waveform_transforms/padding/index.html @@ -0,0 +1,1035 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Padding - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Padding

+

Added in v0.23.0

+

Apply padding to the audio signal - take a fraction of the end or the start of the +audio and replace that part with padding. This can be useful for preparing ML models +with constant input length for padded inputs.

+

Padding API

+
+
mode: str • choices: "silence", "wrap", "reflect"
+
Default: "silence". Padding mode.
+
min_fraction: float • range: [0.0, 1.0]
+
Default: 0.01. Minimum fraction of the signal duration to be padded
+
max_fraction: float • range: [0.0, 1.0]
+
Default: 0.7. Maximum fraction of the signal duration to be padded
+
pad_section: str • choices: "start", "end"
+
Default: "end". Which part of the signal should be replaced with padding
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/peaking_filter/index.html b/waveform_transforms/peaking_filter/index.html new file mode 100644 index 00000000..fc06683f --- /dev/null +++ b/waveform_transforms/peaking_filter/index.html @@ -0,0 +1,1039 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + PeakingFilter - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

PeakingFilter

+

Added in v0.21.0

+

Add a biquad peaking filter transform

+

PeakingFilter API

+
+
min_center_freq: float • unit: hertz • range: [0.0, ∞)
+
Default: 50.0. The minimum center frequency of the peaking filter
+
max_center_freq: float • unit: hertz • range: [0.0, ∞)
+
Default: 7500.0. The maximum center frequency of the peaking filter
+
min_gain_db: float • unit: Decibel
+
Default: -24.0. The minimum gain at center frequency
+
max_gain_db: float • unit: Decibel
+
Default: 24.0. The maximum gain at center frequency
+
min_q: float • range: [0.0, ∞)
+
Default: 0.5. The minimum quality factor Q. The higher the +Q, the steeper the transition band will be.
+
max_q: float • range: [0.0, ∞)
+
Default: 5.0. The maximum quality factor Q. The higher the +Q, the steeper the transition band will be.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/pitch_shift/index.html b/waveform_transforms/pitch_shift/index.html new file mode 100644 index 00000000..990bd2e2 --- /dev/null +++ b/waveform_transforms/pitch_shift/index.html @@ -0,0 +1,1128 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + PitchShift - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

PitchShift

+

Added in v0.4.0

+

Pitch shift the sound up or down without changing the tempo.

+

Under the hood this does time stretching (by phase vocoding) followed by resampling. +Note that phase vocoding can degrade audio quality by "smearing" transient sounds, +altering the timbre of harmonic sounds, and distorting pitch modulations. This may +result in a loss of sharpness, clarity, or naturalness in the transformed audio.

+

If you need a better sounding pitch shifting method, consider the following alternatives:

+ +

Input-output example

+

Here we pitch down a piano recording by 4 semitones:

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage example

+
from audiomentations import PitchShift
+
+transform = PitchShift(
+    min_semitones=-5.0,
+    max_semitones=5.0,
+    p=1.0
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=44100)
+
+

PitchShift API

+
+
min_semitones: float • unit: semitones • range: [-12.0, 12.0]
+
Default: -4.0. Minimum semitones to shift. Negative number means shift down.
+
max_semitones: float • unit: semitones • range: [-12.0, 12.0]
+
Default: 4.0. Maximum semitones to shift. Positive number means shift up.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/polarity_inversion/index.html b/waveform_transforms/polarity_inversion/index.html new file mode 100644 index 00000000..63c4b6aa --- /dev/null +++ b/waveform_transforms/polarity_inversion/index.html @@ -0,0 +1,1031 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + PolarityInversion - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

PolarityInversion

+

Added in v0.11.0

+

Flip the audio samples upside-down, reversing their polarity. In other words, multiply the +waveform by -1, so negative values become positive, and vice versa. The result will sound +the same compared to the original when played back in isolation. However, when mixed with +other audio sources, the result may be different. This waveform inversion technique +is sometimes used for audio cancellation or obtaining the difference between two waveforms. +However, in the context of audio data augmentation, this transform can be useful when +training phase-aware machine learning models.

+

PolarityInversion API

+
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/post_gain/index.html b/waveform_transforms/post_gain/index.html new file mode 100644 index 00000000..15f6e3ca --- /dev/null +++ b/waveform_transforms/post_gain/index.html @@ -0,0 +1,1031 @@ + + + + + + + + + + + + + + + + + + + + + + PostGain - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

PostGain

+

Added in v0.31.0

+

Gain the audio up or down after the given transform (or set of transforms) has +processed the audio. There are several methods that determine how the audio should +be gained. PostGain can be useful for compensating for any gain differences introduced +by a (set of) transform(s), or for preventing clipping in the output.

+

PostGain API

+
+
transform: Callable[[NDArray[np.float32], int], NDArray[np.float32]]
+
A callable to be applied. It should accept +samples (ndarray), sample_rate (int) and optionally some user-defined +keyword arguments.
+
method: str • choices: "same_rms", "same_lufs", "peak_normalize_always", "peak_normalize_if_too_loud"
+
+

This parameter defines the method for choosing the post gain amount.

+
    +
  • "same_rms": The sound gets post-gained so that the RMS (Root Mean Square) of + the output matches the RMS of the input.
  • +
  • "same_lufs": The sound gets post-gained so that the LUFS (Loudness Units Full Scale) of + the output matches the LUFS of the input.
  • +
  • "peak_normalize_always": The sound gets peak normalized (gained up or down so + that the absolute value of the most extreme sample in the output is 1.0)
  • +
  • "peak_normalize_if_too_loud": The sound gets peak normalized if it is too + loud (max absolute value greater than 1.0). This option can be useful for + avoiding clipping.
  • +
+
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/repeat_part/index.html b/waveform_transforms/repeat_part/index.html new file mode 100644 index 00000000..bac50539 --- /dev/null +++ b/waveform_transforms/repeat_part/index.html @@ -0,0 +1,1185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + RepeatPart - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

RepeatPart

+

Added in v0.32.0

+

Select a subsection (or "part") of the audio and repeat that part a number of times. +This can be useful when simulating scenarios where a short audio snippet gets +repeated, for example:

+
    +
  • Repetitions of some musical note or sound in a rhythmical way
  • +
  • A person stutters or says the same word (with variations) multiple times in a row
  • +
  • A mechanical noise with periodic repetitions
  • +
  • A "skip in the record" or a "stuck needle" effect, reminiscent of vinyl records or + CDs when they repeatedly play a short section due to a scratch or other + imperfection.
  • +
  • Digital audio glitches, such as a buffer underrun in video games, + where the current audio frame gets looped continuously due to system overloads + or a software crash.
  • +
+

Note that the length of inputs you give it must be compatible with the part +duration range and crossfade duration. If you give it an input audio array that is +too short, a UserWarning will be raised and no operation is applied to the signal.

+

Input-output example

+

In this speech example, the audio was transformed with

+
    +
  • a part duration of approximately 0.4 seconds
  • +
  • "insert" mode. In this mode, the output becomes longer than the input.
  • +
  • a SevenBandParametricEQ part transform. This is why each repeat in the output + has a different timbre.
  • +
+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage examples

+
+
+
+
from audiomentations import RepeatPart
+
+transform = RepeatPart(mode="insert", p=1.0)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+
+
+
from audiomentations import RepeatPart
+
+transform = RepeatPart(mode="replace", p=1.0)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+
+
+
+

RepeatPart API

+
+
min_repeats: int • range: [1, max_repeats]
+
Default: 1. Minimum number of times a selected audio + segment should be repeated in addition to the original. For instance, if the selected + number of repeats is 1, the selected segment will be followed by one repeat.
+
max_repeats: int • range: [min_repeats, ∞)
+
Default: 3. Maximum number of times a selected audio + segment can be repeated in addition to the original
+
min_part_duration: float • unit: seconds • range: [0.00025, max_part_duration]
+
Default: 0.25. Minimum duration (in seconds) of the audio + segment that can be selected for repetition.
+
max_part_duration: float • unit: seconds • range: [min_part_duration, ∞)
+
Default: 1.2. Maximum duration (in seconds) of the audio + segment that can be selected for repetition.
+
mode: str • choices: "insert", "replace"
+
+

Default: "insert". This parameter has two options:

+
    +
  • "insert": Insert the repeat(s), making the array longer. After the last + repeat there will be the last part of the original audio, offset in time + compared to the input array.
  • +
  • "replace": Have the repeats replace (as in overwrite) the original audio. + Any remaining part at the end (if not overwritten by repeats) will be + left untouched without offset. The length of the output array is the + same as the input array.
  • +
+
+
crossfade_duration: float • unit: seconds • range: 0.0 or [0.00025, ∞)
+
Default: 0.005. Duration for crossfading between repeated + parts as well as potentially from the original audio to the repeats and back. + The crossfades will be equal-energy or equal-gain depending on the audio and/or the + chosen parameters of the transform. The crossfading feature can be used to smooth + transitions and avoid abrupt changes, which can lead to impulses/clicks in the audio. + If you know what you're doing, and impulses/clicks are desired for your use case, + you can disable the crossfading by setting this value to 0.0.
+
part_transform: Optional[Callable[[NDArray[np.float32], int], NDArray[np.float32]]]
+
An optional callable (audiomentations transform) that + gets applied individually to each repeat. This can be used to make each + repeat slightly different from the previous one. Note that a part_transform + that makes the part shorter is only supported if the transformed part is at + least two times the crossfade duration.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/resample/index.html b/waveform_transforms/resample/index.html new file mode 100644 index 00000000..b85ff564 --- /dev/null +++ b/waveform_transforms/resample/index.html @@ -0,0 +1,1031 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Resample - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Resample

+

Added in v0.8.0

+

Resample signal using librosa.core.resample

+

To do downsampling only, set both the minimum and maximum sampling rate lower than the original +sampling rate, and vice versa to do upsampling only.

+

Resample API

+
+
min_sample_rate: int • unit: Hz
+
Default: 8000. Minimum sample rate
+
max_sample_rate: int • unit: Hz
+
Default: 44100. Maximum sample rate
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/reverse/index.html b/waveform_transforms/reverse/index.html new file mode 100644 index 00000000..dbf4d81c --- /dev/null +++ b/waveform_transforms/reverse/index.html @@ -0,0 +1,1113 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Reverse - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Reverse

+

Added in v0.18.0

+

Reverse the audio. Also known as time inversion. Inversion of an audio track along its time +axis relates to the random flip of an image, which is an augmentation technique that is +widely used in the visual domain. This can be relevant in the context of audio +classification. It was successfully applied in the paper +AudioCLIP: Extending CLIP to Image, Text and Audio.

+

Input-output example

+

In this example, we reverse a speech recording

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage example

+
from audiomentations import Reverse
+
+transform = Reverse(p=1.0)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=44100)
+
+

Reverse API

+
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/room_simulator/index.html b/waveform_transforms/room_simulator/index.html new file mode 100644 index 00000000..2226a207 --- /dev/null +++ b/waveform_transforms/room_simulator/index.html @@ -0,0 +1,1224 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + RoomSimulator - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

RoomSimulator

+

Added in v0.23.0

+

A ShoeBox Room Simulator. Simulates a cuboid of parametrized size and average surface absorption coefficient. It also includes a source +and microphones in parametrized locations.

+

Use it when you want a ton of synthetic room impulse responses with specific configurations and +characteristics, or simply to quickly add reverb for augmentation purposes.

+

RoomSimulator API

+
+
min_size_x: float • unit: meters
+
Default: 3.6. Minimum width (x coordinate) of the room in meters
+
max_size_x: float • unit: meters
+
Default: 5.6. Maximum width of the room in meters
+
min_size_y: float • unit: meters
+
Default: 3.6. Minimum depth (y coordinate) of the room in meters
+
max_size_y: float • unit: meters
+
Default: 3.9. Maximum depth of the room in meters
+
min_size_z: float • unit: meters
+
Default: 2.4. Minimum height (z coordinate) of the room in meters
+
max_size_z: float • unit: meters
+
Default: 3.0. Maximum height of the room in meters
+
min_absorption_value: float
+
+

Default: 0.075. Minimum absorption coefficient value. +When calculation_mode is "absorption" +it will set the given coefficient value for the surfaces of the room (walls, +ceilings, and floor). This coefficient takes values between 0 (fully reflective +surface) and 1 (fully absorbing surface).

+

Example values (may differ!):

+ + + + + + + + + + + + + + + + + + + + + +
EnvironmentCoefficient value
Studio with acoustic panels> 0.40
Office / Library~ 0.15
Factory~ 0.05
+
+
max_absorption_value: float
+
Default: 0.4. Maximum absorption coefficient value. See +min_absorption_value for more +info.
+
min_target_rt60: float • unit: seconds
+
+

Default: 0.15. Minimum target RT60. RT60 is defined as the +measure of the time after the sound source ceases that it takes for the sound +pressure level to reduce by 60 dB. When +calculation_mode is "rt60", it tries +to set the absorption value of the surfaces of the room to achieve a target RT60 +(in seconds). Note that this parameter changes only the materials (absorption +coefficients) of the surfaces, not the dimension of the rooms.

+

Example values (may differ!):

+ + + + + + + + + + + + + + + + + + + + + +
EnvironmentRT60
Recording studio0.3 s
Office0.5 s
Concert hall1.5 s
+
+
max_target_rt60: float • unit: seconds
+
Default: 0.8. Maximum target RT60. See +min_target_rt60 for more info.
+
min_source_x: float • unit: meters
+
Default: 0.1. Minimum x location of the source
+
max_source_x: float • unit: meters
+
Default: 3.5. Maximum x location of the source
+
min_source_y: float • unit: meters
+
Default: 0.1. Minimum y location of the source
+
max_source_y: float • unit: meters
+
Default: 2.7. Maximum y location of the source
+
min_source_z: float • unit: meters
+
Default: 1.0. Minimum z location of the source
+
max_source_z: float • unit: meters
+
Default: 2.1. Maximum z location of the source
+
min_mic_distance: float • unit: meters
+
Default: 0.15. Minimum distance of the microphone from the +source in meters
+
max_mic_distance: float • unit: meters
+
Default: 0.35. Maximum distance of the microphone from the +source in meters
+
min_mic_azimuth: float • unit: radians
+
Default: -math.pi. Minimum azimuth (angle around z axis) of the +microphone relative to the source.
+
max_mic_azimuth: float • unit: radians
+
Default: math.pi. Maximum azimuth (angle around z axis) of the +microphone relative to the source.
+
min_mic_elevation: float • unit: radians
+
Default: -math.pi. Minimum elevation of the microphone relative +to the source, in radians.
+
max_mic_elevation: float • unit: radians
+
Default: math.pi. Maximum elevation of the microphone relative +to the source, in radians.
+
calculation_mode: str • choices: "rt60", "absorption"
+
Default: "absorption". When set to "absorption", it will +create the room with surfaces based on +min_absorption_value and +max_absorption_value. If set to +"rt60" it will try to assign surface materials that lead to a room impulse +response with target rt60 given by +min_target_rt60 and +max_target_rt60
+
use_ray_tracing: bool
+
Default: True. Whether to use ray_tracing or not (slower +but much more accurate). Disable this if you need speed and can tolerate +less accurate results.
+
max_order: int • range: [1, ∞)
+
+

Default: 1. Maximum order of reflections for the Image +Source Model. E.g. a value of 1 will only add first order reflections while a value +of 12 will add a diffuse reverberation tail.

+
+

Warning

+

Setting this higher than 11-12 will result in a very slow augmentation process when calculation_mode="rt60".

+
+
+

Tip

+

When using calculation_mode="rt60", keep it around 3-4.

+
+
+
leave_length_unchanged: bool
+
Default: False. When set to True, the tail of the sound +(e.g. reverb at the end) will be chopped off so that the length of the output is +equal to the length of the input.
+
padding: float • unit: meters
+
Default: 0.1. Minimum distance in meters between source or +mic and the room walls, floor or ceiling.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
ray_tracing_options: Optional[Dict]
+
Default: None. Options for the ray tracer. See set_ray_tracing here:
+https://github.com/LCAV/pyroomacoustics/blob/master/pyroomacoustics/room.py
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/seven_band_parametric_eq/index.html b/waveform_transforms/seven_band_parametric_eq/index.html new file mode 100644 index 00000000..65253501 --- /dev/null +++ b/waveform_transforms/seven_band_parametric_eq/index.html @@ -0,0 +1,1098 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + SevenBandParametricEQ - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

SevenBandParametricEQ

+

Added in v0.24.0

+

Adjust the volume of different frequency bands. This transform is a 7-band +parametric equalizer - a combination of one low shelf filter, five peaking filters +and one high shelf filter, all with randomized gains, Q values and center frequencies.

+

Because this transform changes the timbre, but keeps the overall "class" of the +sound the same (depending on application), it can be used for data augmentation to +make ML models more robust to various frequency spectrums. Many things can affect +the spectrum, for example:

+
    +
  • the nature and quality of the sound source
  • +
  • room acoustics
  • +
  • any objects between the microphone and the sound source
  • +
  • microphone type/model
  • +
  • the distance between the sound source and the microphone
  • +
+

The seven bands have center frequencies picked in the following ranges (min-max):

+
    +
  • 42-95 Hz
  • +
  • 91-204 Hz
  • +
  • 196-441 Hz
  • +
  • 421-948 Hz
  • +
  • 909-2045 Hz
  • +
  • 1957-4404 Hz
  • +
  • 4216-9486 Hz
  • +
+

SevenBandParametricEQ API

+
+
min_gain_db: float • unit: Decibel
+
Default: -12.0. Minimum number of dB to cut or boost a band
+
max_gain_db: float • unit: Decibel
+
Default: 12.0. Maximum number of dB to cut or boost a band
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/shift/index.html b/waveform_transforms/shift/index.html new file mode 100644 index 00000000..81626cf2 --- /dev/null +++ b/waveform_transforms/shift/index.html @@ -0,0 +1,1141 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Shift - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Shift

+

Added in v0.5.0

+

Shift the samples forwards or backwards, with or without rollover

+

Shift API

+

ℹ This only applies to version 0.33.0 and newer. If you are using an older +version, you should consider upgrading. Or if you really want to keep using the old +version, you can check the "Old Shift API (<=v0.32.0)" section below

+
+
min_shift: float | int
+
Default: -0.5. Minimum amount of shifting in time. See also +shift_unit.
+
max_shift: float | int
+
Default: 0.5. Maximum amount of shifting in time. See also +shift_unit.
+
shift_unit: str • choices: "fraction", "samples", "seconds"
+
+

Default: "fraction". Defines the unit of the value of +min_shift and max_shift.

+
    +
  • "fraction": Fraction of the total sound length
  • +
  • "samples": Number of audio samples
  • +
  • "seconds": Number of seconds
  • +
+
+
rollover: bool
+
Default: True. When set to True, samples that roll +beyond the first or last position are re-introduced at the last or first. When set +to False, samples that roll beyond the first or last position are discarded. In +other words, rollover=False results in an empty space (with zeroes).
+
fade_duration: float • unit: seconds • range: 0.0 or [0.00025, ∞)
+
Default: 0.005. If you set this to a positive number, +there will be a fade in and/or out at the "stitch" (that was the start or the end +of the audio before the shift). This can smooth out an unwanted abrupt +change between two consecutive samples (which sounds like a +transient/click/pop). This parameter denotes the duration of the fade in +seconds. To disable the fading feature, set this parameter to 0.0.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+

Old Shift API (<=v0.32.0)

+

⚠ This only applies to version 0.32.0 and older

+
+
min_fraction: float • range: [-1, 1]
+
Default: -0.5. Minimum fraction of total sound length to +shift.
+
max_fraction: float • range: [-1, 1]
+
Default: 0.5. Maximum fraction of total sound length to +shift.
+
rollover: bool
+
Default: True. When set to True, samples that roll +beyond the first or last position are re-introduced at the last or first. When set +to False, samples that roll beyond the first or last position are discarded. In +other words, rollover=False results in an empty space (with zeroes).
+
fade: bool
+
Default: False. When set to True, there will be a short +fade in and/or out at the "stitch" (that was the start or the end of the audio +before the shift). This can smooth out an unwanted abrupt change between two +consecutive samples (which sounds like a transient/click/pop).
+
fade_duration: float • unit: seconds
+
Default: 0.01. If fade=True, then this is the duration +of the fade in seconds.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/tanh_distortion/index.html b/waveform_transforms/tanh_distortion/index.html new file mode 100644 index 00000000..76df47fb --- /dev/null +++ b/waveform_transforms/tanh_distortion/index.html @@ -0,0 +1,1138 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + TanhDistortion - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

TanhDistortion

+

Added in v0.19.0

+

Apply tanh (hyperbolic tangent) distortion to the audio. This technique is sometimes +used for adding distortion to guitar recordings. The tanh() function can give a rounded +"soft clipping" kind of distortion, and the distortion amount is proportional to the +loudness of the input and the pre-gain. Tanh is symmetric, so the positive and +negative parts of the signal are squashed in the same way. This transform can be +useful as data augmentation because it adds harmonics. In other words, it changes +the timbre of the sound.

+

See this page for examples: http://gdsp.hf.ntnu.no/lessons/3/17/

+

Input-output example

+

In this example we apply tanh distortion with the "distortion amount" (think of it as a knob that goes from 0 to 1) set to 0.25

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage example

+
from audiomentations import TanhDistortion
+
+transform = TanhDistortion(
+    min_distortion=0.01,
+    max_distortion=0.7,
+    p=1.0
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+

TanhDistortion API

+
+
min_distortion: float • range: [0.0, 1.0]
+
Default: 0.01. Minimum "amount" of distortion to apply to the signal.
+
max_distortion: float • range: [0.0, 1.0]
+
Default: 0.7. Maximum "amount" of distortion to apply to the signal.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/time_mask/index.html b/waveform_transforms/time_mask/index.html new file mode 100644 index 00000000..4553ed3a --- /dev/null +++ b/waveform_transforms/time_mask/index.html @@ -0,0 +1,1139 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + TimeMask - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

TimeMask

+

Added in v0.7.0

+

Make a randomly chosen part of the audio silent. Inspired by +https://arxiv.org/pdf/1904.08779.pdf

+

Input-output example

+

Here we silence a part of a speech recording.

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage example

+
from audiomentations import TimeMask
+
+transform = TimeMask(
+    min_band_part=0.1,
+    max_band_part=0.15,
+    fade=True,
+    p=1.0,
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+

TimeMask API

+
+
min_band_part: float • range: [0.0, 1.0]
+
Default: 0.0. Minimum length of the silent part as a +fraction of the total sound length.
+
max_band_part: float • range: [0.0, 1.0]
+
Default: 0.5. Maximum length of the silent part as a +fraction of the total sound length.
+
fade: bool
+
Default: False. When set to True, add a linear fade in +and fade out of the silent part. This can smooth out an unwanted abrupt change +between two consecutive samples (which sounds like a transient/click/pop).
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/time_stretch/index.html b/waveform_transforms/time_stretch/index.html new file mode 100644 index 00000000..afe4867e --- /dev/null +++ b/waveform_transforms/time_stretch/index.html @@ -0,0 +1,1147 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + TimeStretch - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

TimeStretch

+

Added in v0.2.0

+

Change the speed or duration of the signal without changing the pitch. This transform +employs librosa.effects.time_stretch under the hood to achieve the effect.

+

Under the hood this uses phase vocoding. Note that phase vocoding can degrade audio +quality by "smearing" transient sounds, altering the timbre of harmonic sounds, and +distorting pitch modulations. This may result in a loss of sharpness, clarity, or +naturalness in the transformed audio, especially when the rate is set to an extreme +value.

+

If you need a better sounding time stretch method, consider the following alternatives:

+ +

Input-output example

+

In this example we speed up a sound by 25%. This corresponds to a rate of 1.25.

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage example

+
from audiomentations import TimeStretch
+
+transform = TimeStretch(
+    min_rate=0.8,
+    max_rate=1.25,
+    leave_length_unchanged=True,
+    p=1.0
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+

TimeStretch API

+
+
min_rate: float • range: [0.1, 10.0]
+
Default: 0.8. Minimum rate of change of total duration of the signal. A rate below 1 means the audio is slowed down.
+
max_rate: float • range: [0.1, 10.0]
+
Default: 1.25. Maximum rate of change of total duration of the signal. A rate greater than 1 means the audio is sped up.
+
leave_length_unchanged: bool
+
Default: True. The rate changes the duration and affects the samples. This flag is used to keep the total length of the generated output the same as that of the input signal.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/waveform_transforms/trim/index.html b/waveform_transforms/trim/index.html new file mode 100644 index 00000000..cf270dbf --- /dev/null +++ b/waveform_transforms/trim/index.html @@ -0,0 +1,1129 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Trim - audiomentations documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Trim

+

Added in v0.7.0

+

Trim leading and trailing silence from an audio signal using librosa.effects.trim. It considers anything below the threshold +(in decibels, relative to the reference) defined by the parameter top_db as silence.

+

Input-output example

+

In this example we remove silence from the start and end, using the default top_db parameter value

+

Input-output waveforms and spectrograms

+ + + + + + + + + + + + + +
Input soundTransformed sound
+

Usage example

+
from audiomentations import Trim
+
+transform = Trim(
+    top_db=30.0,
+    p=1.0
+)
+
+augmented_sound = transform(my_waveform_ndarray, sample_rate=16000)
+
+

Trim API

+
+
top_db: float • unit: Decibel
+
Default: 30.0. The threshold value (in decibels) below which to consider silence and trim.
+
p: float • range: [0.0, 1.0]
+
Default: 0.5. The probability of applying this transform.
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file