diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/404.html b/404.html new file mode 100644 index 0000000..11c0170 --- /dev/null +++ b/404.html @@ -0,0 +1,567 @@ + + + + + + + + + + + + + + + + + + + + + + OSCAR Documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ +

404 - Not found

+ +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/accessing/index.html b/accessing/index.html new file mode 100644 index 0000000..23e3634 --- /dev/null +++ b/accessing/index.html @@ -0,0 +1,787 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Getting access to OSCAR - OSCAR Documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

Getting access to OSCAR

+

There are two ways of accessing OSCAR: through Huma-Num, or through HuggingFace. +Depending on your status, you might not have a choice.

+ + + + + + + + + + + + + + + + + + + + +
|  | Research/Academic | Individual |
| --- | --- | --- |
| Huma-Num | ✔ | ✖ |
| Hugging-Face | ✔ | ✔ |
+
+
+
+

You can request access by sending us an email!

+
+

Warning

+

Follow the instructions below carefully, as incorrect submissions might significantly delay your access.

+
+
+

Danger

+

Do not create an account yourself, as it could delay your access by weeks! We will create an account for you.

+
+

Send us an email at contact at oscar-project.org, with "OSCAR Access Request" as the subject, and the following form (completed) as the body:

+
+

Warning

+

Please send your email using your institutional/academic address when possible. Otherwise, your access might be delayed/refused.

+
+
- First name:
+- Last name:
+- Affiliation:
+- Contact details:
+- Corpus version: 
+- Languages:
+
++ a short description of your use case.
+
+
+

Note

+

Access requests can take a few days to be answered, and sometimes longer.

+

We post updates about exceptional delays on our Discord server, and you can always contact us there to ask about the status of your request.

+
+

After some time, you should get an email back from us with access instructions!

+
+
+

Using datasets

+

The following assumes that you have already installed the Python datasets library.

+
    +
  1. Create an account on HuggingFace.
  2. +
  3. Create a user access token.
  4. +
  5. Open the OSCAR Team page.
  6. +
  7. Open your corpus of choice. Instructions should be in the corpus page.
  8. +
+

After all of this, you should be able to easily use OSCAR data with the datasets library 🙂 :

+
# example with OSCAR 2201
+from datasets import load_dataset
+
+
+dataset = load_dataset("oscar-corpus/OSCAR-2201",
+                        use_auth_token=True, # required
+                        language="ar", 
+                        streaming=True, # optional
+                        split="train") # optional
+
+for d in dataset:
+    print(d) # prints documents
+
+

Using Git LFS

+

You can also get the raw data from HuggingFace using Git LFS.

+

The following steps assume you have git and git-lfs installed, and are on a UNIX system. +The procedure should be roughly the same on Windows, but hasn’t been tested.

+

This will download the Basque corpus from OSCAR 2109.

+
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datasets/oscar-corpus/OSCAR-2109 
+cd OSCAR-2109 # go inside the directory
+git lfs pull --include packaged/eu/eu.txt.gz # pull the required file(s) (here the Basque corpus). Check with the manpage for pull options
+
+
+
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/images/favicon.png b/assets/images/favicon.png new file mode 100644 index 0000000..1cf13b9 Binary files /dev/null and b/assets/images/favicon.png differ diff --git a/assets/javascripts/bundle.51d95adb.min.js b/assets/javascripts/bundle.51d95adb.min.js new file mode 100644 index 0000000..b20ec68 --- /dev/null +++ b/assets/javascripts/bundle.51d95adb.min.js @@ -0,0 +1,29 @@ +"use strict";(()=>{var Hi=Object.create;var xr=Object.defineProperty;var Pi=Object.getOwnPropertyDescriptor;var $i=Object.getOwnPropertyNames,kt=Object.getOwnPropertySymbols,Ii=Object.getPrototypeOf,Er=Object.prototype.hasOwnProperty,an=Object.prototype.propertyIsEnumerable;var on=(e,t,r)=>t in e?xr(e,t,{enumerable:!0,configurable:!0,writable:!0,value:r}):e[t]=r,P=(e,t)=>{for(var r in t||(t={}))Er.call(t,r)&&on(e,r,t[r]);if(kt)for(var r of kt(t))an.call(t,r)&&on(e,r,t[r]);return e};var sn=(e,t)=>{var r={};for(var n in e)Er.call(e,n)&&t.indexOf(n)<0&&(r[n]=e[n]);if(e!=null&&kt)for(var n of kt(e))t.indexOf(n)<0&&an.call(e,n)&&(r[n]=e[n]);return r};var Ht=(e,t)=>()=>(t||e((t={exports:{}}).exports,t),t.exports);var Fi=(e,t,r,n)=>{if(t&&typeof t=="object"||typeof t=="function")for(let o of $i(t))!Er.call(e,o)&&o!==r&&xr(e,o,{get:()=>t[o],enumerable:!(n=Pi(t,o))||n.enumerable});return e};var yt=(e,t,r)=>(r=e!=null?Hi(Ii(e)):{},Fi(t||!e||!e.__esModule?xr(r,"default",{value:e,enumerable:!0}):r,e));var fn=Ht((wr,cn)=>{(function(e,t){typeof wr=="object"&&typeof cn!="undefined"?t():typeof define=="function"&&define.amd?define(t):t()})(wr,function(){"use strict";function e(r){var n=!0,o=!1,i=null,a={text:!0,search:!0,url:!0,tel:!0,email:!0,password:!0,number:!0,date:!0,month:!0,week:!0,time:!0,datetime:!0,"datetime-local":!0};function s(T){return!!(T&&T!==document&&T.nodeName!=="HTML"&&T.nodeName!=="BODY"&&"classList"in T&&"contains"in T.classList)}function f(T){var 
Ke=T.type,We=T.tagName;return!!(We==="INPUT"&&a[Ke]&&!T.readOnly||We==="TEXTAREA"&&!T.readOnly||T.isContentEditable)}function c(T){T.classList.contains("focus-visible")||(T.classList.add("focus-visible"),T.setAttribute("data-focus-visible-added",""))}function u(T){T.hasAttribute("data-focus-visible-added")&&(T.classList.remove("focus-visible"),T.removeAttribute("data-focus-visible-added"))}function p(T){T.metaKey||T.altKey||T.ctrlKey||(s(r.activeElement)&&c(r.activeElement),n=!0)}function m(T){n=!1}function d(T){s(T.target)&&(n||f(T.target))&&c(T.target)}function h(T){s(T.target)&&(T.target.classList.contains("focus-visible")||T.target.hasAttribute("data-focus-visible-added"))&&(o=!0,window.clearTimeout(i),i=window.setTimeout(function(){o=!1},100),u(T.target))}function v(T){document.visibilityState==="hidden"&&(o&&(n=!0),B())}function B(){document.addEventListener("mousemove",z),document.addEventListener("mousedown",z),document.addEventListener("mouseup",z),document.addEventListener("pointermove",z),document.addEventListener("pointerdown",z),document.addEventListener("pointerup",z),document.addEventListener("touchmove",z),document.addEventListener("touchstart",z),document.addEventListener("touchend",z)}function re(){document.removeEventListener("mousemove",z),document.removeEventListener("mousedown",z),document.removeEventListener("mouseup",z),document.removeEventListener("pointermove",z),document.removeEventListener("pointerdown",z),document.removeEventListener("pointerup",z),document.removeEventListener("touchmove",z),document.removeEventListener("touchstart",z),document.removeEventListener("touchend",z)}function 
z(T){T.target.nodeName&&T.target.nodeName.toLowerCase()==="html"||(n=!1,re())}document.addEventListener("keydown",p,!0),document.addEventListener("mousedown",m,!0),document.addEventListener("pointerdown",m,!0),document.addEventListener("touchstart",m,!0),document.addEventListener("visibilitychange",v,!0),B(),r.addEventListener("focus",d,!0),r.addEventListener("blur",h,!0),r.nodeType===Node.DOCUMENT_FRAGMENT_NODE&&r.host?r.host.setAttribute("data-js-focus-visible",""):r.nodeType===Node.DOCUMENT_NODE&&(document.documentElement.classList.add("js-focus-visible"),document.documentElement.setAttribute("data-js-focus-visible",""))}if(typeof window!="undefined"&&typeof document!="undefined"){window.applyFocusVisiblePolyfill=e;var t;try{t=new CustomEvent("focus-visible-polyfill-ready")}catch(r){t=document.createEvent("CustomEvent"),t.initCustomEvent("focus-visible-polyfill-ready",!1,!1,{})}window.dispatchEvent(t)}typeof document!="undefined"&&e(document)})});var un=Ht(Sr=>{(function(e){var t=function(){try{return!!Symbol.iterator}catch(c){return!1}},r=t(),n=function(c){var u={next:function(){var p=c.shift();return{done:p===void 0,value:p}}};return r&&(u[Symbol.iterator]=function(){return u}),u},o=function(c){return encodeURIComponent(c).replace(/%20/g,"+")},i=function(c){return decodeURIComponent(String(c).replace(/\+/g," "))},a=function(){var c=function(p){Object.defineProperty(this,"_entries",{writable:!0,value:{}});var m=typeof p;if(m!=="undefined")if(m==="string")p!==""&&this._fromString(p);else if(p instanceof c){var d=this;p.forEach(function(re,z){d.append(z,re)})}else if(p!==null&&m==="object")if(Object.prototype.toString.call(p)==="[object Array]")for(var h=0;hd[0]?1:0}),c._entries&&(c._entries={});for(var p=0;p1?i(d[1]):"")}})})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Sr);(function(e){var t=function(){try{var o=new e.URL("b","http://a");return o.pathname="c 
d",o.href==="http://a/c%20d"&&o.searchParams}catch(i){return!1}},r=function(){var o=e.URL,i=function(f,c){typeof f!="string"&&(f=String(f)),c&&typeof c!="string"&&(c=String(c));var u=document,p;if(c&&(e.location===void 0||c!==e.location.href)){c=c.toLowerCase(),u=document.implementation.createHTMLDocument(""),p=u.createElement("base"),p.href=c,u.head.appendChild(p);try{if(p.href.indexOf(c)!==0)throw new Error(p.href)}catch(T){throw new Error("URL unable to set base "+c+" due to "+T)}}var m=u.createElement("a");m.href=f,p&&(u.body.appendChild(m),m.href=m.href);var d=u.createElement("input");if(d.type="url",d.value=f,m.protocol===":"||!/:/.test(m.href)||!d.checkValidity()&&!c)throw new TypeError("Invalid URL");Object.defineProperty(this,"_anchorElement",{value:m});var h=new e.URLSearchParams(this.search),v=!0,B=!0,re=this;["append","delete","set"].forEach(function(T){var Ke=h[T];h[T]=function(){Ke.apply(h,arguments),v&&(B=!1,re.search=h.toString(),B=!0)}}),Object.defineProperty(this,"searchParams",{value:h,enumerable:!0});var z=void 0;Object.defineProperty(this,"_updateSearchParams",{enumerable:!1,configurable:!1,writable:!1,value:function(){this.search!==z&&(z=this.search,B&&(v=!1,this.searchParams._fromString(this.search),v=!0))}})},a=i.prototype,s=function(f){Object.defineProperty(a,f,{get:function(){return this._anchorElement[f]},set:function(c){this._anchorElement[f]=c},enumerable:!0})};["hash","host","hostname","port","protocol"].forEach(function(f){s(f)}),Object.defineProperty(a,"search",{get:function(){return this._anchorElement.search},set:function(f){this._anchorElement.search=f,this._updateSearchParams()},enumerable:!0}),Object.defineProperties(a,{toString:{get:function(){var f=this;return function(){return f.href}}},href:{get:function(){return this._anchorElement.href.replace(/\?$/,"")},set:function(f){this._anchorElement.href=f,this._updateSearchParams()},enumerable:!0},pathname:{get:function(){return 
this._anchorElement.pathname.replace(/(^\/?)/,"/")},set:function(f){this._anchorElement.pathname=f},enumerable:!0},origin:{get:function(){var f={"http:":80,"https:":443,"ftp:":21}[this._anchorElement.protocol],c=this._anchorElement.port!=f&&this._anchorElement.port!=="";return this._anchorElement.protocol+"//"+this._anchorElement.hostname+(c?":"+this._anchorElement.port:"")},enumerable:!0},password:{get:function(){return""},set:function(f){},enumerable:!0},username:{get:function(){return""},set:function(f){},enumerable:!0}}),i.createObjectURL=function(f){return o.createObjectURL.apply(o,arguments)},i.revokeObjectURL=function(f){return o.revokeObjectURL.apply(o,arguments)},e.URL=i};if(t()||r(),e.location!==void 0&&!("origin"in e.location)){var n=function(){return e.location.protocol+"//"+e.location.hostname+(e.location.port?":"+e.location.port:"")};try{Object.defineProperty(e.location,"origin",{get:n,enumerable:!0})}catch(o){setInterval(function(){e.location.origin=n()},100)}}})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Sr)});var Qr=Ht((Lt,Kr)=>{/*! 
+ * clipboard.js v2.0.11 + * https://clipboardjs.com/ + * + * Licensed MIT © Zeno Rocha + */(function(t,r){typeof Lt=="object"&&typeof Kr=="object"?Kr.exports=r():typeof define=="function"&&define.amd?define([],r):typeof Lt=="object"?Lt.ClipboardJS=r():t.ClipboardJS=r()})(Lt,function(){return function(){var e={686:function(n,o,i){"use strict";i.d(o,{default:function(){return ki}});var a=i(279),s=i.n(a),f=i(370),c=i.n(f),u=i(817),p=i.n(u);function m(j){try{return document.execCommand(j)}catch(O){return!1}}var d=function(O){var w=p()(O);return m("cut"),w},h=d;function v(j){var O=document.documentElement.getAttribute("dir")==="rtl",w=document.createElement("textarea");w.style.fontSize="12pt",w.style.border="0",w.style.padding="0",w.style.margin="0",w.style.position="absolute",w.style[O?"right":"left"]="-9999px";var k=window.pageYOffset||document.documentElement.scrollTop;return w.style.top="".concat(k,"px"),w.setAttribute("readonly",""),w.value=j,w}var B=function(O,w){var k=v(O);w.container.appendChild(k);var F=p()(k);return m("copy"),k.remove(),F},re=function(O){var w=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body},k="";return typeof O=="string"?k=B(O,w):O instanceof HTMLInputElement&&!["text","search","url","tel","password"].includes(O==null?void 0:O.type)?k=B(O.value,w):(k=p()(O),m("copy")),k},z=re;function T(j){return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?T=function(w){return typeof w}:T=function(w){return w&&typeof Symbol=="function"&&w.constructor===Symbol&&w!==Symbol.prototype?"symbol":typeof w},T(j)}var Ke=function(){var O=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{},w=O.action,k=w===void 0?"copy":w,F=O.container,q=O.target,Le=O.text;if(k!=="copy"&&k!=="cut")throw new Error('Invalid "action" value, use either "copy" or "cut"');if(q!==void 0)if(q&&T(q)==="object"&&q.nodeType===1){if(k==="copy"&&q.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. 
Please use "readonly" instead of "disabled" attribute');if(k==="cut"&&(q.hasAttribute("readonly")||q.hasAttribute("disabled")))throw new Error(`Invalid "target" attribute. You can't cut text from elements with "readonly" or "disabled" attributes`)}else throw new Error('Invalid "target" value, use a valid Element');if(Le)return z(Le,{container:F});if(q)return k==="cut"?h(q):z(q,{container:F})},We=Ke;function Ie(j){return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?Ie=function(w){return typeof w}:Ie=function(w){return w&&typeof Symbol=="function"&&w.constructor===Symbol&&w!==Symbol.prototype?"symbol":typeof w},Ie(j)}function Ti(j,O){if(!(j instanceof O))throw new TypeError("Cannot call a class as a function")}function nn(j,O){for(var w=0;w0&&arguments[0]!==void 0?arguments[0]:{};this.action=typeof F.action=="function"?F.action:this.defaultAction,this.target=typeof F.target=="function"?F.target:this.defaultTarget,this.text=typeof F.text=="function"?F.text:this.defaultText,this.container=Ie(F.container)==="object"?F.container:document.body}},{key:"listenClick",value:function(F){var q=this;this.listener=c()(F,"click",function(Le){return q.onClick(Le)})}},{key:"onClick",value:function(F){var q=F.delegateTarget||F.currentTarget,Le=this.action(q)||"copy",Rt=We({action:Le,container:this.container,target:this.target(q),text:this.text(q)});this.emit(Rt?"success":"error",{action:Le,text:Rt,trigger:q,clearSelection:function(){q&&q.focus(),window.getSelection().removeAllRanges()}})}},{key:"defaultAction",value:function(F){return yr("action",F)}},{key:"defaultTarget",value:function(F){var q=yr("target",F);if(q)return document.querySelector(q)}},{key:"defaultText",value:function(F){return yr("text",F)}},{key:"destroy",value:function(){this.listener.destroy()}}],[{key:"copy",value:function(F){var q=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body};return z(F,q)}},{key:"cut",value:function(F){return 
h(F)}},{key:"isSupported",value:function(){var F=arguments.length>0&&arguments[0]!==void 0?arguments[0]:["copy","cut"],q=typeof F=="string"?[F]:F,Le=!!document.queryCommandSupported;return q.forEach(function(Rt){Le=Le&&!!document.queryCommandSupported(Rt)}),Le}}]),w}(s()),ki=Ri},828:function(n){var o=9;if(typeof Element!="undefined"&&!Element.prototype.matches){var i=Element.prototype;i.matches=i.matchesSelector||i.mozMatchesSelector||i.msMatchesSelector||i.oMatchesSelector||i.webkitMatchesSelector}function a(s,f){for(;s&&s.nodeType!==o;){if(typeof s.matches=="function"&&s.matches(f))return s;s=s.parentNode}}n.exports=a},438:function(n,o,i){var a=i(828);function s(u,p,m,d,h){var v=c.apply(this,arguments);return u.addEventListener(m,v,h),{destroy:function(){u.removeEventListener(m,v,h)}}}function f(u,p,m,d,h){return typeof u.addEventListener=="function"?s.apply(null,arguments):typeof m=="function"?s.bind(null,document).apply(null,arguments):(typeof u=="string"&&(u=document.querySelectorAll(u)),Array.prototype.map.call(u,function(v){return s(v,p,m,d,h)}))}function c(u,p,m,d){return function(h){h.delegateTarget=a(h.target,p),h.delegateTarget&&d.call(u,h)}}n.exports=f},879:function(n,o){o.node=function(i){return i!==void 0&&i instanceof HTMLElement&&i.nodeType===1},o.nodeList=function(i){var a=Object.prototype.toString.call(i);return i!==void 0&&(a==="[object NodeList]"||a==="[object HTMLCollection]")&&"length"in i&&(i.length===0||o.node(i[0]))},o.string=function(i){return typeof i=="string"||i instanceof String},o.fn=function(i){var a=Object.prototype.toString.call(i);return a==="[object Function]"}},370:function(n,o,i){var a=i(879),s=i(438);function f(m,d,h){if(!m&&!d&&!h)throw new Error("Missing required arguments");if(!a.string(d))throw new TypeError("Second argument must be a String");if(!a.fn(h))throw new TypeError("Third argument must be a Function");if(a.node(m))return c(m,d,h);if(a.nodeList(m))return u(m,d,h);if(a.string(m))return p(m,d,h);throw new 
TypeError("First argument must be a String, HTMLElement, HTMLCollection, or NodeList")}function c(m,d,h){return m.addEventListener(d,h),{destroy:function(){m.removeEventListener(d,h)}}}function u(m,d,h){return Array.prototype.forEach.call(m,function(v){v.addEventListener(d,h)}),{destroy:function(){Array.prototype.forEach.call(m,function(v){v.removeEventListener(d,h)})}}}function p(m,d,h){return s(document.body,m,d,h)}n.exports=f},817:function(n){function o(i){var a;if(i.nodeName==="SELECT")i.focus(),a=i.value;else if(i.nodeName==="INPUT"||i.nodeName==="TEXTAREA"){var s=i.hasAttribute("readonly");s||i.setAttribute("readonly",""),i.select(),i.setSelectionRange(0,i.value.length),s||i.removeAttribute("readonly"),a=i.value}else{i.hasAttribute("contenteditable")&&i.focus();var f=window.getSelection(),c=document.createRange();c.selectNodeContents(i),f.removeAllRanges(),f.addRange(c),a=f.toString()}return a}n.exports=o},279:function(n){function o(){}o.prototype={on:function(i,a,s){var f=this.e||(this.e={});return(f[i]||(f[i]=[])).push({fn:a,ctx:s}),this},once:function(i,a,s){var f=this;function c(){f.off(i,c),a.apply(s,arguments)}return c._=a,this.on(i,c,s)},emit:function(i){var a=[].slice.call(arguments,1),s=((this.e||(this.e={}))[i]||[]).slice(),f=0,c=s.length;for(f;f{"use strict";/*! 
+ * escape-html + * Copyright(c) 2012-2013 TJ Holowaychuk + * Copyright(c) 2015 Andreas Lubbe + * Copyright(c) 2015 Tiancheng "Timothy" Gu + * MIT Licensed + */var is=/["'&<>]/;Jo.exports=as;function as(e){var t=""+e,r=is.exec(t);if(!r)return t;var n,o="",i=0,a=0;for(i=r.index;i0&&i[i.length-1])&&(c[0]===6||c[0]===2)){r=0;continue}if(c[0]===3&&(!i||c[1]>i[0]&&c[1]=e.length&&(e=void 0),{value:e&&e[n++],done:!e}}};throw new TypeError(t?"Object is not iterable.":"Symbol.iterator is not defined.")}function W(e,t){var r=typeof Symbol=="function"&&e[Symbol.iterator];if(!r)return e;var n=r.call(e),o,i=[],a;try{for(;(t===void 0||t-- >0)&&!(o=n.next()).done;)i.push(o.value)}catch(s){a={error:s}}finally{try{o&&!o.done&&(r=n.return)&&r.call(n)}finally{if(a)throw a.error}}return i}function D(e,t,r){if(r||arguments.length===2)for(var n=0,o=t.length,i;n1||s(m,d)})})}function s(m,d){try{f(n[m](d))}catch(h){p(i[0][3],h)}}function f(m){m.value instanceof Xe?Promise.resolve(m.value.v).then(c,u):p(i[0][2],m)}function c(m){s("next",m)}function u(m){s("throw",m)}function p(m,d){m(d),i.shift(),i.length&&s(i[0][0],i[0][1])}}function mn(e){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var t=e[Symbol.asyncIterator],r;return t?t.call(e):(e=typeof xe=="function"?xe(e):e[Symbol.iterator](),r={},n("next"),n("throw"),n("return"),r[Symbol.asyncIterator]=function(){return this},r);function n(i){r[i]=e[i]&&function(a){return new Promise(function(s,f){a=e[i](a),o(s,f,a.done,a.value)})}}function o(i,a,s,f){Promise.resolve(f).then(function(c){i({value:c,done:s})},a)}}function A(e){return typeof e=="function"}function at(e){var t=function(n){Error.call(n),n.stack=new Error().stack},r=e(t);return r.prototype=Object.create(Error.prototype),r.prototype.constructor=r,r}var $t=at(function(e){return function(r){e(this),this.message=r?r.length+` errors occurred during unsubscription: +`+r.map(function(n,o){return o+1+") "+n.toString()}).join(` + 
`):"",this.name="UnsubscriptionError",this.errors=r}});function De(e,t){if(e){var r=e.indexOf(t);0<=r&&e.splice(r,1)}}var Fe=function(){function e(t){this.initialTeardown=t,this.closed=!1,this._parentage=null,this._finalizers=null}return e.prototype.unsubscribe=function(){var t,r,n,o,i;if(!this.closed){this.closed=!0;var a=this._parentage;if(a)if(this._parentage=null,Array.isArray(a))try{for(var s=xe(a),f=s.next();!f.done;f=s.next()){var c=f.value;c.remove(this)}}catch(v){t={error:v}}finally{try{f&&!f.done&&(r=s.return)&&r.call(s)}finally{if(t)throw t.error}}else a.remove(this);var u=this.initialTeardown;if(A(u))try{u()}catch(v){i=v instanceof $t?v.errors:[v]}var p=this._finalizers;if(p){this._finalizers=null;try{for(var m=xe(p),d=m.next();!d.done;d=m.next()){var h=d.value;try{dn(h)}catch(v){i=i!=null?i:[],v instanceof $t?i=D(D([],W(i)),W(v.errors)):i.push(v)}}}catch(v){n={error:v}}finally{try{d&&!d.done&&(o=m.return)&&o.call(m)}finally{if(n)throw n.error}}}if(i)throw new $t(i)}},e.prototype.add=function(t){var r;if(t&&t!==this)if(this.closed)dn(t);else{if(t instanceof e){if(t.closed||t._hasParent(this))return;t._addParent(this)}(this._finalizers=(r=this._finalizers)!==null&&r!==void 0?r:[]).push(t)}},e.prototype._hasParent=function(t){var r=this._parentage;return r===t||Array.isArray(r)&&r.includes(t)},e.prototype._addParent=function(t){var r=this._parentage;this._parentage=Array.isArray(r)?(r.push(t),r):r?[r,t]:t},e.prototype._removeParent=function(t){var r=this._parentage;r===t?this._parentage=null:Array.isArray(r)&&De(r,t)},e.prototype.remove=function(t){var r=this._finalizers;r&&De(r,t),t instanceof e&&t._removeParent(this)},e.EMPTY=function(){var t=new e;return t.closed=!0,t}(),e}();var Or=Fe.EMPTY;function It(e){return e instanceof Fe||e&&"closed"in e&&A(e.remove)&&A(e.add)&&A(e.unsubscribe)}function dn(e){A(e)?e():e.unsubscribe()}var Ae={onUnhandledError:null,onStoppedNotification:null,Promise:void 
0,useDeprecatedSynchronousErrorHandling:!1,useDeprecatedNextContext:!1};var st={setTimeout:function(e,t){for(var r=[],n=2;n0},enumerable:!1,configurable:!0}),t.prototype._trySubscribe=function(r){return this._throwIfClosed(),e.prototype._trySubscribe.call(this,r)},t.prototype._subscribe=function(r){return this._throwIfClosed(),this._checkFinalizedStatuses(r),this._innerSubscribe(r)},t.prototype._innerSubscribe=function(r){var n=this,o=this,i=o.hasError,a=o.isStopped,s=o.observers;return i||a?Or:(this.currentObservers=null,s.push(r),new Fe(function(){n.currentObservers=null,De(s,r)}))},t.prototype._checkFinalizedStatuses=function(r){var n=this,o=n.hasError,i=n.thrownError,a=n.isStopped;o?r.error(i):a&&r.complete()},t.prototype.asObservable=function(){var r=new U;return r.source=this,r},t.create=function(r,n){return new wn(r,n)},t}(U);var wn=function(e){ne(t,e);function t(r,n){var o=e.call(this)||this;return o.destination=r,o.source=n,o}return t.prototype.next=function(r){var n,o;(o=(n=this.destination)===null||n===void 0?void 0:n.next)===null||o===void 0||o.call(n,r)},t.prototype.error=function(r){var n,o;(o=(n=this.destination)===null||n===void 0?void 0:n.error)===null||o===void 0||o.call(n,r)},t.prototype.complete=function(){var r,n;(n=(r=this.destination)===null||r===void 0?void 0:r.complete)===null||n===void 0||n.call(r)},t.prototype._subscribe=function(r){var n,o;return(o=(n=this.source)===null||n===void 0?void 0:n.subscribe(r))!==null&&o!==void 0?o:Or},t}(E);var Et={now:function(){return(Et.delegate||Date).now()},delegate:void 0};var wt=function(e){ne(t,e);function t(r,n,o){r===void 0&&(r=1/0),n===void 0&&(n=1/0),o===void 0&&(o=Et);var i=e.call(this)||this;return i._bufferSize=r,i._windowTime=n,i._timestampProvider=o,i._buffer=[],i._infiniteTimeWindow=!0,i._infiniteTimeWindow=n===1/0,i._bufferSize=Math.max(1,r),i._windowTime=Math.max(1,n),i}return t.prototype.next=function(r){var 
n=this,o=n.isStopped,i=n._buffer,a=n._infiniteTimeWindow,s=n._timestampProvider,f=n._windowTime;o||(i.push(r),!a&&i.push(s.now()+f)),this._trimBuffer(),e.prototype.next.call(this,r)},t.prototype._subscribe=function(r){this._throwIfClosed(),this._trimBuffer();for(var n=this._innerSubscribe(r),o=this,i=o._infiniteTimeWindow,a=o._buffer,s=a.slice(),f=0;f0?e.prototype.requestAsyncId.call(this,r,n,o):(r.actions.push(this),r._scheduled||(r._scheduled=ut.requestAnimationFrame(function(){return r.flush(void 0)})))},t.prototype.recycleAsyncId=function(r,n,o){var i;if(o===void 0&&(o=0),o!=null?o>0:this.delay>0)return e.prototype.recycleAsyncId.call(this,r,n,o);var a=r.actions;n!=null&&((i=a[a.length-1])===null||i===void 0?void 0:i.id)!==n&&(ut.cancelAnimationFrame(n),r._scheduled=void 0)},t}(Ut);var On=function(e){ne(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t.prototype.flush=function(r){this._active=!0;var n=this._scheduled;this._scheduled=void 0;var o=this.actions,i;r=r||o.shift();do if(i=r.execute(r.state,r.delay))break;while((r=o[0])&&r.id===n&&o.shift());if(this._active=!1,i){for(;(r=o[0])&&r.id===n&&o.shift();)r.unsubscribe();throw i}},t}(Wt);var we=new On(Tn);var R=new U(function(e){return e.complete()});function Dt(e){return e&&A(e.schedule)}function kr(e){return e[e.length-1]}function Qe(e){return A(kr(e))?e.pop():void 0}function Se(e){return Dt(kr(e))?e.pop():void 0}function Vt(e,t){return typeof kr(e)=="number"?e.pop():t}var pt=function(e){return e&&typeof e.length=="number"&&typeof e!="function"};function zt(e){return A(e==null?void 0:e.then)}function Nt(e){return A(e[ft])}function qt(e){return Symbol.asyncIterator&&A(e==null?void 0:e[Symbol.asyncIterator])}function Kt(e){return new TypeError("You provided "+(e!==null&&typeof e=="object"?"an invalid object":"'"+e+"'")+" where a stream was expected. 
You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.")}function Ki(){return typeof Symbol!="function"||!Symbol.iterator?"@@iterator":Symbol.iterator}var Qt=Ki();function Yt(e){return A(e==null?void 0:e[Qt])}function Gt(e){return ln(this,arguments,function(){var r,n,o,i;return Pt(this,function(a){switch(a.label){case 0:r=e.getReader(),a.label=1;case 1:a.trys.push([1,,9,10]),a.label=2;case 2:return[4,Xe(r.read())];case 3:return n=a.sent(),o=n.value,i=n.done,i?[4,Xe(void 0)]:[3,5];case 4:return[2,a.sent()];case 5:return[4,Xe(o)];case 6:return[4,a.sent()];case 7:return a.sent(),[3,2];case 8:return[3,10];case 9:return r.releaseLock(),[7];case 10:return[2]}})})}function Bt(e){return A(e==null?void 0:e.getReader)}function $(e){if(e instanceof U)return e;if(e!=null){if(Nt(e))return Qi(e);if(pt(e))return Yi(e);if(zt(e))return Gi(e);if(qt(e))return _n(e);if(Yt(e))return Bi(e);if(Bt(e))return Ji(e)}throw Kt(e)}function Qi(e){return new U(function(t){var r=e[ft]();if(A(r.subscribe))return r.subscribe(t);throw new TypeError("Provided object does not correctly implement Symbol.observable")})}function Yi(e){return new U(function(t){for(var r=0;r=2;return function(n){return n.pipe(e?_(function(o,i){return e(o,i,n)}):me,Oe(1),r?He(t):zn(function(){return new Xt}))}}function Nn(){for(var e=[],t=0;t=2,!0))}function fe(e){e===void 0&&(e={});var t=e.connector,r=t===void 0?function(){return new E}:t,n=e.resetOnError,o=n===void 0?!0:n,i=e.resetOnComplete,a=i===void 0?!0:i,s=e.resetOnRefCountZero,f=s===void 0?!0:s;return function(c){var u,p,m,d=0,h=!1,v=!1,B=function(){p==null||p.unsubscribe(),p=void 0},re=function(){B(),u=m=void 0,h=v=!1},z=function(){var T=u;re(),T==null||T.unsubscribe()};return g(function(T,Ke){d++,!v&&!h&&B();var We=m=m!=null?m:r();Ke.add(function(){d--,d===0&&!v&&!h&&(p=jr(z,f))}),We.subscribe(Ke),!u&&d>0&&(u=new et({next:function(Ie){return 
We.next(Ie)},error:function(Ie){v=!0,B(),p=jr(re,o,Ie),We.error(Ie)},complete:function(){h=!0,B(),p=jr(re,a),We.complete()}}),$(T).subscribe(u))})(c)}}function jr(e,t){for(var r=[],n=2;ne.next(document)),e}function K(e,t=document){return Array.from(t.querySelectorAll(e))}function V(e,t=document){let r=se(e,t);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${e}" to be present`);return r}function se(e,t=document){return t.querySelector(e)||void 0}function _e(){return document.activeElement instanceof HTMLElement&&document.activeElement||void 0}function tr(e){return L(b(document.body,"focusin"),b(document.body,"focusout")).pipe(ke(1),l(()=>{let t=_e();return typeof t!="undefined"?e.contains(t):!1}),N(e===_e()),Y())}function Be(e){return{x:e.offsetLeft,y:e.offsetTop}}function Yn(e){return L(b(window,"load"),b(window,"resize")).pipe(Ce(0,we),l(()=>Be(e)),N(Be(e)))}function rr(e){return{x:e.scrollLeft,y:e.scrollTop}}function dt(e){return L(b(e,"scroll"),b(window,"resize")).pipe(Ce(0,we),l(()=>rr(e)),N(rr(e)))}var Bn=function(){if(typeof Map!="undefined")return Map;function e(t,r){var n=-1;return t.some(function(o,i){return o[0]===r?(n=i,!0):!1}),n}return function(){function t(){this.__entries__=[]}return Object.defineProperty(t.prototype,"size",{get:function(){return this.__entries__.length},enumerable:!0,configurable:!0}),t.prototype.get=function(r){var n=e(this.__entries__,r),o=this.__entries__[n];return o&&o[1]},t.prototype.set=function(r,n){var o=e(this.__entries__,r);~o?this.__entries__[o][1]=n:this.__entries__.push([r,n])},t.prototype.delete=function(r){var n=this.__entries__,o=e(n,r);~o&&n.splice(o,1)},t.prototype.has=function(r){return!!~e(this.__entries__,r)},t.prototype.clear=function(){this.__entries__.splice(0)},t.prototype.forEach=function(r,n){n===void 0&&(n=null);for(var 
o=0,i=this.__entries__;o0},e.prototype.connect_=function(){!zr||this.connected_||(document.addEventListener("transitionend",this.onTransitionEnd_),window.addEventListener("resize",this.refresh),xa?(this.mutationsObserver_=new MutationObserver(this.refresh),this.mutationsObserver_.observe(document,{attributes:!0,childList:!0,characterData:!0,subtree:!0})):(document.addEventListener("DOMSubtreeModified",this.refresh),this.mutationEventsAdded_=!0),this.connected_=!0)},e.prototype.disconnect_=function(){!zr||!this.connected_||(document.removeEventListener("transitionend",this.onTransitionEnd_),window.removeEventListener("resize",this.refresh),this.mutationsObserver_&&this.mutationsObserver_.disconnect(),this.mutationEventsAdded_&&document.removeEventListener("DOMSubtreeModified",this.refresh),this.mutationsObserver_=null,this.mutationEventsAdded_=!1,this.connected_=!1)},e.prototype.onTransitionEnd_=function(t){var r=t.propertyName,n=r===void 0?"":r,o=ya.some(function(i){return!!~n.indexOf(i)});o&&this.refresh()},e.getInstance=function(){return this.instance_||(this.instance_=new e),this.instance_},e.instance_=null,e}(),Jn=function(e,t){for(var r=0,n=Object.keys(t);r0},e}(),Zn=typeof WeakMap!="undefined"?new WeakMap:new Bn,eo=function(){function e(t){if(!(this instanceof e))throw new TypeError("Cannot call a class as a function.");if(!arguments.length)throw new TypeError("1 argument required, but only 0 present.");var r=Ea.getInstance(),n=new Ra(t,r,this);Zn.set(this,n)}return e}();["observe","unobserve","disconnect"].forEach(function(e){eo.prototype[e]=function(){var t;return(t=Zn.get(this))[e].apply(t,arguments)}});var ka=function(){return typeof nr.ResizeObserver!="undefined"?nr.ResizeObserver:eo}(),to=ka;var ro=new E,Ha=I(()=>H(new to(e=>{for(let t of e)ro.next(t)}))).pipe(x(e=>L(Te,H(e)).pipe(C(()=>e.disconnect()))),J(1));function de(e){return{width:e.offsetWidth,height:e.offsetHeight}}function ge(e){return 
Ha.pipe(S(t=>t.observe(e)),x(t=>ro.pipe(_(({target:r})=>r===e),C(()=>t.unobserve(e)),l(()=>de(e)))),N(de(e)))}function bt(e){return{width:e.scrollWidth,height:e.scrollHeight}}function ar(e){let t=e.parentElement;for(;t&&(e.scrollWidth<=t.scrollWidth&&e.scrollHeight<=t.scrollHeight);)t=(e=t).parentElement;return t?e:void 0}var no=new E,Pa=I(()=>H(new IntersectionObserver(e=>{for(let t of e)no.next(t)},{threshold:0}))).pipe(x(e=>L(Te,H(e)).pipe(C(()=>e.disconnect()))),J(1));function sr(e){return Pa.pipe(S(t=>t.observe(e)),x(t=>no.pipe(_(({target:r})=>r===e),C(()=>t.unobserve(e)),l(({isIntersecting:r})=>r))))}function oo(e,t=16){return dt(e).pipe(l(({y:r})=>{let n=de(e),o=bt(e);return r>=o.height-n.height-t}),Y())}var cr={drawer:V("[data-md-toggle=drawer]"),search:V("[data-md-toggle=search]")};function io(e){return cr[e].checked}function qe(e,t){cr[e].checked!==t&&cr[e].click()}function je(e){let t=cr[e];return b(t,"change").pipe(l(()=>t.checked),N(t.checked))}function $a(e,t){switch(e.constructor){case HTMLInputElement:return e.type==="radio"?/^Arrow/.test(t):!0;case HTMLSelectElement:case HTMLTextAreaElement:return!0;default:return e.isContentEditable}}function Ia(){return L(b(window,"compositionstart").pipe(l(()=>!0)),b(window,"compositionend").pipe(l(()=>!1))).pipe(N(!1))}function ao(){let e=b(window,"keydown").pipe(_(t=>!(t.metaKey||t.ctrlKey)),l(t=>({mode:io("search")?"search":"global",type:t.key,claim(){t.preventDefault(),t.stopPropagation()}})),_(({mode:t,type:r})=>{if(t==="global"){let n=_e();if(typeof n!="undefined")return!$a(n,r)}return!0}),fe());return Ia().pipe(x(t=>t?R:e))}function Me(){return new URL(location.href)}function ot(e){location.href=e.href}function so(){return new E}function co(e,t){if(typeof t=="string"||typeof t=="number")e.innerHTML+=t.toString();else if(t instanceof Node)e.appendChild(t);else if(Array.isArray(t))for(let r of t)co(e,r)}function M(e,t,...r){let n=document.createElement(e);if(t)for(let o of Object.keys(t))typeof 
t[o]!="undefined"&&(typeof t[o]!="boolean"?n.setAttribute(o,t[o]):n.setAttribute(o,""));for(let o of r)co(n,o);return n}function fr(e){if(e>999){let t=+((e-950)%1e3>99);return`${((e+1e-6)/1e3).toFixed(t)}k`}else return e.toString()}function fo(){return location.hash.substring(1)}function uo(e){let t=M("a",{href:e});t.addEventListener("click",r=>r.stopPropagation()),t.click()}function Fa(){return b(window,"hashchange").pipe(l(fo),N(fo()),_(e=>e.length>0),J(1))}function po(){return Fa().pipe(l(e=>se(`[id="${e}"]`)),_(e=>typeof e!="undefined"))}function Nr(e){let t=matchMedia(e);return Zt(r=>t.addListener(()=>r(t.matches))).pipe(N(t.matches))}function lo(){let e=matchMedia("print");return L(b(window,"beforeprint").pipe(l(()=>!0)),b(window,"afterprint").pipe(l(()=>!1))).pipe(N(e.matches))}function qr(e,t){return e.pipe(x(r=>r?t():R))}function ur(e,t={credentials:"same-origin"}){return ve(fetch(`${e}`,t)).pipe(ce(()=>R),x(r=>r.status!==200?Tt(()=>new Error(r.statusText)):H(r)))}function Ue(e,t){return ur(e,t).pipe(x(r=>r.json()),J(1))}function mo(e,t){let r=new DOMParser;return ur(e,t).pipe(x(n=>n.text()),l(n=>r.parseFromString(n,"text/xml")),J(1))}function pr(e){let t=M("script",{src:e});return I(()=>(document.head.appendChild(t),L(b(t,"load"),b(t,"error").pipe(x(()=>Tt(()=>new ReferenceError(`Invalid script: ${e}`))))).pipe(l(()=>{}),C(()=>document.head.removeChild(t)),Oe(1))))}function ho(){return{x:Math.max(0,scrollX),y:Math.max(0,scrollY)}}function bo(){return L(b(window,"scroll",{passive:!0}),b(window,"resize",{passive:!0})).pipe(l(ho),N(ho()))}function vo(){return{width:innerWidth,height:innerHeight}}function go(){return b(window,"resize",{passive:!0}).pipe(l(vo),N(vo()))}function yo(){return Q([bo(),go()]).pipe(l(([e,t])=>({offset:e,size:t})),J(1))}function lr(e,{viewport$:t,header$:r}){let n=t.pipe(X("size")),o=Q([n,r]).pipe(l(()=>Be(e)));return Q([r,t,o]).pipe(l(([{height:i},{offset:a,size:s},{x:f,y:c}])=>({offset:{x:a.x-f,y:a.y-c+i},size:s})))}(()=>{function 
e(n,o){parent.postMessage(n,o||"*")}function t(...n){return n.reduce((o,i)=>o.then(()=>new Promise(a=>{let s=document.createElement("script");s.src=i,s.onload=a,document.body.appendChild(s)})),Promise.resolve())}var r=class{constructor(n){this.url=n,this.onerror=null,this.onmessage=null,this.onmessageerror=null,this.m=a=>{a.source===this.w&&(a.stopImmediatePropagation(),this.dispatchEvent(new MessageEvent("message",{data:a.data})),this.onmessage&&this.onmessage(a))},this.e=(a,s,f,c,u)=>{if(s===this.url.toString()){let p=new ErrorEvent("error",{message:a,filename:s,lineno:f,colno:c,error:u});this.dispatchEvent(p),this.onerror&&this.onerror(p)}};let o=new EventTarget;this.addEventListener=o.addEventListener.bind(o),this.removeEventListener=o.removeEventListener.bind(o),this.dispatchEvent=o.dispatchEvent.bind(o);let i=document.createElement("iframe");i.width=i.height=i.frameBorder="0",document.body.appendChild(this.iframe=i),this.w.document.open(),this.w.document.write(` + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

OSCAR

+

OSCAR LogoThe OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. The OSCAR project has developed high-performance data pipelines specifically conceived to classify and filter large amounts of web data. The project has also paid special attention to improving the data quality of web-based corpora as well as providing data for low-resource languages, so that these new ML/AI technologies are accessible to as many communities as possible.

+

Getting access +Latest version +Quickstart guide

+
+

Info

+

The new ✨ OSCAR 2301 ✨ is available! 🥳

+
+

This website aims to gather information about the corpus from a technical point of view:

+
    +
  • Corpus versions and their respective file formats.
  • +
  • Tools and pipelines, how to install and use them.
  • +
  • More general documentation and how to contribute.
  • +
+ + + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/javascripts/tablesort.js b/javascripts/tablesort.js new file mode 100644 index 0000000..ee04e90 --- /dev/null +++ b/javascripts/tablesort.js @@ -0,0 +1,6 @@ +document$.subscribe(function () { + var tables = document.querySelectorAll("article table:not([class])") + tables.forEach(function (table) { + new Tablesort(table) + }) +}) \ No newline at end of file diff --git a/quickstart/index.html b/quickstart/index.html new file mode 100644 index 0000000..c1f335f --- /dev/null +++ b/quickstart/index.html @@ -0,0 +1,800 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + OSCAR Quickstart - OSCAR Documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

OSCAR Quickstart

+

What is OSCAR?

+

OSCAR is a collection of web-based multilingual corpora of several terabytes, containing subcorpora in more than 150 languages.

+

Each OSCAR Corpus has a version name that tells you its approximate generation time, which usually coincides with the source crawl time. +The latest OSCAR Corpus is OSCAR 2301. +We advise you to always use the latest version, as we incrementally include new features that enable new ways of filtering the corpus for your applications.

+

Basic data layout

+

OSCAR is, since OSCAR 2109, document-oriented, which means that subcorpora are comprised of documents rather than individual lines.

+

This has important implications as to how to preprocess the data:

+

You can (and will) find sentences in other languages than the one you're interested in. For example, it is expected to encounter English sentences in documents from the French subcorpus.

+
+

Example

+

The Wikipedia article about the French anthem, La Marseillaise, contains its lyrics in French. +As such, this article is expected to be present in the English subcorpus with those French lyrics.

+

The good news is that you can easily remove those sentences if you are not interested in them, thanks to the metadata provided alongside the main content.

+
+

OSCAR is distributed in JSONLines files, usually compressed (gzip, zstd depending on the version).

+

Each line of a file is a JSON Object representing a single document. +Here is an example from OSCAR 2301:

+
{
+   "content":"English sentence\nphrase en français\n????????????", // (1)
+   "warc_headers":{ // (2)
+      "warc-identified-content-language":"fra,eng",
+      "warc-target-uri":"https://fr.wikipedia.org/wiki/...",
+      "warc-record-id":"<urn:uuid:29eaa920-d299-4b1d-b687-c72bd8d68116>",
+      "warc-type":"conversion",
+      "content-length":"35298", // (3)
+      "warc-refers-to":"<urn:uuid:39e42055-0d94-4e45-9c6c-9e7056635d64>",
+      "warc-block-digest":"sha1:WFH2A5WHCS2H365GIAFYQPI7UOAMFGHB", // (3)
+      "warc-date":"2022-11-26T09:45:47Z",
+      "content-type":"text/plain"
+   },
+   "metadata":{
+      "identification":{ // (4)
+         "label":"fr",
+         "prob":0.8938327
+      },
+      "harmful_pp":4063.1814, // (5)
+      "tlsh":"tlsh:T125315FF2B6088901EEA097015DB39B4600B...", // (6)
+      "quality_warnings":[ // (7)
+         "short_sentences",
+         "header",
+         "footer"
+      ],
+      "categories":[ // (8)
+         "examen_pix",
+         "liste_bu"
+      ],
+      "sentence_identifications":[ // (9)
+         {
+            "label":"fr",
+            "prob":0.99837273
+         },
+         {
+            "label":"en",
+            "prob":0.9992377
+         },
+         null
+      ]
+   }
+}
+
+
    +
  1. Newline-separated content.
  2. +
  3. Headers from the crawled dumps are left untouched. See the WARC specification for more info.
  4. +
  5. Since warc_headers are copied and content can be altered by Ungoliant at generation stage, content-length and warc-block-digest can be different from actual values.
  6. +
  7. Document-level identification. Computation details can be found in the OSCAR 22.01 paper.
  8. +
  9. Perplexity of the document, computed using a KenLM model trained on harmful content. See this pre-print for more info. The lower this number is, the higher the probability that it will contain harmful or adult content. This annotation will be changed from harmful_pp to harmful_ppl in future releases.
  10. +
  11. Locality Sensitive Hash of the documents' content, using TLSH. Useful for both exact and near deduplication.
  12. +
  13. (Corresponds to annotations pre-23.01) Potential quality warnings. Based on content/sentence length. See the OSCAR 22.01 paper for more info.
  14. +
  15. Blocklist-based categories. Uses the UT1 Blocklist, plus custom additions. Please refer to the UT1 website for categories description. Note that the categories are in French.
  16. +
  17. Sentence-level identifications. A null value means no identification with a good enough threshold (>0.8 on 23.01).
  18. +
+

Getting access

+

There are different ways of getting access to OSCAR depending on your status! Head on to our dedicated page.

+

Using the corpus

+

TODO

+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/schema/schema-v2/index.html b/schema/schema-v2/index.html new file mode 100644 index 0000000..8a2c07e --- /dev/null +++ b/schema/schema-v2/index.html @@ -0,0 +1,937 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + OSCAR Schema v2 - OSCAR Documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

OSCAR Schema v2

+

OSCAR (Open Super-large Crawled Aggregated coRpus) is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.

+

The new OSCAR schema is a major, breaking change from the previous ones, adding more metadata and laying out documents rather than lines.

+

Changes

+

OSCAR Schema v2 groups text and metadata in JSON dictionaries, in JSONLines format.

+
/
+├── af
+   ├── af_sha256.txt
+   └── af.jsonl.gz
+├── de
+   ├── de_sha256.txt    # Checksum file 
+   └── de.jsonl.gz        # Textual content
+├── en
+   ├── en_part_1.jsonl.gz        # Multipart example
+   ├── en_part_2.jsonl.gz
+   └── en_sha256.txt
+├── yi
+   ├── yi_sha256.txt
+   └── yi.jsonl.gz
+└── zh
+    ├── zh_sha256.txt
+    └── zh.jsonl.gz
+
+

File formats

+

.jsonl files

+

These are the metadata, in JSONLines format.

+

Each line follows the following JSON Schema:

+
{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Document",
+  "description": "Serializable version of [Document].",
+  "type": "object",
+  "required": [
+    "content",
+    "metadata",
+    "warc_headers"
+  ],
+  "properties": {
+    "content": {
+      "type": "string"
+    },
+    "metadata": {
+      "$ref": "#/definitions/Metadata"
+    },
+    "warc_headers": {
+      "type": "object",
+      "additionalProperties": {
+        "type": "string"
+      }
+    }
+  },
+  "definitions": {
+    "Identification": {
+      "type": "object",
+      "required": [
+        "label",
+        "prob"
+      ],
+      "properties": {
+        "label": {
+          "$ref": "#/definitions/Lang"
+        },
+        "prob": {
+          "type": "number",
+          "format": "float"
+        }
+      }
+    },
+    "Lang": {
+      "type": "string",
+      "enum": [
+        "Af",
+        "Als",
+        "...",
+        "Yue",
+        "Zh",
+        "Multi"
+      ]
+    },
+    "Metadata": {
+      "description": "OSCAR-specific metadata",
+      "type": "object",
+      "required": [
+        "identification",
+        "sentence_identifications"
+      ],
+      "properties": {
+        "annotation": {
+          "type": [
+            "array",
+            "null"
+          ],
+          "items": {
+            "type": "string"
+          }
+        },
+        "identification": {
+          "$ref": "#/definitions/Identification"
+        },
+        "sentence_identifications": {
+          "type": "array",
+          "items": {
+            "anyOf": [
+              {
+                "$ref": "#/definitions/Identification"
+              },
+              {
+                "type": "null"
+              }
+            ]
+          }
+        }
+      }
+    }
+  }
+}
+
+

Example: +

{
+
+  // text is here, separated by \n characters that have been removed here for readability
+  "content": "Adopt-a-user Home • Talk || Adoptee's Area • Resources || Adopter's Area • Resources • List of Adopters || Teahouse || Live Help Chat (IRC)\n
+  Shortcuts\n
+  WP:AAU\n
+  WP:ADOPT\n
+  WP:ADOPTION\n
+  WP:WIKIADOPT\n
+  The Adopt-a-user program is designed to help new and inexperienced users by pairing them with more experienced Wikipedians. These editors (referred to as adopters or mentors) will \"adopt\" newer users, guiding them along the way as they learn about Wikipedia and its various aspects.\n
+  The project aims to inform new users about the ins and outs of Wikipedia and steer them away from making less-than-constructive edits or misplaced test edits. Well over a thousand users have been involved in the program at one time or another.\n
+  So, if you're new or inexperienced and would like to:\nAsk questions about editing, contributing to Wikipedia and creating your first article\n
+  Learn to navigate processes and policies and guidelines\n
+  Get help with article creation or image uploads or any other activities on Wikipedia\n
+  . . .then an adopter should be able to help you. Adoption lasts as long as the adopter and adoptee want to continue, so you can stop any time if you feel you've learned enough, or you'd like to take a break.\n
+  If you are looking to contribute to Wikipedia but do not intend to remain as an active user well after adoption, then this program is not for you. Adoption is for users who intend to be long-term contributors and members of the community, so if you are simply here to create one article, see this page for help and do not request adoption.\n
+  Users who don't want adopting – but who do need help with one-off problems – might like to consider whether the Teahouse question forum, the Help desk, or a {{Help me}} request might be better ways to get quick answers.\n
+  Participation\n
+  Being adopted is easy and fun. Why not select an adopter from the list of adopters and contact them directly to request adoption? If you choose an adopter who shares your interests, they will be more able to assist you while you learn under their tutelage.\n
+  View the list of adopters!\n
+  ...",
+
+  // WARC Headers are extracted and put there untouched.
+  // The content-length should not be understood as the current document length, but as the original document length.
+  "warc_headers": {
+    "warc-block-digest": "sha1:U2OJPXXE3JCPSLAB6UPB3TEGBDHKPTAO",
+    "warc-record-id": "<urn:uuid:fec8808f-96ef-4ae5-8a57-df5b44e42dcf>",
+    "warc-identified-content-language": "eng,nno",
+    "content-type": "text/plain",
+    "warc-refers-to": "<urn:uuid:2f59440d-3700-418c-aa94-5c63bab316c3>",
+    "warc-date": "2021-09-16T12:40:45Z",
+    "warc-target-uri": "https://en.wikipedia.org/wiki/Wikipedia:Adopt-a-user",
+    "content-length": "5385",
+    "warc-type": "conversion"
+  },
+
+  // OSCAR metadata
+  "metadata": {
+
+    // Document identification
+    "identification": {
+      "label": "en",
+      "prob": 0.6775619
+    },
+
+    // Annotations of the document
+    "annotation": [
+      "short_sentences",
+      "header",
+      "footer"
+    ],
+
+    // Sentence identifications.
+    // null: identification confidence too low (<0.8)
+    // There is exactly one identification per line.
+    "sentence_identifications": [
+      null,
+      null,
+      {
+        "label": "en",
+        "prob": 0.89475197
+      },
+      {
+        "label": "en",
+        "prob": 0.9124037
+      },
+      {
+        "label": "en",
+        "prob": 0.8080786
+      },
+      null,
+      {
+        "label": "en",
+        "prob": 0.9665413
+      }, 
+    ]
+  }
+}
+

+

<lang>_sha256.txt files

+

These are used to check for possible corruption during download. +They can be used by running sha256sum -c <lang>_sha256.txt.

+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/search/search_index.json b/search/search_index.json new file mode 100644 index 0000000..52561a8 --- /dev/null +++ b/search/search_index.json @@ -0,0 +1 @@ +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"OSCAR","text":"

The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. The OSCAR project has developed high-performance data pipelines specifically conceived to classify and filter large amounts of web data. The project has also paid special attention to improving the data quality of web-based corpora as well as providing data for low-resource languages, so that these new ML/AI technologies are accessible to as many communities as possible.

Getting access Latest version Quickstart guide

Info

The new OSCAR 2301 is available!

This website aims to gather information about the corpus from a technical point of view:

  • Corpus versions and their respective file formats.
  • Tools and pipelines, how to install and use them.
  • More general documentation and how to contribute.
"},{"location":"accessing/","title":"Getting access to OSCAR","text":"

There are two ways of accessing OSCAR: through Huma-Num, or through HuggingFace. Depending on your status, you might not have a choice.

Research/Academic Individual Huma-Num Hugging-Face Huma-NumHuggingFace

You can apply for an access request by sending us an email!

Warning

Carefully respect the following instructions, as incorrect submissions might significantly delay your access.

Danger

Do not create an account by yourself, as it could delay your access by weeks! We will create an account for you.

Send us an email at contact at oscar-project.org, with OSCAR Access Request as the title, and the following (completed) as the body:

Warning

Please send your email using your institutional/academic address when possible. Otherwise, your access might be delayed/refused.

- First name:\n- Last name:\n- Affiliation:\n- Contact details:\n- Corpus version: \n- Languages:\n\n+ a short description of your usecase.\n

Note

Access requests can take some days to be answered, sometimes more.

We post updates on our Discord server on exceptional delays, and you can always contact us there to inquire about yours.

After some time, you should get an email back from us with access instructions!

"},{"location":"accessing/#using-datasets","title":"Using datasets","text":"

The following assumes that you have already installed the Python datasets library.

  1. Create an account on HuggingFace.
  2. Create a user access token.
  3. Open the OSCAR Team page.
  4. Open your corpus of choice. Instructions should be in the corpus page.

After all of this, you should be able to easily use OSCAR data with the datasets library:

# example with OSCAR 2201\nfrom datasets import load_dataset\ndataset = load_dataset(\"oscar-corpus/OSCAR-2201\",\nuse_auth_token=True, # required\nlanguage=\"ar\", \nstreaming=True, # optional\nsplit=\"train\") # optional\nfor d in dataset:\nprint(d) # prints documents\n
"},{"location":"accessing/#using-git-lfs","title":"Using Git LFS","text":"

You can also get the raw data from HuggingFace using Git LFS.

The following steps assume you have git and git-lfs installed, and are on a UNIX system. The procedure should roughly be the same on Windows, but hasn\u2019t been attempted.

This will download the Basque corpus from OSCAR 2109.

GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datasets/oscar-corpus/OSCAR-2109 cd OSCAR-2109 # go inside the directory\ngit lfs pull --include packaged/eu/eu.txt.gz # pull the required file(s) (here the Basque corpus). Check with the manpage for pull options\n
"},{"location":"quickstart/","title":"OSCAR Quickstart","text":""},{"location":"quickstart/#what-is-oscar","title":"What is OSCAR?","text":"

OSCAR is a collection of web-based multilingual corpora of several terabytes, containing subcorpora in more than 150 languages.

Each OSCAR Corpus has a version name that tells you its approximate generation time, which usually coincides with the source crawl time. The latest OSCAR Corpus is OSCAR 2301. We advise you to always use the latest version, as we incrementally include new features that enable new ways of filtering the corpus for your applications.

"},{"location":"quickstart/#basic-data-layout","title":"Basic data layout","text":"

OSCAR is, since OSCAR 2109, document-oriented, which means that subcorpora are comprised of documents rather than individual lines.

This has important implications as to how to preprocess the data:

You can (and will) find sentences in other languages than the one you're interested in. For example, it is expected to encounter English sentences in documents from the French subcorpus.

Example

The Wikipedia article about the French anthem, La Marseillaise, contains its lyrics in French. As such, this article is expected to be present in the English subcorpus with those French lyrics.

The good news is that you can easily remove those sentences if you are not interested in them, thanks to the metadata provided alongside the main content.

OSCAR is distributed in JSONLines files, usually compressed (gzip, zstd depending on the version).

Each line of a file is a JSON Object representing a single document. Here is an example from OSCAR 2301:

{\n\"content\":\"English sentence\\nphrase en fran\u00e7ais\\n????????????\", // (1)\n\"warc_headers\":{ // (2)\n\"warc-identified-content-language\":\"fra,eng\",\n\"warc-target-uri\":\"https://fr.wikipedia.org/wiki/...\",\n\"warc-record-id\":\"<urn:uuid:29eaa920-d299-4b1d-b687-c72bd8d68116>\",\n\"warc-type\":\"conversion\",\n\"content-length\":\"35298\", // (3)\n\"warc-refers-to\":\"<urn:uuid:39e42055-0d94-4e45-9c6c-9e7056635d64>\",\n\"warc-block-digest\":\"sha1:WFH2A5WHCS2H365GIAFYQPI7UOAMFGHB\", // (3)\n\"warc-date\":\"2022-11-26T09:45:47Z\",\n\"content-type\":\"text/plain\"\n},\n\"metadata\":{\n\"identification\":{ // (4)\n\"label\":\"fr\",\n\"prob\":0.8938327\n},\n\"harmful_pp\":4063.1814, // (5)\n\"tlsh\":\"tlsh:T125315FF2B6088901EEA097015DB39B4600B...\", // (6)\n\"quality_warnings\":[ // (7)\n\"short_sentences\",\n\"header\",\n\"footer\"\n],\n\"categories\":[ // (8)\n\"examen_pix\",\n\"liste_bu\"\n],\n\"sentence_identifications\":[ // (9)\n{\n\"label\":\"fr\",\n\"prob\":0.99837273\n},\n{\n\"label\":\"en\",\n\"prob\":0.9992377\n},\nnull\n]\n}\n}\n
  1. Newline-separated content.
  2. Headers from the crawled dumps are left untouched. See the WARC specification for more info.
  3. Since warc_headers are copied and content can be altered by Ungoliant at generation stage, content-length and warc-block-digest can be different from actual values.
  4. Document-level identification. Computation details can be found in the OSCAR 22.01 paper.
  5. Perplexity of the document, computed using a KenLM model trained on harmful content. See this pre-print for more info. The lower this number is, the higher the probability that it will contain harmful or adult content. This annotation will be changed from harmful_pp to harmful_ppl in future releases.
  6. Locality Sensitive Hash of the documents' content, using TLSH. Useful for both exact and near deduplication.
  7. (Corresponds to annotations pre-23.01) Potential quality warnings. Based on content/sentence length. See the OSCAR 22.01 paper for more info.
  8. Blocklist-based categories. Uses the UT1 Blocklist, plus custom additions. Please refer to the UT1 website for categories description. Note that the categories are in French.
  9. Sentence-level identifications. A null value means no identification with a good enough threshold (>0.8 on 23.01).
"},{"location":"quickstart/#getting-access","title":"Getting access","text":"

There are different ways of getting access to OSCAR depending on your status! Head on to our dedicated page.

"},{"location":"quickstart/#using-the-corpus","title":"Using the corpus","text":"

TODO

"},{"location":"schema/schema-v2/","title":"OSCAR Schema v2","text":"

OSCAR (Open Super-large Crawled Aggregated coRpus) is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.

The new OSCAR schema is a major, breaking change from the previous ones, adding more metadata and laying out documents rather than lines.

"},{"location":"schema/schema-v2/#changes","title":"Changes","text":"

OSCAR Schema v2 groups text and metadata in JSON dictionaries, in JSONLines format.

/\n\u251c\u2500\u2500 af\n\u2502   \u251c\u2500\u2500 af_sha256.txt\n\u2502   \u2514\u2500\u2500 af.jsonl.gz\n\u251c\u2500\u2500 de\n\u2502   \u251c\u2500\u2500 de_sha256.txt    # Checksum file \n\u2502   \u2514\u2500\u2500 de.jsonl.gz        # Textual content\n\u251c\u2500\u2500 en\n\u2502   \u251c\u2500\u2500 en_part_1.jsonl.gz        # Multipart example\n\u2502   \u251c\u2500\u2500 en_part_2.jsonl.gz\n\u2502   \u2514\u2500\u2500 en_sha256.txt\n\u251c\u2500\u2500 yi\n\u2502   \u251c\u2500\u2500 yi_sha256.txt\n\u2502   \u2514\u2500\u2500 yi.jsonl.gz\n\u2514\u2500\u2500 zh\n    \u251c\u2500\u2500 zh_sha256.txt\n    \u2514\u2500\u2500 zh.jsonl.gz\n
"},{"location":"schema/schema-v2/#file-formats","title":"File formats","text":""},{"location":"schema/schema-v2/#jsonl-files","title":".jsonl files","text":"

These are the metadata, in JSONLines format.

Each line follows the following JSON Schema:

{\n\"$schema\": \"http://json-schema.org/draft-07/schema#\",\n\"title\": \"Document\",\n\"description\": \"Serializable version of [Document].\",\n\"type\": \"object\",\n\"required\": [\n\"content\",\n\"metadata\",\n\"warc_headers\"\n],\n\"properties\": {\n\"content\": {\n\"type\": \"string\"\n},\n\"metadata\": {\n\"$ref\": \"#/definitions/Metadata\"\n},\n\"warc_headers\": {\n\"type\": \"object\",\n\"additionalProperties\": {\n\"type\": \"string\"\n}\n}\n},\n\"definitions\": {\n\"Identification\": {\n\"type\": \"object\",\n\"required\": [\n\"label\",\n\"prob\"\n],\n\"properties\": {\n\"label\": {\n\"$ref\": \"#/definitions/Lang\"\n},\n\"prob\": {\n\"type\": \"number\",\n\"format\": \"float\"\n}\n}\n},\n\"Lang\": {\n\"type\": \"string\",\n\"enum\": [\n\"Af\",\n\"Als\",\n\"...\",\n\"Yue\",\n\"Zh\",\n\"Multi\"\n]\n},\n\"Metadata\": {\n\"description\": \"OSCAR-specific metadata\",\n\"type\": \"object\",\n\"required\": [\n\"identification\",\n\"sentence_identifications\"\n],\n\"properties\": {\n\"annotation\": {\n\"type\": [\n\"array\",\n\"null\"\n],\n\"items\": {\n\"type\": \"string\"\n}\n},\n\"identification\": {\n\"$ref\": \"#/definitions/Identification\"\n},\n\"sentence_identifications\": {\n\"type\": \"array\",\n\"items\": {\n\"anyOf\": [\n{\n\"$ref\": \"#/definitions/Identification\"\n},\n{\n\"type\": \"null\"\n}\n]\n}\n}\n}\n}\n}\n}\n

Example:

{\n// text is here, separated by \\n characters that have been removed here for lisibility\n\"content\": \"Adopt-a-user Home \u2022 Talk || Adoptee's Area \u2022 Resources || Adopter's Area \u2022 Resources \u2022 List of Adopters || Teahouse || Live Help Chat (IRC)\\n\n  Shortcuts\\n\n  WP:AAU\\n\n  WP:ADOPT\\n\n  WP:ADOPTION\\n\n  WP:WIKIADOPT\\n\n  The Adopt-a-user program is designed to help new and inexperienced users by pairing them with more experienced Wikipedians. These editors (referred to as adopters or mentors) will \\\"adopt\\\" newer users, guiding them along the way as they learn about Wikipedia and its various aspects.\\n\n  The project aims to inform new users about the ins and outs of Wikipedia and steer them away from making less-than-constructive edits or misplaced test edits. Well over a thousand users have been involved in the program at one time or another.\\n\n  So, if you're new or inexperienced and would like to:\\nAsk questions about editing, contributing to Wikipedia and creating your first article\\n\n  Learn to navigate processes and policies and guidelines\\n\n  Get help with article creation or image uploads or any other activities on Wikipedia\\n\n  . . .then an adopter should be able to help you. Adoption lasts as long as the adopter and adoptee want to continue, so you can stop any time if you feel you've learned enough, or you'd like to take a break.\\n\n  If you are looking to contribute to Wikipedia but do not intend to remain as an active user well after adoption, then this program is not for you. 
Adoption is for users who intend to be long-term contributors and members of the community, so if you are simply here to create one article, see this page for help and do not request adoption.\\n\n  Users who don't want adopting \u2013 but who do need help with one-off problems \u2013 might like to consider whether the Teahouse question forum, the Help desk, or a {{Help me}} request might be better ways to get quick answers.\\n\n  Participation\\n\n  Being adopted is easy and fun. Why not select an adopter from the list of adopters and contact them directly to request adoption? If you choose an adopter who shares your interests, they will be more able to assist you while you learn under their tutelage.\\n\n  View the list of adopters!\\n\n  ...\",\n// WARC Headers are extracted and put there untouched.\n// The content-length shoud not be understood as the current document length, but as the original document length.\n\"warc_headers\": {\n\"warc-block-digest\": \"sha1:U2OJPXXE3JCPSLAB6UPB3TEGBDHKPTAO\",\n\"warc-record-id\": \"<urn:uuid:fec8808f-96ef-4ae5-8a57-df5b44e42dcf>\",\n\"warc-identified-content-language\": \"eng,nno\",\n\"content-type\": \"text/plain\",\n\"warc-refers-to\": \"<urn:uuid:2f59440d-3700-418c-aa94-5c63bab316c3>\",\n\"warc-date\": \"2021-09-16T12:40:45Z\",\n\"warc-target-uri\": \"https://en.wikipedia.org/wiki/Wikipedia:Adopt-a-user\",\n\"content-length\": \"5385\",\n\"warc-type\": \"conversion\"\n},\n// OSCAR metadata\n\"metadata\": {\n// Document identification\n\"identification\": {\n\"label\": \"en\",\n\"prob\": 0.6775619\n},\n// Annotations of the document\n\"annotation\": [\n\"short_sentences\",\n\"header\",\n\"footer\"\n],\n// Sentence identifications.\n// null: identification confidence too low (<0.8)\n// There is exactly one identification per line.\n\"sentence_identifications\": [\nnull,\nnull,\n{\n\"label\": \"en\",\n\"prob\": 0.89475197\n},\n{\n\"label\": \"en\",\n\"prob\": 0.9124037\n},\n{\n\"label\": \"en\",\n\"prob\": 
0.8080786\n},\nnull,\n{\n\"label\": \"en\",\n\"prob\": 0.9665413\n}, ]\n}\n}\n

"},{"location":"schema/schema-v2/#lang_sha256txt-files","title":"<lang>_sha256.txt files","text":"

These are used to check for eventual corruption during download. They can be used by running sha256sum -c <lang>_sha256.txt.

"},{"location":"tools/generation-jeanzay/","title":"Generating a new OSCAR Version on Jean Zay","text":""},{"location":"tools/generation-jeanzay/#compile-the-latest-version-of-unogliant","title":"Compile the latest version of Unogliant","text":"

This is currently preferred to just getting it from cargo install ungoliant.

  • git clone https://github.com/oscar-project/ungoliant
  • Open an interactive session on a compil node: srun --partition=compil -A <GROUP ID>@cpu --pty bash
  • Run module load llvm boost cargo (boost and llvm are necessary for compiling KenLM and FastText)
  • Run cd ungoliant
  • Run cargo b --release --features kenlm
"},{"location":"tools/generation-jeanzay/#download-the-data-from-commoncrawl","title":"Download the data from CommonCrawl","text":"

We advise the use of the prepost partition for downloading the data from Common Crawl. However, please bear in mind that jobs are limited to 20 hours in the prepost partition, meaning that you'll likely run out of time before completing the download of a whole Common Crawl dump.

  • Download the wet.paths.gz file for the latest release (likely here)
  • gzip -d wet.paths.gz

Create a dl_corpus.slurm file with the following text inside:

#! /bin/bash\n#SBATCH --partition=prepost\n#SBATCH --job-name=get_cc # create a short name for your job\n#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)\n#SBATCH --mail-user=<YOUR MAIL>    # Where to send mail\n#SBATCH --nodes=\"1\" #Combien de n\u0153uds\n#SBATCH --ntasks-per-node=\"1\" # Une t\u00e2che par GPU\n#SBATCH --cpus-per-task=\"64\" # nombre de coeurs \u00e0 r\u00e9server par t\u00e2che\n#SBATCH --time=\"20:00:00\" # temps d'ex\u00e9cution maximum demande (HH:MM:SS)\n#SBATCH -A <GROUP ID>@cpu\nexport CARGO_HOME=<CARGO HOME PATH (in SCRATCH if you can>\nexport PATHS_FILE=<PATH TO wet.PATHS>\nexport DST=<DESTINATION>\n\n./target/release/ungoliant download $PATHS_FILE $DST\n

When the time has run out, you have to ensure that the last downloaded shards weren't corrupted (because of a potential kill while downloading).

Then, after potentially removing faulty shards, run the following slurm job. The only difference with the previous one is the use of the -o n parameter on ungoliant download, which will ignore the first n lines of the wet.paths. You can/should also use another DESTINATION folder, and then do the merge by hand.

#! /bin/bash\n#SBATCH --partition=prepost\n#SBATCH --job-name=get_cc # create a short name for your job\n#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)\n#SBATCH --mail-user=<YOUR MAIL>    # Where to send mail\n#SBATCH --nodes=\"1\" #Combien de n\u0153uds\n#SBATCH --ntasks-per-node=\"1\" # Une t\u00e2che par GPU\n#SBATCH --cpus-per-task=\"64\" # nombre de coeurs \u00e0 r\u00e9server par t\u00e2che\n#SBATCH --time=\"20:00:00\" # temps d'ex\u00e9cution maximum demande (HH:MM:SS)\n#SBATCH -A <GROUP ID>@cpu\nexport CARGO_HOME=<CARGO HOME PATH (in SCRATCH if you can>\nexport PATHS_FILE=<PATH TO wet.PATHS>\nexport DST=<DESTINATION>\n\n./target/release/ungoliant download -o <NB_DOWNLOADED> $PATHS_FILE $DST\n

You can then check that no shards are missing:

import os\nshards_dir = \"./shards\"\npaths_file = \"wet.paths\"\ncc_rooturl = \"https://data.commoncrawl.org/\"\nmissing_shards = list()\nfor i in range(88000):\nif not os.path.isfile(f\"{shards_dir}/{i}.txt.gz\"):\nmissing_shards.append(i)\nprint(f\"missing {len(missing_shards)} shards\")\nwith open(paths_file) as f:\nshard_paths = f.readlines()\nfor missing_shard_number in missing_shards:\nprint(\nf\"wget -nc {cc_rooturl}{shard_paths[missing_shard_number].strip()} -O {missing_shard_number}.txt.gz\"\n)\n

This will give you the wget commands to get the missing shards, with a -nc param to avoid overwriting already existing files.

"},{"location":"tools/generation-jeanzay/#generate-oscar","title":"Generate OSCAR","text":"

When you have your shards ready, create a new SLURM file with:

We use a QoS of t4 because we can only use one node and corpus generation time is likely >20h, so we need the 100-hour time limit.

Other strategies could be tested (for example, splitting the CC data into 4 buckets and launching 4 ungoliant jobs). The resulting datasets would then have to be merged back together. Note that in that case, rebuild files will be less efficient (since we'll have 4 of them).

#! /bin/bash\n#SBATCH --partition=cpu_p1\n#SBATCH --job-name=gen_oscar # create a short name for your job\n#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)\n#SBATCH --mail-user=<YOUR MAIL>    # Where to send mail\n#SBATCH --nodes=\"1\" #Combien de n\u0153uds\n#SBATCH --ntasks-per-node=\"1\" # Une t\u00e2che par GPU\n#SBATCH --cpus-per-task=\"40\" # nombre de coeurs \u00e0 r\u00e9server par t\u00e2che\n#SBATCH --time=\"100:00:00\" # temps d'ex\u00e9cution maximum demande (HH:MM:SS)\n#SBATCH --qos=qos_cpu-t4\n#SBATCH -A <GROUP ID>@cpu\nexport CARGO_HOME=<CARGO HOME PATH>\nexport CC_FOLDER=<SHARDS PATH>\nexport KENLM_FOLDER=<PATH TO KENLMS MODELS IF APPLICABLE>\nexport CORPUS=<DESTINATION FOLDER>\nexport BLOCKLIST=<BLOCKLIST FOLDER (must contain subfolders with category names..)>\nexport LID_PATH=<PATH TO FASTTEXT LangID>\nexport UNGOLIANT_PATH=<PATH TO UNGOLIANT BINARY>\n\n$UNGOLIANT_PATH pipeline $CC_FOLDER $CORPUS --blocklist-path $BLOCKLIST --kenlms-path $KENLM_FOLDER --lid-path $LID_PATH\n

As of Jan. 2023, using ungoliant 1.3.0 ([c14acc8](https://github.com/oscar-project/ungoliant/tree/c14acc8c6a87913d138a022cf4819024d66b3e06)), with an 88,000-shard dump of CommonCrawl (November/December 2022, ~9.5TB compressed), this process took around 20 hours and yielded a corpus weighing around 12TB (uncompressed).

"},{"location":"tools/generation-jeanzay/#move-oscar","title":"Move OSCAR","text":"

Files in $SCRATCH are deleted after 30 days if no read/write operation is performed on them. You should move files out to $STORE if you plan on keeping them. Unfortunately, due to the file size, you'll need to launch another job to do the copying of the files.

Warning

rsync -n enables a dry-run, enabling you to see which files would be moved, and where. Remove the -n parameter when you want to perform the actual copy.

#! /bin/bash\n#SBATCH --partition=prepost\n#SBATCH --job-name=copy_oscar # create a short name for your job\n#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)\n#SBATCH --mail-user=julien.abadji@inria.fr    # Where to send mail\n#SBATCH --nodes=\"1\" #Combien de n\u0153uds\n#SBATCH --ntasks-per-node=\"1\" # Une t\u00e2che par GPU\n#SBATCH --cpus-per-task=\"4\" # nombre de coeurs \u00e0 r\u00e9server par t\u00e2che\n#SBATCH --time=\"20:00:00\" # temps d'ex\u00e9cution maximum demande (HH:MM:SS)\n#SBATCH -A <GROUP ID>@cpu\nexport SRC=<CORPUS SOURCE>\nexport DST=<CORPUS DESTINATION>\n\nrsync -anvP $SRC $DST\n

On the same example as before, copying took around 9 hours.

"},{"location":"tools/generation-jeanzay/#preparing-for-release","title":"Preparing for release","text":""},{"location":"tools/generation-jeanzay/#splitting","title":"Splitting","text":"

We use oscar-tools to split the corpus.

Note

At the time of writing, oscar-tools is not available via crates.io/cargo install, so you have to compile it from source. Luckily, it's easy.

Compiling oscar-tools
  1. Get the source: git clone https://github.com/oscar-project/oscar-tools
  2. Go inside a compil node: srun --partition=compil -A <GROUP ID>@cpu --pty bash
  3. cd oscar-tools
  4. CARGO_HOME=<Somewhere not in your ~, like $SCRATCH/.cargo> cargo b --features zstd --release.
  5. Wait ~some hours~
  6. That's it! Your binary sits at target/release/oscar-tools.
#! /bin/bash\n#SBATCH --partition=prepost\n#SBATCH --job-name=split_oscar # create a short name for your job\n#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)\n#SBATCH --mail-user=<Your email address>    # Where to send mail\n#SBATCH --nodes=\"1\" #Combien de n\u0153uds\n#SBATCH --ntasks-per-node=\"1\" # Une t\u00e2che par GPU\n#SBATCH --cpus-per-task=\"10\" # nombre de coeurs \u00e0 r\u00e9server par t\u00e2che\n#SBATCH --time=\"20:00:00\" # temps d'ex\u00e9cution maximum demande (HH:MM:SS)\n#SBATCH -A <group id>@cpu\nexport OSCAR_TOOLS_BIN=<path to oscar-tools binary>\nexport CORPUS=<path to corpus>\nexport DST=<where the split corpus will be put>\n\n$OSCAR_TOOLS_BIN v2 split $CORPUS $DST -s 2000\n

This step took around 3 hours (assuming both CORPUS and DST are on $SCRATCH).

"},{"location":"tools/generation-jeanzay/#compressing","title":"Compressing","text":"
#! /bin/bash\n#SBATCH --partition=prepost\n#SBATCH --job-name=compress_oscar # create a short name for your job\n#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)\n#SBATCH --mail-user=<email address>    # Where to send mail\n#SBATCH --nodes=\"1\" #Combien de n\u0153uds\n#SBATCH --ntasks-per-node=\"1\" # Une t\u00e2che par GPU\n#SBATCH --cpus-per-task=\"48\" # nombre de coeurs \u00e0 r\u00e9server par t\u00e2che\n#SBATCH --time=\"20:00:00\" # temps d'ex\u00e9cution maximum demande (HH:MM:SS)\n#SBATCH -A <group id>@cpu\nexport OSCAR_TOOLS_BIN=<link to oscar-tools binary>\nexport CORPUS=<path to split focus>\nexport DST=<where the compressed ocrpus will be saved>\n\n$OSCAR_TOOLS_BIN v2 compress $CORPUS $DST\n

This step took around 2 hours, going from 12TB to 3.3TB

"},{"location":"tools/generation-jeanzay/#checksuming","title":"Checksuming","text":"

The last step is to create checksum files for each language, so that people can check that their downloads have been successful. Also, it acts as a split list for download-oscar.

#! /bin/bash\n#SBATCH --partition=prepost\n#SBATCH --job-name=compress_oscar # create a short name for your job\n#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)\n#SBATCH --mail-user=<email address>    # Where to send mail\n#SBATCH --nodes=\"1\" #Combien de n\u0153uds\n#SBATCH --ntasks-per-node=\"1\" # Une t\u00e2che par GPU\n#SBATCH --cpus-per-task=\"48\" # nombre de coeurs \u00e0 r\u00e9server par t\u00e2che\n#SBATCH --time=\"20:00:00\" # temps d'ex\u00e9cution maximum demande (HH:MM:SS)\n#SBATCH -A <group id>@cpu\nexport OSCAR_TOOLS_BIN=<link to oscar-tools binary>\nexport CORPUS=<path to split focus>\n\n$OSCAR_TOOLS_BIN v2 checksum $CORPUS\n

The process took around 2 hours.

"},{"location":"tools/oscar-tools/","title":"oscar-tools","text":"

oscar-tools is a toolkit that was created along with OSCAR-2201 to make operations on the corpus easy and fast.

At its core, oscar-tools provides a set of operations targeted at a given OSCAR version. As such, you shouldn't expect to have all operations available on all OSCAR versions. For example, at the time of writing, deduplicate is not available for OSCAR 22.01-like corpora.

The CLI of oscar-tools is still a bit messy and can be confusing, because we are actively working on it and on implementing essential features.

"},{"location":"tools/oscar-tools/#installation","title":"Installation","text":""},{"location":"tools/oscar-tools/#from-releases","title":"From releases","text":"

Note

Binaries are not available yet.

"},{"location":"tools/oscar-tools/#from-cargo","title":"From cargo","text":"

Note

cargo install oscar-tools is not available yet.

"},{"location":"tools/oscar-tools/#from-repository","title":"From repository","text":"

Note

This could evolve rapidly.

Right now the latest version sits on the dev-oscario branch, where we're slowly replacing inline IO blocks by our Corpus IO library, oscar-io.

$> git clone https://github.com/oscar-corpus/oscar-tools #Clone the repository\n$> cd oscar-tools\n$> git checkout dev-oscario #Change branch\n$> cargo b --release #Build the project. \n$> # Building might take some time because of \n$> # the parquet dependency that will soon be optional.\n$> touch target/release/oscar-tools #Binary is here and self-sufficient.\n
"},{"location":"tools/oscar-tools/#usage","title":"Usage","text":"

oscar-tools --help might help you find the parameters/operations you're looking for.

Note

In the tool, v1 corresponds to 2019-like corpora, whereas v2 corresponds to 22.01-like corpora.

Each operation has different parameters.

"},{"location":"tools/oscar-tools/#v1-oscar-2019","title":"v1 / OSCAR 2019","text":"

At the time of writing, the only operation available is dedup. It uses runiq to deduplicate corpora.

oscar-tools-v1-dedup \nline deduplication\n\nUSAGE:\n    oscar-tools v1 dedup [ARGS]\n\nARGS:\n    <SOURCE>         Corpus source file.\n    <DESTINATION>    Corpus destination file. Should not exist.\n\nOPTIONS:\n    -h, --help    Print help information\n
"},{"location":"tools/oscar-tools/#v2-oscar-2201","title":"v2 / OSCAR 22.01","text":"

There are a lot more operations implemented on OSCAR 22.01-like corpora.

"},{"location":"tools/oscar-tools/#extract-tags","title":"extract-tags","text":"

extract-tags extracts documents that meet certain annotation constraints.

oscar-tools-v2-extract-tags \nExtracts a OSCAR v2 corpus restricting tags. Included tags must be present and excluded ones must be\nabsent. Use --clean to extract documents with no annotation only\n\nUSAGE:\n    oscar-tools v2 extract-tags [OPTIONS] [--] [ARGS]\n\nARGS:\n    <SOURCE>         Corpus source file/folder. If folder, splits corpus files in provided\n                     folder\n    <DESTINATION>    Corpus source file/folder. If folder, splits corpus files in provided\n                     folder\n\nOPTIONS:\n        --clean                only return documents with no tags. include and exclude will be\n                               ignored\n    -e, --exclude <tags>...    space separated tags to exclude.\n    -h, --help                 Print help information\n    -i, --include <tags>...    space separated tags to include.\n
"},{"location":"tools/oscar-tools/#extract-text","title":"extract-text","text":"

extract-text \"converts\" a 2201-like corpus into a 2019-like corpus, by removing all metadata and only storing sentences. Keep in mind that while the format will be similar to 2019-like corpora, the filtering is a bit different and lines from other languages won't be stripped.

Extract text from documents. The output will be a OSCAR v1 (2019)-compatible corpus.\n\nUSAGE:\n    oscar-tools v2 extract-text [OPTIONS] <SOURCE> <DESTINATION>\n\nARGS:\n    <SOURCE>         Corpus source file.\n    <DESTINATION>    Corpus destination file (OSCAR v1 (2019)-like)\n\nOPTIONS:\n        --del_src    If set, deletes source files as they are being extracted.\n    -h, --help       Print help information\n
"},{"location":"versions/mOSCAR/","title":"mOSCAR","text":"

mOSCAR is, to the best of our knowledge, the first large-scale multilingual and multimodal document corpus crawled from the web. It covers 163 languages, 315M documents, 214B tokens and 1.2B images. We carefully conduct a set of filtering and evaluation steps to make sure mOSCAR is sufficiently safe, diverse and of good quality.

"},{"location":"versions/mOSCAR/#access","title":"Access","text":"

Access to mOSCAR is granted via the Hugging Face Hub.

All data is available at https://huggingface.co/datasets/oscar-corpus/mOSCAR.

"},{"location":"versions/mOSCAR/#layout","title":"Layout","text":"

To Come ...

"},{"location":"versions/mOSCAR/#language-table","title":"Language table","text":"Lang. name Code Family Script # documents # images # tokens Acehnese ace_Latn Latin 7,803 32,461 2,889,134 Mesopotamian Arabic acm_Arab Arabic 2,274 10,620 1,047,748 Tunisian Arabic aeb_Arab Arabic 7,640 41,570 2,715,187 Afrikaans afr_Latn Latin 54,895 247,774 39,956,585 South Levantine Arabic ajp_Arab Arabic 12,098 87,837 5,167,813 Tosk Albanian als_Latn Latin 861,678 2,569,164 452,737,251 Amharic amh_Ethi Ge'ez 39,588 152,646 35,089,019 North Levantine Arabic apc_Arab Arabic 19,904 128,966 9,560,701 Modern Standard Arabic arb_Arab Arabic 3,936,851 15,126,931 3,401,919,964 Najdi Arabic ars_Arab Arabic 60,229 296,741 43,610,873 Moroccan Arabic ary_Arab Arabic 142,386 698,051 204,723,454 Egyptian Arabic arz_Arab Arabic 835,529 4,054,632 653,626,387 Assamese asm_Beng Bengali 3,948 9,210 640,390 Asturian ast_Latn Latin 165,745 962,723 37,547,944 Awadhi awa_Deva Devanagari 29,324 107,483 4,961,635 Central Aymara ayr_Latn Latin 27,384 151,889 5,148,970 South Azerbaijani azb_Arab Arabic 8,274 38,233 5,256,693 North Azerbaijani azj_Latn Latin 516,021 1,808,060 257,825,849 Bashkir bak_Cyrl Cyrillic 4,532 17,174 3,038,766 Bambara bam_Latn Latin 7,674 39,190 1,243,332 Balinese ban_Latn Latin 1,886 11,266 542,015 Belarusian bel_Cyrl Cyrillic 63,309 287,539 72,976,520 Bemba bem_Latn Latin 1,096 7,479 1,340,471 Bengali ben_Beng Bengali 270,406 947,035 35,858,814 Bhojpuri bho_Deva Devanagari 6,366 28,131 875,463 Banjar bjn_Latn Latin 5,427 27,803 1,898,526 Bosnian bos_Latn Latin 1,960,599 7,633,049 1,255,000,505 Buginese bug_Latn Latin 3,312 18,648 588,678 Bulgarian bul_Cyrl Cyrillic 2,591,998 11,670,028 1,760,971,620 Catalan cat_Latn Latin 1,153,864 4,736,634 606,447,390 Cebuano ceb_Latn Latin 16,990 91,234 10,748,818 Czech ces_Latn Latin 3,918,837 13,291,309 2,823,172,996 Central Kurdish ckb_Arab Arabic 36,725 136,566 22,322,689 Crimean Tatar crh_Latn Latin 6,376 24,124 1,742,727 Welsh cym_Latn 
Latin 40,408 165,897 27,748,345 Danish dan_Latn Latin 2,076,298 9,559,600 1,238,277,499 German deu_Latn Latin 20,662,696 87,976,200 8,544,986,218 Southwestern Dinka dik_Latn Latin 1,712 6,635 1,319,943 Greek ell_Grek Greek 4,916,081 15,209,058 2,923,201,041 English eng_Latn Latin 52,215,013 207,904,315 33,570,108,782 Esperanto epo_Latn Latin 25,157 124,996 28,586,195 Estonian est_Latn Latin 1,040,368 5,217,366 619,215,048 Basque eus_Latn Latin 849,043 3,445,539 277,145,498 Faroese fao_Latn Latin 15,411 60,340 6,691,327 Fijian fij_Latn Latin 1,528 8,776 487,388 Finnish fin_Latn Latin 2,396,033 10,365,333 1,781,044,864 French fra_Latn Latin 20,305,739 78,179,601 14,362,579,829 Friulian fur_Latn Latin 37,290 256,456 5,949,600 Nigerian Fulfulde fuv_Latn Latin 1,568 7,124 401,852 West Central Oromo gaz_Latn Latin 4,058 11,763 1,786,093 Scottish Gaelic gla_Latn Latin 29,710 153,249 14,605,090 Irish gle_Latn Latin 68,858 315,132 47,438,400 Galician glg_Latn Latin 518,973 2,381,475 217,063,180 Guarani grn_Latn Latin 490,945 2,416,633 89,921,114 Gujarati guj_Gujr Gujarati 23,062 91,320 3,324,866 Haitian Creole hat_Latn Latin 257,745 1,570,699 62,847,106 Hausa hau_Latn Latin 25,364 104,934 13,089,932 Hebrew heb_Hebr Hebrew 1,109,591 4,766,483 893,327,320 Hindi hin_Deva Devanagari 579,430 1,830,667 122,558,353 Chhattisgarhi hne_Deva Devanagari 1,581 7,263 273,174 Croatian hrv_Latn Latin 1,719,617 8,425,510 1,010,674,096 Hungarian hun_Latn Latin 3,534,506 15,390,083 2,831,715,050 Armenian hye_Armn Armenian 339,962 1,141,885 205,635,952 Igbo ibo_Latn Latin 11,529 68,049 8,701,070 Ilocano ilo_Latn Latin 78,872 523,195 8,116,113 Indonesian ind_Latn Latin 7,016,291 17,324,777 3,981,843,468 Icelandic isl_Latn Latin 244,676 1,027,465 137,015,973 Italian ita_Latn Latin 12,937,153 47,476,971 8,311,790,842 Javanese jav_Latn Latin 24,785 135,583 16,908,805 Japanese jpn_Jpan Kanji 14,415,292 23,893,768 8,923,348,944 Kabyle kab_Latn Latin 18,508 106,730 4,079,553 Kannada kan_Knda Brahmic 
Kannada 12,978 42,621 1,442,776 Kashmiri kas_Arab Arabic 3,109 11,408 5,731,910 Georgian kat_Geor Caucasian Georgian 354,436 1,304,281 275,223,026 Kazakh kaz_Cyrl Cyrillic 252,242 732,648 140,049,214 Halh Mongolian khk_Cyrl Cyrillic 124,412 508,217 84,535,241 Khmer khm_Khmr Austroasiatic 24,495 122,243 3,043,925 Kinyarwanda kin_Latn Latin 30,401 172,201 12,049,616 Kyrgyz kir_Cyrl Cyrillic 53,010 199,713 34,404,281 Northern Kurdish kmr_Latn Latin 39,262 164,666 23,834,960 Korean kor_Hang Hanja 2,614,089 13,563,283 2,006,080,705 Lao lao_Laoo 50,611 208,768 31,029,380 Ligurian lij_Latn Latin 8,751 56,266 2,958,179 Limburgish lim_Latn Latin 189,547 1,076,047 42,534,327 Lingala lin_Latn Latin 24,614 152,132 4,053,459 Lithuanian lit_Latn Latin 1,688,811 8,869,443 1,161,476,040 Lombard lmo_Latn Latin 30,506 151,855 9,058,614 Latgalian ltg_Latn Latin 11,948 61,624 4,148,492 Luxembourgish ltz_Latn Latin 44,987 246,346 16,676,872 Ganda lug_Latn Latin 1,878 7,215 789,917 Mizo lus_Latn Latin 7,880 26,817 4,978,472 Standard Latvian lvs_Latn Latin 896,243 4,141,648 587,653,855 Magahi mag_Deva Devanagari 1,097 3,847 205,763 Malayalam mal_Mlym 14,140 52,679 1,689,010 Marathi mar_Deva Devanagari 50,391 163,868 6,689,250 Minangkabau min_Latn Latin 9,341 35,309 1,256,931 Macedonian mkd_Cyrl Cyrillic 542,250 1,853,070 307,232,151 Maltese mlt_Latn Latin 120,888 709,242 36,097,957 Maori mri_Latn Latin 24,322 130,137 24,957,914 Burmese mya_Mymr 8,144 44,188 539,527 Dutch nld_Latn Latin 17,096,727 65,606,013 9,670,041,731 Norwegian Nynorsk nno_Latn Latin 199,355 1,012,313 67,799,774 Norwegian Bokmal nob_Latn Latin 2,229,702 9,698,128 1,294,178,095 Nepali npi_Deva Devanagari 31,239 127,193 3,138,539 Nyanja nya_Latn Latin 12,047 67,192 8,596,769 Occitan oci_Latn Latin 164,852 671,881 59,309,549 Odia ory_Orya 4,319 15,574 378,635 Pangasinan pag_Latn Latin 4,214 32,287 546,071 Eastern Panjabi pan_Guru 11,497 46,168 1,887,991 Papiamento pap_Latn Latin 55,224 363,015 10,002,655 Southern Pasto 
pbt_Arab Arabic 32,604 110,807 29,170,322 Western Persian pes_Arab Arabic 7,048,946 25,200,571 6,210,479,015 Plateau Malgasy plt_Latn Latin 32,521 120,673 29,263,848 Polish pol_Latn Latin 14,549,605 60,639,244 11,104,144,109 Portuguese por_Latn Latin 8,145,664 26,530,423 4,760,063,083 Dari prs_Arab Arabic 515,041 2,589,859 517,053,967 Ayacucho Quechua quy_Latn Latin 1,578 11,817 362,690 Romanian ron_Latn Latin 5,180,171 17,964,048 3,548,291,261 Rundi run_Latn Latin 20,001 67,096 8,686,054 Russian rus_Cyrl Cyrillic 15,913,845 69,542,828 18,909,213,208 Sango sag_Latn Latin 2,124 13,556 454,455 Sicilian scn_Latn Latin 73,199 424,362 27,110,743 Sinhala sin_Sinh 58,767 221,183 14,270,972 Slovak slk_Latn Latin 3,008,599 15,067,234 1,963,804,563 Slovenian slv_Latn Latin 1,472,025 7,210,285 935,834,754 Samoan smo_Latn Latin 12,346 71,359 14,954,824 Shona sna_Latn Latin 12,698 68,782 6,112,600 Sindhi snd_Arab Arabic 21,095 74,289 17,647,825 Somali som_Latn Latin 77,343 301,429 34,554,975 Southern Sotho sot_Latn Latin 7,718 43,146 6,156,450 Spanish spa_Latn Latin 22,713,366 78,361,087 14,616,773,475 Sardinian srd_Latn Latin 675,539 4,059,493 106,159,957 Serbian srp_Cyrl Cyrillic 604,557 2,286,171 401,223,741 Sundanese sun_Latn Latin 44,310 236,025 13,627,832 Swedish swe_Latn Latin 3,302,730 10,860,518 1,779,284,152 Swahili swh_Latn Latin 137,134 593,418 59,454,896 Silesian szl_Latn Latin 23,535 132,459 5,996,972 Tamil tam_Taml Dravidian Tamil 36,196 167,669 4,834,946 Tatar tat_Cyrl Cyrillic 37,188 143,842 22,831,350 Telugu tel_Telu Brahmic Telugu 22,974 81,033 2,273,772 Tajik tgk_Cyrl Cyrillic 125,236 417,591 90,503,778 Tagalog tgl_Latn Latin 151,437 673,814 97,708,639 Thai tha_Thai Thai 2,983,837 11,621,786 2,839,211,104 Tigrinya tir_Ethi Ge'ez 2,657 8,707 1,725,422 Tok Pisin tpi_Latn Latin 5,063 35,169 460,853 Turkmen tuk_Latn Latin 13,024 57,354 9,766,999 Turkish tur_Latn Latin 4,478,700 12,401,091 2,394,669,068 Twi twi_Latn Latin 3,305 13,634 495,220 Uyghur uig_Arab 
Arabic 10,713 41,709 6,785,318 Ukrainian ukr_Cyrl Cyrillic 2,721,424 10,929,796 1,928,351,595 Urdu urd_Arab Arabic 407,098 1,239,125 242,007,283 Northern Uzbek uzn_Latn Latin 156,632 798,155 89,022,562 Venetian vec_Latn Latin 330,611 1,830,777 71,077,531 Vietnamese vie_Latn Latin 12,621,521 47,411,488 11,616,191,199 Wolof wol_Latn Latin 4,658 20,380 1,596,432 Xhosa xho_Latn Latin 25,950 142,387 15,809,823 Eastern Yiddish ydd_Hebr 12,486 57,510 17,369,727 Yoruba yor_Latn Latin 56,700 286,933 32,614,558 Yue Chinese yue_Hant 33,671 203,513 24,172,441 Chinese (Simplified) zho_Hans Hanzi 9,861,262 36,152,754 8,078,842,701 Chinese (Traditional) zho_Hant Hant 3,967,966 16,307,258 2,962,854,441 Standard Malay zsm_Latn Latin 1,179,744 5,488,632 432,667,199 Zulu zul_Latn Latin 30,717 156,639 11,345,288"},{"location":"versions/oscar-2019/","title":"OSCAR 2019","text":"

OSCAR 2019 is the original 2019 release of the OSCAR corpus. It has been generated from the Common Crawl corpus using the goclassy architecture.

"},{"location":"versions/oscar-2019/#features","title":"Features","text":"

OSCAR 2019 is shuffled at line level and no metadata is provided. Thus it is mainly intended to be used in the training of unsupervised language models for NLP.

Data is distributed by language in both original and deduplicated form.

If you need the unshuffled version of OSCAR, please contact us using the contact form. Please include your name, affiliation, contact details, which languages you need and a brief description of how you intend to use OSCAR. You can also download it using HuggingFace\u2019s datasets library.

Even though OSCAR is not Postcardware, we do appreciate when our users send us a postcard. If you want to send us one, you can find the address in the contact section down below.

"},{"location":"versions/oscar-2019/#citing-oscar","title":"Citing OSCAR","text":"

If you use OSCAR to train a language model, text generation model or any other ML model in general please consider citing our latest paper:

@inproceedings{ortiz-suarez-etal-2020-monolingual,\n    title = \"A Monolingual Approach to Contextualized Word Embeddings for Mid-Resource Languages\",\n    author = \"Ortiz Su{\\'a}rez, Pedro Javier  and\n      Romary, Laurent  and\n      Sagot, Beno{\\^\\i}t\",\n    booktitle = \"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics\",\n    month = jul,\n    year = \"2020\",\n    address = \"Online\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://www.aclweb.org/anthology/2020.acl-main.156\",\n    pages = \"1703--1714\",\n    abstract = \"We use the multilingual OSCAR corpus, extracted from Common Crawl via language classification, filtering and cleaning, to train monolingual contextualized word embeddings (ELMo) for five mid-resource languages. We then compare the performance of OSCAR-based and Wikipedia-based ELMo embeddings for these languages on the part-of-speech tagging and parsing tasks. We show that, despite the noise in the Common-Crawl-based OSCAR data, embeddings trained on OSCAR perform much better than monolingual embeddings trained on Wikipedia. They actually equal or improve the current state of the art in tagging and parsing for all five languages. In particular, they also improve over multilingual Wikipedia-based contextual embeddings (multilingual BERT), which almost always constitutes the previous state of the art, thereby showing that the benefit of a larger, more diverse corpus surpasses the cross-lingual benefit of multilingual embedding architectures.\",\n}\n
"},{"location":"versions/oscar-2019/#the-unshuffled-oscar","title":"The Unshuffled OSCAR","text":"

If you need a copy of any of the unshuffled sub-corpora, please contact us using the contact form down below. Please include your name, affiliation, contact details, the languages you need, and a brief description of how you intend to use OSCAR. We will evaluate your request and answer accordingly.

{{% callout note %}} The unshuffled OSCAR is now available in HuggingFace\u2019s datasets library {{% /callout %}} They have obtained our permission to redistribute the unshuffled OSCAR and they allow users to download a corpus all at once as opposed to file by file. You can get more information about how to download OSCAR using their library by visiting OSCAR's dataset card.

"},{"location":"versions/oscar-2019/#downloading-oscar","title":"Downloading OSCAR","text":"

All the data is distributed by language, both the original and the deduplicated versions of the data are available. To download a file just click the desired link on the table below. Languages are split in shards of around 700MB, these shards are standalone. A plain text file with checksums is also provided.

The OSCAR corpus is yet to be filtered, so please be careful when using it, especially for text generation tasks! To see which sub-corpora have been audited, please refer to the list of publications above for more information.

You'll be asked to create a HumanID account in order to download a corpus. This is intended, and we do it in order to limit traffic and reduce abuse of the infrastructure. The OSCAR corpus is hosted by Huma-Num; you can read more about them on their website.

All sizes are for the uncompressed files.

Language Words original Size original File original Words deduplicated Size deduplicated File deduplicated Afrikaans 43,482,801 241M af 29,533,437 163M af Albanian 374,196,110 2.3G sq 186,856,699 1.2G sq Alemannic 841,750 5.0M als 459,001 2.8M als Amharic 28,301,601 360M am 16,086,628 206M am Arabic 8,117,162,828 82G ar 3,171,221,354 32G ar Aragonese 52,896 1.3M an 45,669 801K an Armenian 273,919,388 3.7G hy 110,196,043 1.5G hy Assamese 6,956,663 113M as 4,366,570 71M as Asturian 381,005 2.4M ast 325,237 2.0M ast Avaric 24,720 409K av 19,478 324K av Azerbaijani 322,641,710 2.8G az 167,742,296 1.5G az Bashkir 9,796,764 128M ba 6,922,589 90M ba Basque 120,456,652 848M eu 45,359,710 342M eu Bavarian 399 503 bar 399 503 bar Belarusian 144,579,630 1.8G be 83,499,037 1.1G be Bengali 623,575,733 11G bn 363,766,143 5.8G bn Bihari 8,848 110K bh 2,875 34K bh Bishnupriya 198,286 4.1M bpy 96,940 1.7M bpy Bosnian 106,448 447K bs 20,485 116K bs Breton 5,013,241 29M br 2,890,384 16M br Bulgarian 2,947,648,106 32G bg 1,268,114,977 14G bg Burmese 56,111,184 1.9G my 30,102,173 1.1G my Catalan 1,360,212,450 8.0G ca 729,333,440 4.3G ca Cebuano 6,603,567 39M ceb 3,675,024 24M ceb Central Bikol 312 885 bcl 312 885 bcl Central Khmer 20,690,610 1.1G km 10,082,245 581M km Central Kurdish 48,478,334 487M ckb 18,726,721 226M ckb Chavacano 130 520 cbk 130 520 cbk Chechen 711,051 8.3M ce 568,146 6.7M ce Chinese 14,986,424,850 508G zh 6,350,215,113 249G zh Chuvash 3,041,614 39M cv 2,054,810 26M cv Cornish 8,329 44K kw 2,704 14K kw Croatian 34,232,765 226M hr 16,727,640 110M hr Czech 7,715,977,441 53G cs 3,540,997,509 24G cs Danish 2,637,463,889 16G da 1,620,091,317 9.5G da Dhivehi 7,559,472 126M dv 4,726,660 79M dv Dimli 19 146 diq 19 146 diq Dutch 13,020,136,373 78G nl 6,598,786,137 39G nl Eastern Mari 565,992 7.2M mhr 469,297 6.0M mhr Egyptian Arabic 7,305,151 66M arz 3,659,419 33M arz Emilian-Romagnol 6,376 25K eml 6,121 24K eml English 418,187,793,408 2.3T en 215,841,256,971 1.2T en Erzya 
90 1.4K myv 78 1.2K myv Esperanto 48,486,161 299M eo 37,324,446 228M eo Estonian 643,163,730 4.8G et 309,931,463 2.3G et Finnish 3,196,666,419 27G fi 1,597,855,468 13G fi French 46,896,036,417 282G fr 23,206,776,649 138G fr Galician 102,011,291 620M gl 63,600,602 384M gl Georgian 171,950,621 3.6G ka 91,569,739 1.9G ka German 44,878,908,446 308G de 21,529,164,172 145G de Goan Konkani 124,277 2.2M gom 102,306 1.8M gom Guarani 7,382 36K gn 4,680 24K gn Gujarati 72,045,701 1.1G gu 50,023,432 722M gu Haitian 1,014 3.9K ht 832 3.3K ht Hebrew 2,067,753,528 20G he 1,032,018,056 9.8G he Hindi 1,372,234,782 17G hi 745,774,934 8.9G hi Hungarian 5,163,936,345 40G hu 2,339,127,555 18G hu Icelandic 219,900,094 1.5G is 129,818,331 846M is Ido 25,702 147K io 22,773 130K io Iloko 142,942 874K ilo 105,564 636K ilo Indonesian 4,574,692,265 30G id 2,394,957,629 16G id Interlingua 180,231 662K ia 100,019 360K ia Interlingue 5,352 24K ie 602 1.6K ie Irish 14,483,593 88M ga 10,017,303 60M ga Italian 22,248,707,341 137G it 11,250,012,896 69G it Japanese 4,962,979,182 216G ja 1,123,067,063 106G ja Javanese 104,896 659K jv 86,654 583K jv Kalmyk 10,277 113K xal 10,155 112K xal Kannada 81,186,863 1.7G kn 49,343,462 1.1G kn Karachay-Balkar 185,436 2.6M krc 166,496 2.3M krc Kazakh 191,126,469 2.7G kk 108,388,743 1.5G kk Kirghiz 44,194,823 600M ky 28,982,620 388M ky Komi 201,404 2.3M kv 95,243 1.2M kv Korean 2,368,765,142 24G ko 1,120,375,149 12G ko Kurdish 15,561,003 94M ku 9,946,440 60M ku Lao 4,133,311 174M lo 2,583,342 114M lo Latin 4,122,201 26M la 1,328,038 8.3M la Latvian 520,761,977 4.0G lv 236,428,905 1.8G lv Lezghian 247,646 3.3M lez 224,871 3.0M lez Limburgan 4,730 29K li 4,283 27K li Lithuanian 1,159,661,742 8.8G lt 516,183,525 3.9G lt Lojban 154,330 736K jbo 141,973 678K jbo Lombard 75,229 443K lmo 73,665 433K lmo Low German 2,906,347 18M nds 2,146,417 13M nds Lower Sorbian 1,787 13K dsb 966 7.1K dsb Luxembourgish 4,403,577 29M lb 3,087,650 21M lb Macedonian 189,289,873 2.1G mk 
102,849,595 1.2G mk Maithili 69,161 317K mai 874 11K mai Malagasy 3,068,360 21M mg 1,872,044 13M mg Malay 16,696,882 111M ms 6,045,753 42M ms Malayalam 189,534,472 4.9G ml 95,892,551 2.5G ml Maltese 2,995,654 24M mt 2,163,358 17M mt Marathi 162,609,404 2.7G mr 82,130,803 1.4G mr Mazanderani 73,870 691K mzn 64,481 602K mzn Minangkabau 5,682 608K min 4,825 310K min Mingrelian 299,098 5.8M xmf 228,629 4.4M xmf Mirandese 171 1.2K mwl 152 1.1K mwl Modern Greek 5,479,180,137 62G el 2,412,419,435 27G el Mongolian 181,307,167 2.2G mn 68,362,013 838M mn Nahuatl languages 1,234 12K nah 1,193 11K nah Neapolitan 5,282 17K nap 4,147 13K nap Nepali 107,448,208 1.8G ne 71,628,317 1.2G ne Newari 564,697 5.5M new 288,995 4.1M new Northern Frisian 1,516 4.4K frr 1,516 4.4K frr Northern Luri 8,022 76K lrc 6,740 63K lrc Norwegian 1,344,326,388 8.0G no 804,894,377 4.7G no Norwegian Nynorsk 14,764,980 85M nn 9,435,139 54M nn Occitan 750,301 5.8M oc 512,678 3.7M oc Oriya 14,938,567 248M or 11,321,740 188M or Ossetian 1,031,268 13M os 878,765 11M os Pampanga 130 760 pam 52 304 pam Panjabi 61,847,806 763M pa 37,555,835 460M pa Persian 9,096,554,121 79G fa 4,363,505,319 38G fa Piemontese 362,013 2.1M pms 337,246 1.9M pms Polish 15,277,255,137 109G pl 6,708,709,674 47G pl Portuguese 20,641,903,898 124G pt 10,751,156,918 64G pt Pushto 46,559,441 361M ps 31,347,348 242M ps Quechua 10,186 78K qu 8,691 67K qu Romanian 3,984,317,058 25G ro 1,741,794,069 11G ro Romansh 1,093 7.4K rm 960 6.5K rm Russia Buriat 963 13K bxr 809 11K bxr Russian 92,522,407,837 1.2T ru 46,692,691,520 568G ru Sanskrit 4,331,569 93M sa 1,713,930 37M sa Scottish Gaelic 310,689 1.9M gd 207,110 1.3M gd Serbian 364,395,411 3.9G sr 207,561,168 2.2G sr Serbo-Croatian 5,292,184 25M sh 1,040,573 5.8M sh Sicilian 554 3.3K scn 468 2.8K scn Sindhi 43,530,158 347M sd 33,028,015 263M sd Sinhala 93,053,465 1.4G si 50,864,857 802M si Slovak 1,322,247,763 9.1G sk 656,346,179 4.5G sk Slovenian 387,399,700 2.5G sl 193,926,684 1.3G sl Somali 
1,202 61K so 472 16K so South Azerbaijani 2,175,054 27M azb 1,528,709 19M azb Spanish 47,545,122,279 278G es 25,928,290,729 149G es Sundanese 30,321 211K su 20,278 141K su Swahili 2,211,927 13M sw 1,376,963 8.1M sw Swedish 7,155,994,312 44G sv 4,106,120,608 25G sv Tagalog 98,949,299 573M tl 70,121,601 407M tl Tajik 31,758,142 379M tg 21,029,893 249M tg Tamil 420,537,132 9.3G ta 226,013,330 5.1G ta Tatar 51,034,893 670M tt 23,825,695 305M tt Telugu 123,711,517 2.5G te 79,094,167 1.6G te Thai 951,743,087 36G th 368,965,202 16G th Tibetan 1,483,589 187M bo 936,556 138M bo Turkish 7,577,388,700 60G tr 3,365,734,289 27G tr Turkmen 1,113,869 11M tk 752,326 6.8M tk Tuvinian 759 12K tyv 540 7.9K tyv Uighur 8,657,141 122M ug 5,852,225 83M ug Ukrainian 4,204,381,276 53G uk 2,252,380,351 28G uk Upper Sorbian 545,351 4.2M hsb 236,867 1.8M hsb Urdu 331,817,982 2.7G ur 218,030,228 1.7G ur Uzbek 2,450,256 21M uz 1,381,644 12M uz Venetian 3,492 18K vec 3,199 17K vec Vietnamese 12,036,845,359 68G vi 5,577,159,843 32G vi Volap\u00fck 321,121 2.0M vo 318,568 2.0M vo Walloon 50,720 273K wa 37,543 203K wa Waray 397,315 2.5M war 336,311 2.2M war Welsh 37,422,441 213M cy 23,574,673 133M cy Western Frisian 5,691,077 35M fy 4,223,816 26M fy Western Mari 93,338 1.2M mrj 87,780 1.1M mrj Western Panjabi 1,426,986 12M pnb 1,111,112 9.0M pnb Wu Chinese 11,189 109K wuu 4,333 32K wuu Yakut 2,547,623 42M sah 1,789,174 26M sah Yiddish 13,834,320 141M yi 8,212,970 84M yi Yoruba 8,906 55K yo 3,518 27K yo Yue Chinese 186 3.7K yue 128 2.2K yue"},{"location":"versions/oscar-2019/#license","title":"License","text":"

These data are released under this licensing scheme:

  • We do not own any of the text from which these data have been extracted.
  • We license the actual packaging of these data under the Creative Commons CC0 license (\"no rights reserved\").
  • To the extent possible under law, Inria has waived all copyright and related or neighboring rights to OSCAR.
  • This work is published from: France.

"},{"location":"versions/oscar-2019/#notice-and-take-down-policy","title":"Notice and take down policy","text":"

Notice: Should you consider that our data contains material that is owned by you and should therefore not be reproduced here, please:

  • Clearly identify yourself, with detailed contact data such as an address, telephone number or email address at which you can be contacted.
  • Clearly identify the copyrighted work claimed to be infringed.
  • Clearly identify the material that is claimed to be infringing and information reasonably sufficient to allow us to locate the material.
  • And use the contact form below.

Take down: We will comply with legitimate requests by removing the affected sources from the next release of the corpus.

"},{"location":"versions/oscar-2019/#models","title":"Models","text":"

Here is a list of some language models that have been trained using the OSCAR corpus or that are part of the OSCAR project:

Model Language Corpus Authors Paper Files License ELMo Bulgarian OSCAR Pedro J. Ortiz, Beno\u00eet Sagot and Laurent Romary ACL 2020 bg.zip MIT ELMo Bulgarian Wikipedia Pedro J. Ortiz, Beno\u00eet Sagot and Laurent Romary ACL 2020 bg.zip MIT ELMo Catalan OSCAR Pedro J. Ortiz, Beno\u00eet Sagot and Laurent Romary ACL 2020 ca.zip MIT ELMo Catalan Wikipedia Pedro J. Ortiz, Beno\u00eet Sagot and Laurent Romary ACL 2020 ca.zip MIT ELMo Danish OSCAR Pedro J. Ortiz, Beno\u00eet Sagot and Laurent Romary ACL 2020 da.zip MIT ELMo Danish Wikipedia Pedro J. Ortiz, Beno\u00eet Sagot and Laurent Romary ACL 2020 da.zip MIT ELMo French OSCAR Pedro J. Ortiz, Yoann Dupont, Benjamin Muller, Laurent Romary and Beno\u00eet Sagot LREC 2020 fr.zip MIT ELMo Finnish OSCAR Pedro J. Ortiz, Beno\u00eet Sagot and Laurent Romary ACL 2020 fi.zip MIT ELMo Finnish Wikipedia Pedro J. Ortiz, Beno\u00eet Sagot and Laurent Romary ACL 2020 fi.zip MIT ELMo Indonesian OSCAR Pedro J. Ortiz, Beno\u00eet Sagot and Laurent Romary ACL 2020 id.zip MIT ELMo Indonesian Wikipedia Pedro J. Ortiz, Beno\u00eet Sagot and Laurent Romary ACL 2020 id.zip MIT"},{"location":"versions/oscar-2019/#featured-models","title":"Featured Models","text":"

Here is a list of Language models trained by the community:

Model Language Cased Corpus Authors Paper Website Files License AraBERT Arabic Cased OSCAR, Wikipedia, 1.5B words Arabic Corpus, OSIAN, Assafir Wissam Antoun, Fady Baly and Hazem Hajj ACL Anthology GitHub Hugging Face N/A Arabic-BERT Arabic Cased OSCAR and Wikipedia Ali Safaya ArXiv GitHub Hugging Face MIT AraELECTRA Arabic Cased OSCAR, Wikipedia, 1.5B words Arabic Corpus, OSIAN, Assafir Wissam Antoun, Fady Baly and Hazem Hajj ArXiV GitHub Hugging Face N/A AraGPT2 Arabic Cased OSCAR, Wikipedia, 1.5B words Arabic Corpus, OSIAN, Assafir Wissam Antoun, Fady Baly and Hazem Hajj ArXiv GitHub Hugging Face N/A CamemBERT French Cased OSCAR Louis Martin, Benjamin Muller, Pedro Javier Ortiz Su\u00e1rez, Yoann Dupont, Laurent Romary, \u00c9ric Villemonte de la Clergerie, Djam\u00e9 Seddah and Beno\u00eet Sagot ACL 2020 camembert-model.fr camembert-base.tar.gz MIT CamemBERT French Cased Subsample of OSCAR (4 GB of text) Louis Martin, Benjamin Muller, Pedro Javier Ortiz Su\u00e1rez, Yoann Dupont, Laurent Romary, \u00c9ric Villemonte de la Clergerie, Djam\u00e9 Seddah and Beno\u00eet Sagot ACL 2020 camembert-model.fr camembert-base-oscar-4gb.tar.gz MIT LePetit French Cased Subsample of OSCAR (2 GB of text) Vincent Micheli, Martin d'Hoffschmidt, Quentin Heinrich Medium blog illuin.tech Hugging Face MIT GigaBERT Arabic Cased and Uncased OSCAR, Wikipedia, Gigaword Wuwei Lan, Yang Chen, Wei Xu, Alan Ritter EMNLP 2020 GitHub Hugging Face MIT ELECTRA Norwegian Cased OSCAR and OPUS Viktor Alm N/A Hugging Face Hugging Face N/A BERT Romanian Cased OSCAR, Wikipedia and OPUS Dumitrescu Stefan and Andrei Avram SOON GitHub Hugging Face MIT BERT Romanian Uncased OSCAR, Wikipedia and OPUS Dumitrescu Stefan and Andrei Avram SOON GitHub Hugging Face MIT RoBERTa Sinhala N/A OSCAR Keshan Sodimana N/A Hugging Face Hugging Face N/A BERT Turkish Cased and Uncased OSCAR, Wikipedia and OPUS Stefan Schweter Zenodo GitHub Hugging Face MIT ELECTRA Turkish Cased OSCAR, Wikipedia and OPUS Stefan Schweter 
Zenodo GitHub Hugging Face MIT XLMIndic Hindi, Bengali, Gujarati, Panjabi, Marathi, Oriya, Assamese, Sinhala, Nepali, Bihari, Bishnupriya, Maithili, Goan Konkani, Sanskrit Cased OSCAR Ibraheem Muhammad Moosa, Mahmud Shimul and Ashfia Binte Habib Arxiv GitHub Hugging Face MIT

If you have trained a model using the OSCAR corpus and would like to have it featured here, please open a pull request in our GitHub repo. Help us grow the community!

"},{"location":"versions/oscar-2109/","title":"OSCAR 21.09","text":""},{"location":"versions/oscar-2109/#features","title":"Features","text":"

These are the versions of tooling, schemes and data

  • CommonCrawl version: February/March 2021 (2021.10)
  • OSCAR Schema version: v1.1 : Incorporates metadata in a backward compatible manner.
  • Ungoliant version: v1 : New generation tool, faster and better documented/tested than the previous one: goclassy.
"},{"location":"versions/oscar-2109/#changes","title":"Changes","text":"
  • As per OSCAR Schema v1.1, each document/record has associated metadata.
  • New languages: Manx, Rusyn, Scots and West Flemish. Their size and quality still have to be assessed.
  • Removed languages: Central Bikol and Cantonese. Cantonese was of very low quality. The Central Bikol corpus is still available in OSCAR 2019.
"},{"location":"versions/oscar-2109/#table","title":"Table","text":"Language OSCAR 2019 OSCAR 2019 deduplicated OSCAR 21.09 OSCAR 21.09 deduplicated Issues af Afrikaans 251MB 170MB 258MB 157MB sq Albanian 2GB 1GB 3GB 1GB am Amharic 377MB 215MB 405MB 241MB ar Arabic 87GB 33GB 69GB 35GB an Aragonese 1MB 822KB 1MB 608KB hy Armenian 3GB 1GB 4GB 1GB as Assamese 117MB 73MB 135MB 95MB ast Asturian 2MB 2MB 7MB 4MB av Avaric 418KB 331KB 421KB 325KB az Azerbaijani 2GB 1GB 3GB 1GB bn Bangla 10GB 6GB 14GB 7GB ba Bashkir 133MB 93MB 110MB 77MB eu Basque 889MB 358MB 900MB 503MB bar Bavarian 507B 507B 2KB 1KB be Belarusian 1GB 1GB 2GB 1GB bh Bihari languages 112KB 34KB 579KB 120KB bpy Bishnupriya 4MB 1MB 11MB 4MB bs Bosnian 459KB 120KB 310KB 175KB br Breton 29MB 16MB 49MB 23MB bg Bulgarian 33GB 14GB 34GB 15GB my Burmese 2GB 1GB 2GB 1GB yue Cantonese 3KB 2KB - - ca Catalan 8GB 4GB 13GB 6GB ceb Cebuano 40MB 24MB 81MB 58MB bcl Central Bikol 886B 886B - - ckb Central Kurdish 509MB 236MB 784MB 367MB cbk Chavacano 521B 521B 168B 168B {{< issue cbk >}} ce Chechen 8MB 6MB 29MB 20MB zh Chinese 544GB 267GB 500GB 266GB cv Chuvash 40MB 27MB 60MB 41MB kw Cornish 44KB 14KB 119KB 72KB hr Croatian 237MB 115MB 361MB 169MB cs Czech 56GB 25GB 72GB 33GB da Danish 16GB 10GB 18GB 10GB diq Dimli (individual language) 147B 147B 294B 147B dv Divehi 131MB 81MB 143MB 111MB nl Dutch 82GB 41GB 97GB 47GB mhr Eastern Mari 7MB 6MB 15MB 10MB arz Egyptian Arabic 68MB 34MB 48MB 21MB en English 2520GB 1294GB 2936GB 1342GB myv Erzya 1KB 1KB 29KB 2KB eo Esperanto 312MB 238MB 560MB 390MB et Estonian 5GB 2GB 7GB 3GB tl Filipino 601MB 426MB 699MB 383MB fi Finnish 28GB 13GB 35GB 20GB fr French 302GB 147GB 340GB 161GB gl Galician 650MB 402MB 989MB 549MB ka Georgian 3GB 1GB 6GB 2GB de German 330GB 155GB 433GB 184GB gom Goan Konkani 2MB 1MB 3MB 2MB el Greek 66GB 28GB 72GB 30GB gn Guarani 36KB 23KB 32KB 25KB gu Gujarati 1GB 756MB 1GB 950MB ht Haitian Creole 3KB 3KB 2KB 1KB he Hebrew 21GB 10GB 29GB 11GB hi Hindi 17GB 9GB 26GB 
13GB hu Hungarian 42GB 18GB 60GB 29GB is Icelandic 1GB 887MB 2GB 1GB io Ido 151KB 133KB 276KB 221KB ilo Iloko 896KB 653KB 1MB 857KB id Indonesian 32GB 16GB 40GB 22GB ia Interlingua 678KB 368KB 291KB 172KB ie Interlingue 24KB 1KB 7KB 2KB ga Irish 91MB 62MB 131MB 69MB it Italian 146GB 73GB 192GB 94GB ja Japanese 231GB 112GB 208GB 96GB jv Javanese 675KB 598KB 858KB 728KB xal Kalmyk 115KB 114KB 62KB 62KB kn Kannada 1GB 1GB 2GB 1GB krc Karachay-Balkar 2MB 2MB 2MB 2MB kk Kazakh 2GB 1GB 3GB 1GB km Khmer 1GB 608MB 1GB 860MB kv Komi 2MB 1MB 1MB 588KB ko Korean 25GB 11GB 35GB 15GB ku Kurdish 98MB 62MB 152MB 108MB ky Kyrgyz 629MB 406MB 485MB 334MB lo Lao 181MB 118MB 287MB 163MB la Latin 26MB 8MB 103MB 9MB lv Latvian 4GB 1GB 6GB 2GB lez Lezghian 3MB 3MB 2MB 2MB li Limburgish 29KB 27KB 76KB 54KB lt Lithuanian 9GB 4GB 12GB 5GB jbo Lojban 753KB 694KB 929KB 731KB lmo Lombard 454KB 444KB 1MB 1MB nds Low German 18MB 13MB 25MB 17MB dsb Lower Sorbian 13KB 7KB 31KB 14KB lb Luxembourgish 30MB 21MB 54MB 37MB mk Macedonian 2GB 1GB 3GB 1GB mai Maithili 324KB 10KB 685KB 24KB mg Malagasy 21MB 13MB 59MB 38MB ms Malay 116MB 43MB 146MB 60MB ml Malayalam 5GB 2GB 4GB 2GB mt Maltese 24MB 17MB 51MB 26MB gv Manx - - 1KB 907B mr Marathi 2GB 1GB 3GB 1GB mzn Mazanderani 708KB 617KB 1MB 1MB min Minangkabau 622KB 317KB 8MB 1MB xmf Mingrelian 6MB 4MB 16MB 10MB mwl Mirandese 1KB 1KB 3KB 2KB mn Mongolian 2GB 879MB 1GB 912MB nah Nahuatl languages 11KB 10KB 34KB 21KB nap Neapolitan 17KB 13KB 1KB 1KB {{< issue nap >}} ne Nepali 1GB 1GB 3GB 2GB new Newari 5MB 4MB 6MB 4MB frr Northern Frisian 4KB 4KB 7KB 5KB {{< issue frr >}} lrc Northern Luri 77KB 64KB 183B 183B no Norwegian Bokm\u00e5l 8GB 5GB 9GB 4GB nn Norwegian Nynorsk 88MB 56MB 123MB 66MB oc Occitan 6MB 3MB 12MB 5MB or Odia 259MB 196MB 538MB 357MB os Ossetic 12MB 10MB 11MB 6MB pam Pampanga 763B 307B 3KB 3KB ps Pashto 378MB 253MB 404MB 286MB fa Persian 84GB 39GB 79GB 35GB pms Piedmontese 2MB 1MB 4MB 3MB pl Polish 116GB 50GB 122GB 48GB pt Portuguese 132GB 
67GB 159GB 71GB pa Punjabi 799MB 481MB 769MB 430MB qu Quechua 80KB 68KB 322KB 230KB ro Romanian 26GB 11GB 37GB 15GB rm Romansh 7KB 6KB 3KB 3KB bxr Russia Buriat 12KB 10KB 22KB 18KB ru Russian 1239GB 609GB 1201GB 542GB rue Rusyn - - 247B 247B sah Sakha 43MB 27MB 57MB 39MB sa Sanskrit 96MB 38MB 72MB 43MB sco Scots - - 1KB 1KB {{< issue sco >}} gd Scottish Gaelic 1MB 1MB 2MB 1MB sr Serbian 4GB 2GB 6GB 3GB sh Serbian (Latin) 25MB 6MB 13MB 9MB scn Sicilian 3KB 2KB 4KB 3KB sd Sindhi 363MB 274MB 75MB 50MB si Sinhala 1GB 840MB 1GB 791MB sk Slovak 9GB 4GB 14GB 6GB sl Slovenian 2GB 1GB 4GB 1GB so Somali 62KB 15KB 15KB 13KB {{< issue so >}} azb South Azerbaijani 28MB 19MB 47MB 29MB es Spanish 297GB 159GB 342GB 160GB su Sundanese 216KB 145KB 397KB 274KB sw Swahili 13MB 8MB 11MB 7MB sv Swedish 46GB 26GB 43GB 19GB tg Tajik 396MB 260MB 985MB 321MB {{< issue tg >}} ta Tamil 9GB 5GB 10GB 5GB tt Tatar 701MB 319MB 947MB 424MB te Telugu 2GB 1GB 3GB 1GB th Thai 38GB 17GB 62GB 26GB bo Tibetan 195MB 144MB 439MB 358MB gsw[^1] Alemannic German 5MB 2MB 7MB 5MB tr Turkish 63GB 28GB 73GB 33GB {{< issue tr >}} tk Turkmen 10MB 7MB 25MB 20MB tyv Tuvinian 11KB 8KB 9KB 7KB uk Ukrainian 56GB 29GB 53GB 28GB eml Emiliano-Romagnolo[^2] 25KB 23KB 22KB 20KB hsb Upper Sorbian 4MB 1MB 2MB 1MB ur Urdu 2GB 1GB 2GB 1GB ug Uyghur 127MB 86MB 187MB 123MB uz Uzbek 21MB 11MB 56MB 28MB vec Venetian 18KB 16KB 37KB 28KB vi Vietnamese 72GB 33GB 87GB 42GB vo Volap\u00fck 2MB 2MB 2MB 2MB wa Walloon 280KB 207KB 511KB 329KB war Waray 2MB 2MB 4MB 4MB cy Welsh 223MB 139MB 307MB 180MB vls West Flemish - - 134B 134B {{< issue vls >}} fy Western Frisian 35MB 26MB 82MB 57MB mrj Western Mari 1MB 1MB 645KB 521KB pnb Western Panjabi 11MB 9MB 68MB 45MB wuu Wu Chinese 111KB 32KB 145KB 69KB {{< issue wuu >}} yi Yiddish 146MB 87MB 199MB 93MB yo Yoruba 56KB 26KB 229KB 120KB"},{"location":"versions/oscar-2109/#oscar-schema-v110","title":"OSCAR Schema v1.1.0","text":"

The new OSCAR schema incorporates backward-compatible changes.

"},{"location":"versions/oscar-2109/#changes_1","title":"Changes","text":"

The old OSCAR Schema v1.0 featured the following file hierarchy, in an uncompressed form:

/\n\u251c\u2500\u2500 af\n\u2502   \u251c\u2500\u2500 af_sha256.txt\n\u2502   \u2514\u2500\u2500 af.txt.gz\n\u251c\u2500\u2500 de\n\u2502   \u251c\u2500\u2500 de_sha256.txt    # Checksum file \n\u2502   \u2514\u2500\u2500 de.txt.gz        # Textual content\n\u251c\u2500\u2500 en\n\u2502   \u251c\u2500\u2500 en_part_1.txt.gz        # Multipart example\n\u2502   \u251c\u2500\u2500 en_part_2.txt.gz\n\u2502   \u2514\u2500\u2500 en_sha256.txt\n\u251c\u2500\u2500 yi\n\u2502   \u251c\u2500\u2500 yi_sha256.txt\n\u2502   \u2514\u2500\u2500 yi.txt.gz\n\u2514\u2500\u2500 zh\n    \u251c\u2500\u2500 zh_sha256.txt\n    \u2514\u2500\u2500 zh.txt.gz\n

The new OSCAR Schema v1.1 features the following file hierarchy (some languages omitted):

/\n\u251c\u2500\u2500 af\n\u2502   \u251c\u2500\u2500 af_meta.jsonl.gz\n\u2502   \u251c\u2500\u2500 af_sha256.txt\n\u2502   \u2514\u2500\u2500 af.txt.gz\n\u251c\u2500\u2500 de\n\u2502   \u251c\u2500\u2500 de_meta.jsonl.gz # Metadata, in JSONLines format\n\u2502   \u251c\u2500\u2500 de_sha256.txt    # Checksum file \n\u2502   \u2514\u2500\u2500 de.txt.gz        # Textual content\n\u251c\u2500\u2500 en\n\u2502   \u251c\u2500\u2500 en_meta_part_1.jsonl.gz # Multipart example\n\u2502   \u251c\u2500\u2500 en_meta_part_2.jsonl.gz # Each part is independent,\n\u2502   \u251c\u2500\u2500 en_part_1.txt.gz        # Ex: en_part_2.txt.gz and en_meta_part_2.jsonl.gz\n\u2502   \u251c\u2500\u2500 en_part_2.txt.gz\n\u2502   \u2514\u2500\u2500 en_sha256.txt\n\u251c\u2500\u2500 yi\n\u2502   \u251c\u2500\u2500 yi_meta.jsonl.gz\n\u2502   \u251c\u2500\u2500 yi_sha256.txt\n\u2502   \u2514\u2500\u2500 yi.txt.gz\n\u2514\u2500\u2500 zh\n    \u251c\u2500\u2500 zh_meta.jsonl.gz\n    \u251c\u2500\u2500 zh_sha256.txt\n    \u2514\u2500\u2500 zh.txt.gz\n
"},{"location":"versions/oscar-2109/#file-formats","title":"File formats","text":""},{"location":"versions/oscar-2109/#txt-files","title":".txt files","text":"

Lines are newline-separated, and documents are double-newline separated. In other terms, there is a blank line between each document.

"},{"location":"versions/oscar-2109/#jsonl-files","title":".jsonl files","text":"

These are the metadata, in JSONLines format.

Each line conforms to the following JSON Schema:

{\n\"$schema\": \"http://json-schema.org/draft-07/schema#\",\n\"title\": \"Metadata\",\n\"description\": \"Holds record headers.\\n\\nEach metadata is linked to a specific paragraph/text zone\",\n\"type\": \"object\",\n\"required\": [\n\"headers\",\n\"nb_sentences\",\n\"offset\"\n],\n\"properties\": {\n\"headers\": {\n\"type\": \"object\",\n\"additionalProperties\": {\n\"type\": \"string\"\n}\n},\n\"nb_sentences\": {\n\"type\": \"integer\",\n\"format\": \"uint\",\n\"minimum\": 0.0\n},\n\"offset\": {\n\"type\": \"integer\",\n\"format\": \"uint\",\n\"minimum\": 0.0\n}\n}\n}\n

Example:

{\n\"headers\":{                  // these headers keys are *almost* always present.\n\"content-length\":\"11062\", // the content length is not changed and reflects the \n// length before filtering and eventual deduplication.\n\"warc-target-uri\":\"...\",\n\"warc-type\":\"conversion\",\n\"content-type\":\"text/plain\",\n\"warc-date\":\"2021-02-24T17:55:29Z\", // Following WARC specification, it is the crawl date.\n\"warc-identified-content-language\":\"eng,zho\",\n\"warc-refers-to\":\"<urn:uuid:c649de0e-42a3-4e69-b675-98e28e084698>\",\n\"warc-block-digest\":\"sha1:V4PYYGYA6ZYA2WACDKSNL6NXGDN6XK6X\",\n\"warc-record-id\":\"<urn:uuid:121a822f-5362-4559-8891-d085415cdd90>\"\n},\n\"offset\":0, // Related text is in the text file, from lines offset+1 to lines offset+nb_sentences.\n\"nb_sentences\":9\n}\n

"},{"location":"versions/oscar-2109/#lang_sha256txt-files","title":"<lang>_sha256.txt files","text":"

These are used to check for possible corruption during download. They can be used by running sha256sum -c <lang>_sha256.txt.

[^1]: gsw is the ISO 639-2 code for Alemannic German. It was identified as als in previous OSCAR versions, due to a bug in fasttext. [^2]: The eml identification tag is deprecated and corresponds to the rgn and egl tags in ISO 639-3.

"},{"location":"versions/oscar-2201/","title":"OSCAR 22.01","text":"

OSCAR 22.01 is the OSCAR version from January 2022, generated from the November/December 2021 dump of Common Crawl. It features a different file layout that makes it not backward compatible with code designed to run with previous OSCAR versions.

Request access \ud83e\udd17 Datasets Read the paper

"},{"location":"versions/oscar-2201/#summary","title":"Summary","text":"

OSCAR 22.01 is document-oriented, which means that rather than extracting lines and sorting them in language subcorpora, we identify documents as a whole. The main differences are that sentences in a document are contiguous and should make sense one after another, but sentences are not guaranteed to be of the subcorpus' language.

Note

As an example, the English Wikipedia page about La Marseillaise contains sentences in French (The anthem's lyrics). In line-oriented corpora, these sentences would have been put in the French subcorpus. In OSCAR 22.01, they should be along with the article, in a document classified as English.

"},{"location":"versions/oscar-2201/#layout","title":"Layout","text":"

As in previous corpora, there is one subcorpus per language, plus one new subcorpus for multilingual documents. Subcorpora are distributed in JSONLines, split into 1GB chunks, then gzipped.

Note

Splits are completely independent and self-contained: It is possible to only download en_meta_134.jsonl.gz and to do processing on it.

"},{"location":"versions/oscar-2201/#example-document","title":"Example document","text":"
{\n\"content\":\"newline\\nseparaaaaaaaaaaated\\ncontent\", // (1)\n\"warc_headers\":{ // (2) \n\"warc-refers-to\":\"<urn:uuid:83f2e1d4-5ed3-41db-86ff-f7826c4c20f9>\", \"warc-date\":\"2021-09-16T11:07:14Z\",\n\"warc-block-digest\":\"sha1:X3OWP47FG2O5LBNMFSNB44FJF2SSRC26\",\n\"warc-type\":\"conversion\",\n\"warc-identified-content-language\":\"eng\",\n\"content-length\":\"1694\",\n\"warc-target-uri\":\"https://foo.bar\",\n\"warc-record-id\":\"<urn:uuid:3304bc27-17d0-4ffd-a692-340381478a5f>\",\n\"content-type\":\"text/plain\"\n},\n\"metadata\":{\n// (3)\n\"identification\":{\n\"label\":\"en\",\n\"prob\":0.6268374\n},\n// (4)\n\"annotation\":[\n\"short_sentences\",\n\"footer\"\n],\n// (5)\n\"sentence_identifications\":[\n{\n\"label\":\"en\",\n\"prob\":0.93925816\n},\nnull,\n{\n\"label\":\"en\",\n\"prob\":0.9606543\n}\n]\n}\n}\n
  1. Content. Lines are separated by \\n.
  2. Headers from the crawler. Note that nothing is changed, so the content length may be incorrect.
  3. Document-wide identification. prob is the weighted average of the confidence of identified lines.
  4. Annotations of the document. null if no annotation.
  5. Line-by-line identifications. null for each line that has no identification.
"},{"location":"versions/oscar-2201/#annotations","title":"Annotations","text":"
  • tiny: The document has a low (<5) number of lines.
  • short_sentences: The document has a high number (>50%) of short lines (<400 bytes)
  • header: The document has a high number of short lines at its head, suggesting the presence of low quality content.
  • footer: The document has a high number of short lines at its tail, suggesting the presence of low quality content.
  • noisy: The document has a high percentage of punctuation (>50%)
  • adult: The document contains adult content. This annotation uses a blocklist and labels a tiny part of the corpus: It does not catch most of the adult content.

More information about the thresholds and annotators is available in our paper.

"},{"location":"versions/oscar-2201/#filtering","title":"Filtering","text":"

Tip

Filtering can be done using oscar-tools, a high performance toolkit that provides rapid and efficient ways of transforming corpora into what you need. More info here.

Filtering can be done using classic Python tools, such as ujson. While we don't supply a Python library enabling easy filtering/transformation for OSCAR 22.01, we provide some filtering examples that you can change to better suit your needs.

"},{"location":"versions/oscar-2201/#getting-documents-that-come-from-wikipedia-only","title":"Getting documents that come from Wikipedia only","text":"

Using filters on warc_headers.warc-target-uri makes filtering on URLs easy.

TODO\n
"},{"location":"versions/oscar-2201/#extracting-lines-from-non-annotated-documents","title":"Extracting lines from non-annotated documents","text":"

Non-annotated documents are suspected to be cleaner than annotated ones, so extracting their content should be interesting to do. We extract lines from documents where metadata.annotation == null.

TODO\n
"},{"location":"versions/oscar-2201/#getting-alemannic-lines-from-the-german-corpus","title":"Getting Alemannic lines from the German corpus","text":"

As detailed in our paper, we found that the German corpus contains a significant amount of Alemannic relative to the size of the Alemannic corpus. We use a filter on metadata.sentence_identifications to extract those sentences.

TODO\n
"},{"location":"versions/oscar-2201/#languages","title":"Languages","text":"

OSCAR 22.01 has subcorpora for 142 languages (counting the Multilingual corpus). The following table exhibits the size, number of documents and number of words for each of them.

Note that the size accounts for the raw uncompressed file size, counting metadata.

Language table Language Size # Documents # Words Multilingual 12.1 GB 1,210,685 936,187,711 Afrikaans 47.0 MB 12,393 6,227,310 Albanian 3.0 GB 437,287 326,325,149 Alemannic / Swiss German 363.6 kB 139 37,381 Amharic 461.0 MB 37,513 30,481,153 Arabic 84.2 GB 8,718,929 6,103,711,887 Aragonese 10.6 kB 12 51 Armenian 4.7 GB 379,267 268,031,270 Assamese 221.2 MB 17,084 11,109,557 Asturian 73.6 kB 77 3,919 Avaric 18.6 kB 14 582 Azerbaijani 3.5 GB 491,847 291,927,692 Bangla 15.1 GB 1,171,501 751,877,226 Bashkir 95.5 MB 11,198 5,418,474 Basque 1.1 GB 233,658 97,092,942 Belarusian 1.8 GB 180,046 107,227,860 Bihari languages 24.2 kB 27 569 Bishnupriya 2.0 MB 271 98,419 Bosnian 10.3 kB 10 422 Breton 33.7 MB 16,119 3,111,619 Bulgarian 35.1 GB 2,887,115 2,405,981,285 Burmese 1.9 GB 158,733 44,835,970 Catalan 13.9 GB 2,627,307 1,508,919,864 Cebuano 44.6 MB 5,742 5,253,785 Central Kurdish 716.4 MB 84,950 43,913,025 Chechen 14.0 MB 4,086 798,766 Chinese 900.9 GB 56,524,518 23,149,203,886 Chuvash 41.8 MB 4,750 2,465,782 Cornish 1.4 kB 2 55 Croatian 11.2 MB 11,462 505,369 Czech 58.6 GB 10,381,916 5,452,724,456 Danish 12.6 GB 2,265,479 1,454,439,292 Dimli (individual language) 706 Bytes 1 19 Divehi 217.2 MB 24,067 10,112,205 Dutch 114.0 GB 20,206,532 12,329,127,151 Eastern Mari 11.3 MB 1,612 641,525 Egyptian Arabic 2.8 MB 1,256 176,096 English 3.2 TB 431,992,659 377,376,402,775 Esperanto 558.3 MB 111,932 58,416,628 Estonian 9.2 GB 1,362,524 820,975,443 Filipino 646.5 MB 70,394 81,881,278 Finnish 37.8 GB 4,948,961 2,900,615,928 French 382.2 GB 52,037,098 41,713,990,658 Galician 255.2 MB 88,803 27,051,212 Georgian 7.1 GB 488,588 281,430,479 German 496.7 GB 70,075,424 46,826,676,844 Goan Konkani 787.2 kB 46 38,831 Greek 78.3 GB 6,738,546 5,031,242,803 Guarani 9.0 kB 10 374 Gujarati 4.8 GB 136,467 301,170,777 Hebrew 30.3 GB 3,132,396 2,249,377,984 Hindi 23.3 GB 1,529,907 1,534,799,198 Hungarian 53.9 GB 6,866,062 4,598,787,907 Icelandic 2.0 GB 396,183 210,365,124 Ido 77.3 kB 105 2,690 
Iloko 97.9 kB 75 8,592 Indonesian 17.4 GB 2,244,622 1,984,195,207 Interlingua 40.2 kB 6 10,125 Irish 45.6 MB 12,233 4,877,850 Italian 229.3 GB 28,502,092 24,294,684,830 Japanese 258.7 GB 36,328,931 5,592,948,356 Javanese 152.7 kB 70 10,441 Kalmyk 9.3 kB 9 250 Kannada 2.6 GB 150,850 108,450,571 Karachay-Balkar 119.6 kB 91 4,089 Kazakh 2.9 GB 261,085 157,267,307 Khmer 1.9 GB 121,910 30,564,131 Komi 119.9 kB 127 3,335 Korean 51.8 GB 5,881,481 3,854,968,649 Kurdish 150.3 MB 29,906 17,390,759 Kyrgyz 518.6 MB 62,244 28,028,986 Lao 337.1 MB 28,914 6,682,982 Latin 4.1 MB 4,397 187,446 Latvian 8.2 GB 1,032,987 707,361,898 Lezghian 375.5 kB 124 19,250 Limburgish 1.4 kB 2 41 Lithuanian 20.0 GB 2,303,070 1,712,802,056 Lojban 1.9 MB 570 260,542 Lombard 2.6 kB 2 225 Low German 9.0 MB 1,938 1,012,561 Lower Sorbian 707 Bytes 1 17 Luxembourgish 15.8 MB 5,108 1,545,946 Macedonian 3.6 GB 341,775 244,058,579 Maithili 21.6 kB 23 483 Malagasy 57.3 MB 3,028 7,279,056 Malay 5.3 MB 5,228 217,818 Malayalam 4.1 GB 250,972 137,831,247 Maltese 2.5 MB 2,208 118,190 Marathi 3.3 GB 250,376 160,179,233 Mazanderani 128.2 kB 76 7,337 Minangkabau 6.0 MB 585 614,613 Mingrelian 7.6 MB 2,550 253,333 Mongolian 2.8 GB 237,719 176,405,432 Nahuatl languages 8.7 kB 12 179 Nepali 3.7 GB 391,947 177,885,116 Newari 5.7 MB 1,134 273,837 Norwegian 2.8 GB 973,188 279,182,902 Norwegian Nynorsk 6.8 MB 5,835 459,183 Occitan 2.1 MB 373 31,061 Odia 487.9 MB 52,942 23,755,902 Ossetic 13.9 MB 3,560 800,430 Pashto 490.3 MB 50,312 46,293,249 Persian 77.4 GB 7,665,871 6,430,164,396 Piedmontese 1.7 MB 698 188,270 Polish 139.0 GB 19,301,137 12,584,498,906 Portuguese 170.3 GB 23,735,707 18,441,864,893 Punjabi 1.1 GB 68,094 70,068,604 Quechua 744 Bytes 1 14 Romanian 49.2 GB 4,624,764 5,261,803,995 Russia Buriat 32.9 kB 39 785 Russian 1.1 TB 76,060,844 62,811,122,663 Sakha 65.6 MB 6,284 3,473,813 Sanskrit 136.0 MB 4,472 5,671,369 Scottish Gaelic 137.7 kB 136 7,769 Serbian 6.9 GB 577,472 482,932,670 Serbian (Latin) 931.8 kB 738 
92,875 Sicilian 1.5 kB 2 50 Sindhi 117.1 MB 15,516 10,685,611 Sinhala 2.0 GB 108,593 113,179,741 Slovak 16.5 GB 2,409,555 1,619,121,944 Slovenian 1.2 GB 351,894 118,400,246 Somali 2.1 kB 3 109 South Azerbaijani 14.1 MB 5,381 693,746 Spanish 381.9 GB 51,386,247 42,829,835,316 Sundanese 5.0 MB 263 547,145 Swahili 1.3 MB 462 123,050 Swedish 48.0 GB 7,541,278 5,078,331,128 Tajik 870.9 MB 46,366 56,627,727 Tamil 11.4 GB 556,772 452,343,748 Tatar 915.3 MB 76,398 51,875,265 Telugu 3.4 GB 249,756 137,752,065 Thai 66.1 GB 5,030,254 1,626,779,846 Tibetan 234.5 MB 18,683 2,286,269 Turkish 75.1 GB 10,826,031 6,421,221,358 Turkmen 4.4 MB 2,485 276,632 Ukrainian 48.8 GB 4,558,214 2,879,585,992 Emiliano-Romagnolo[eml] 901 Bytes 1 53 Upper Sorbian 132.8 kB 110 8,825 Urdu 3.4 GB 336,994 332,816,354 Uyghur 201.9 MB 18,556 11,240,889 Uzbek 19.9 MB 9,526 1,370,842 Vietnamese 98.9 GB 9,587,233 12,283,185,482 Volap\u00fck 825.9 kB 661 57,039 Walloon 105.7 kB 138 4,386 Waray 7.6 MB 933 830,872 Welsh 409.3 MB 90,378 49,488,495 Western Frisian 75.3 MB 21,946 6,357,929 Western Mari 743.5 kB 155 43,916 Western Panjabi 46.7 MB 6,790 4,060,419 Wu Chinese 137.2 kB 88 3,056 Yiddish 232.5 MB 23,418 15,809,780 Yoruba 24.7 kB 26 1,042 Multilingual 12.1 GB 1,210,685 936,187,711"},{"location":"versions/oscar-2301/","title":"OSCAR 23.01","text":"

OSCAR 23.01 is the January 2023 version of the OSCAR Corpus based on the November/December 2022 dump of Common Crawl. While being quite similar to OSCAR 22.01, it contains several new features, including KenLM-based adult content detection, precomputed Locality-Sensitive Hashes for near deduplication, and blocklist-based categories. OSCAR 23.01 has also moved from gzip to Zstandard compression. You might already have zstd installed on your system, but if not, please check the Zstandard website for installation instructions.

Tip

OSCAR 23.01 is similar to OSCAR 22.01. As such, please also check out the documentation for OSCAR 22.01 if you need detailed information about metadata.

"},{"location":"versions/oscar-2301/#access","title":"Access","text":"

Note

If you already have access to the corpus, there's nothing to do! Go up in the file hierarchy on the link you've been given, and you should find the new corpus.

Access to the OSCAR Corpus changes depending on your status. More info on our dedicated page.

Getting access

"},{"location":"versions/oscar-2301/#new-features","title":"New Features","text":""},{"location":"versions/oscar-2301/#categories","title":"Categories","text":"

OSCAR 22.01 leveraged the UT1 Blocklists project to attempt to classify some adult content present in OSCAR. The OSCAR 23.01 pipeline iterated on this to include all of the blocklists provided by UT1.

Warning

The UT1 Blocklists page lists all the categories along with a short description. We strongly encourage you to read the descriptions if you plan on using them. Please also note that these descriptions are in French. We're working on an English translation of them.

Note

A document can belong to multiple categories.

These categories are in a field that is at this path: metadata.categories.

Example

{\n\"content\":\"foo\",\n\"metadata\": {\n// ...\n\"categories\": [\"blog\", \"news\"],\n// ...\n}\n// ...\n}\n
"},{"location":"versions/oscar-2301/#kenlm-based-adult-content-filtering","title":"KenLM-based Adult Content Filtering","text":"

For a select number of subcorpora, a measure of perplexity has been added. This perplexity comes from a KenLM model trained on harmful content, previously gathered by using the adult annotation in OSCAR 22.01. In other words, the lower the perplexity, the more likely it is that a given document contains harmful/adult content.

Danger

This feature can be considered as unstable/unsafe, since we also want to evaluate its impact on particular issues.

As such, we do not provide a boolean value indicating if a given document can be harmful/adult content, but rather the raw perplexity. We have found a threshold that works well in English, but encourage you to experiment with it and to report back your findings.

"},{"location":"versions/oscar-2301/#locality-sensitive-hashing","title":"Locality Sensitive Hashing","text":"

We use TLSH to compute a hash for each document.

Locality sensitive hashing is a hashing method that computes similar hashes for similar documents.

This can be used to do both exact and near deduplication. Identical documents have identical hashes (the reverse might not be true), so you only need to check for identity amongst documents with identical hashes. TLSH hashes can also be compared to yield a distance metric. According to the original paper, a cutoff of < 40 yields a false positive rate of 0.07% and a detection rate of 49.6%, while a cutoff of < 100 yields a false positive rate of 6.43% and a detection rate of 94.5%. You should choose a value that suits your purposes.

The above is true for the default version of TLSH which is used in packages such as py-tlsh. OSCAR 23.01 uses a TLSH with a hyperparameter of 256 buckets (Full hash), and 3 byte checksums (collision rate : 1 in 5800) instead of 1 byte checksums (collision rate : 1 in 24).

If you would like to use py-tlsh, follow these instructions (You need CMake installed to perform the necessary modifications and build):

# download py-tlsh source package\npip download python-tlsh\n# unpack the source tar.gz and enter the directory\ntar -xvf python-tlsh-4.5.0.tar.gz && cd python-tlsh-4.5.0\n# run the following command to implement the changes\n# alternatively, you can use vi or a text editor\n# change TLSH_BUCKETS_128 into TLSH_BUCKETS_256 and change TLSH_CHECKSUM_1B into TLSH_CHECKSUM_3B\nsed -i 's/set(TLSH_BUCKETS_128 1)/set(TLSH_BUCKETS_256 1)/g; s/set(TLSH_CHECKSUM_1B 1)/set(TLSH_CHECKSUM_3B 1)/g' CMakeLists.txt\n\n# build and activate pip venv if not already done\n# python3 -m venv ~/.venv\nsource ~/.venv/bin/activate\n# build and install the new py-tlsh\npython3 setup.py install\n

Hashes are at metadata.tlsh.

"},{"location":"versions/oscar-2301/#minor-changes","title":"Minor changes","text":"
  • metadata.annotations has been renamed metadata.quality_warnings, and only contains length based quality warnings (see the OSCAR 2201 documentation for details).
  • Some language tags have changed to better respect the BCP47:
    • als has become gsw. Previously, als was erroneously used as the tag for Alemannic/Swiss German, whereas it is the tag for Tosk Albanian.
    • eml has become x-eml. The eml tag is deprecated and as such has been replaced by a private tag (x-eml).
"},{"location":"versions/oscar-2301/#layout","title":"Layout","text":"
{\n\"content\":\"English sentence\\nphrase en fran\u00e7ais\\n????????????\", // (1)\n\"warc_headers\":{ // (2)\n\"warc-identified-content-language\":\"fra,eng\",\n\"warc-target-uri\":\"https://fr.wikipedia.org/wiki/...\",\n\"warc-record-id\":\"<urn:uuid:29eaa920-d299-4b1d-b687-c72bd8d68116>\",\n\"warc-type\":\"conversion\",\n\"content-length\":\"35298\", // (3)\n\"warc-refers-to\":\"<urn:uuid:39e42055-0d94-4e45-9c6c-9e7056635d64>\",\n\"warc-block-digest\":\"sha1:WFH2A5WHCS2H365GIAFYQPI7UOAMFGHB\", // (3)\n\"warc-date\":\"2022-11-26T09:45:47Z\",\n\"content-type\":\"text/plain\"\n},\n\"metadata\":{\n\"identification\":{ // (4)\n\"label\":\"fr\",\n\"prob\":0.8938327\n},\n\"harmful_pp\":4063.1814, // (5)\n\"tlsh\":\"tlsh:T125315FF2B6088901EEA097015DB39B4600B...\", // (6)\n\"quality_warnings\":[ // (7)\n\"short_sentences\",\n\"header\",\n\"footer\"\n],\n\"categories\":[ // (8)\n\"examen_pix\",\n\"liste_bu\"\n],\n\"sentence_identifications\":[ // (9)\n{\n\"label\":\"fr\",\n\"prob\":0.99837273\n},\n{\n\"label\":\"en\",\n\"prob\":0.9992377\n},\nnull\n]\n}\n}\n

Some important notes:

  1. Newline-separated content.
  2. Headers from the crawled dumps are left untouched. See the WARC specification for more info.
  3. Since warc_headers are copied and content can be altered by Ungoliant at generation stage, content-length and warc-block-digest can be different from actual values.
  4. Document-level identification. Computation details can be found on the OSCAR 22.01 paper.
  5. Perplexity of the document, computed using a KenLM model trained on harmful content. See this pre-print for more info. The lower this number is, the higher the probability that it will contain harmful or adult content. This annotation will be changed from harmful_pp to harmful_ppl in future releases.
  6. Locality Sensitive Hash of the documents' content, using TLSH. Useful for both exact and near deduplication.
  7. (Corresponds to annotations pre-23.01) Potential quality warnings. Based on content/sentence length. See the OSCAR 22.01 paper for more info.
  8. Blocklist-based categories. Uses the UT1 Blocklist, plus custom additions. Please refer to the UT1 website for categories description. Note that the categories are in French.
  9. Sentence-level identifications. A null value means no identification with a good enough threshold (>0.8 on 23.01).
"},{"location":"versions/oscar-2301/#language-table","title":"Language table","text":"Code Language # docs # words Content Length : 0 af Afrikaans 23,994 6,217,024 37.2 MB 1 sq Albanian 1,342,790 462,694,599 3.2 GB 2 am Amharic 119,434 40,262,809 512.9 MB 3 ar Arabic 25,012,116 10,081,452,882 110.7 GB 4 an Aragonese 34 264 11.0 kB 5 hy Armenian 1,056,974 336,045,041 4.9 GB 6 as Assamese 89,542 24,395,215 412.1 MB 7 ast Asturian 440 10,917 74.1 kB 8 av Avaric 44 1,073 18.6 kB 9 az Azerbaijani 1,159,994 316,850,330 3.0 GB 10 bn Bangla 3,474,086 1,092,983,765 19.1 GB 11 ba Bashkir 128,248 26,036,637 363.7 MB 12 eu Basque 678,474 136,672,615 1.2 GB 13 be Belarusian 445,612 164,729,607 2.3 GB 14 bh Bihari languages 48 507 6.8 kB 15 bpy Bishnupriya 2,346 346,947 5.4 MB 16 bs Bosnian 20 395 3.0 kB 17 br Breton 36,338 4,759,407 31.4 MB 18 bg Bulgarian 8,933,998 3,635,273,738 44.1 GB 19 my Burmese 430,276 82,433,836 3.0 GB 20 ca Catalan 6,953,898 2,240,460,836 15.3 GB 21 ceb Cebuano 16,174 6,263,404 41.1 MB 22 ckb Central Kurdish 182,508 61,334,746 772.9 MB 23 ce Chechen 11,686 1,051,752 13.9 MB 24 zh Chinese 138,478,270 44,378,380,161 1.4 TB 25 cv Chuvash 16,652 3,039,925 42.3 MB 26 kw Cornish 8 80 432 Bytes 27 hr Croatian 31,808 3,542,961 26.5 MB 28 cs Czech 34,859,632 9,717,378,559 77.0 GB 29 da Danish 7,214,338 2,217,634,340 14.8 GB 30 dv Divehi 77,060 10,655,359 200.1 MB 31 nl Dutch 72,552,688 19,564,553,306 135.0 GB 32 mhr Eastern Mari 9,502 1,615,215 22.9 MB 33 arz Egyptian Arabic 3,958 385,511 3.7 MB 34 en English 1,235,510,986 523,869,288,690 3.4 TB 35 eo Esperanto 226,924 67,774,923 474.8 MB 36 et Estonian 3,601,904 938,296,892 8.0 GB 37 tl Filipino 250,558 110,560,444 719.2 MB 38 fi Finnish 14,471,710 4,198,143,883 41.1 GB 39 fr French 158,334,998 62,127,088,294 430.5 GB 40 gl Galician 248,762 38,345,625 255.7 MB 41 ka Georgian 1,343,036 373,935,158 8.4 GB 42 de German 206,598,430 73,848,586,648 594.7 GB 43 gom Goan Konkani 398 121,035 2.3 MB 44 el Greek 
20,282,864 7,691,622,692 95.7 GB 45 gn Guarani 14 260 2.2 kB 46 gu Gujarati 425,552 417,001,705 5.6 GB 47 ht Haitian Creole 2 20,671 93.1 kB 48 he Hebrew 3,997,888 1,697,158,891 18.0 GB 49 hi Hindi 5,514,454 2,475,605,444 32.6 GB 50 hu Hungarian 21,349,372 16,013,364,289 150.1 GB 51 is Icelandic 1,210,232 294,471,539 2.2 GB 52 io Ido 224 2,598 16.1 kB 53 ilo Iloko 144 4,411 28.0 kB 54 id Indonesian 7,109,778 3,228,020,221 23.4 GB 55 ia Interlingua 34 9,384 33.5 kB 56 ie Interlingue 2 0 881 Bytes 57 ga Irish 29,894 9,054,923 63.2 MB 58 it Italian 89,021,606 36,327,274,203 259.4 GB 59 ja Japanese 94,236,404 4,401,059,165 181.2 GB 60 jv Javanese 172 3,286 25.7 kB 61 xal Kalmyk 2 27 315 Bytes 62 kn Kannada 448,500 124,924,350 2.6 GB 63 krc Karachay-Balkar 496 8,385 122.4 kB 64 kk Kazakh 677,622 214,679,857 3.3 GB 65 km Khmer 450,660 59,880,231 3.2 GB 66 kv Komi 460 5,909 70.3 kB 67 ko Korean 15,147,698 3,435,866,935 38.1 GB 68 ku Kurdish 80,338 25,921,607 174.1 MB 69 ky Kyrgyz 144,288 32,062,783 489.3 MB 70 lo Lao 118,374 10,659,203 472.1 MB 71 la Latin 14,384 307,865 2.0 MB 72 lv Latvian 2,435,882 845,459,899 7.4 GB 73 lez Lezghian 676 60,634 856.6 kB 74 li Limburgish 6 169 1.4 kB 75 lt Lithuanian 5,182,028 1,674,362,574 14.5 GB 76 jbo Lojban 572 312,315 1.5 MB 77 lmo Lombard 112 3,269 21.0 kB 78 nds Low German 5,248 1,612,175 10.7 MB 79 dsb Lower Sorbian 8 84 664 Bytes 80 lb Luxembourgish 18,090 2,514,838 18.4 MB 81 mk Macedonian 1,063,298 389,344,425 4.7 GB 82 mai Maithili 46 467 6.8 kB 83 mg Malagasy 10,830 1,416,430 11.2 MB 84 ms Malay 11,500 238,477 2.6 MB 85 ml Malayalam 800,936 236,597,838 5.8 GB 86 mt Maltese 5,180 149,886 1.3 MB 87 mr Marathi 729,578 252,706,331 4.5 GB 88 mzn Mazanderani 384 16,115 169.2 kB 89 min Minangkabau 2,436 305,589 3.8 MB 90 xmf Mingrelian 7,318 283,316 6.1 MB 91 mwl Mirandese 4 54 423 Bytes 92 mn Mongolian 1,061,710 454,350,415 5.8 GB 93 multi Multilingual 2,948,202 1,251,676,406 11.9 GB 94 nah Nahuatl languages 38 279 2.4 kB 95 ne 
Nepali 1,152,156 278,901,036 4.9 GB 96 new Newari 1,996 229,703 4.0 MB 97 no Norwegian 2,797,378 373,160,033 2.6 GB 98 nn Norwegian Nynorsk 19,470 575,518 3.7 MB 99 oc Occitan 920 34,701 405.0 kB 100 or Odia 158,426 31,963,340 543.1 MB 101 os Ossetic 8,628 3,935,964 50.7 MB 102 ps Pashto 87,408 30,196,179 261.6 MB 103 fa Persian 23,813,882 9,609,206,698 93.2 GB 104 pms Piedmontese 2,524 510,087 3.1 MB 105 pl Polish 57,184,826 18,073,705,588 147.1 GB 106 pt Portuguese 36,062,800 15,172,557,311 105.0 GB 107 pa Punjabi 222,058 104,235,418 1.4 GB 108 qu Quechua 2 13 143 Bytes 109 ro Romanian 11,985,668 6,302,600,833 45.6 GB 110 bxr Russia Buriat 72 698 8.2 kB 111 ru Russian 194,143,422 78,032,029,344 1.1 TB 112 sah Sakha 17,566 4,288,051 68.8 MB 113 sa Sanskrit 16,802 2,479,345 56.3 MB 114 gd Scottish Gaelic 776 18,458 146.1 kB 115 sr Serbian 1,677,896 632,781,822 7.7 GB 116 sh Serbian (Latin) 3,214 166,517 816.4 kB 117 sd Sindhi 48,566 14,667,207 131.6 MB 118 si Sinhala 301,066 172,755,385 2.6 GB 119 sk Slovak 8,931,784 2,704,716,280 21.5 GB 120 sl Slovenian 1,112,560 192,816,743 1.4 GB 121 so Somali 6 51 503 Bytes 122 azb South Azerbaijani 26,364 2,029,729 28.4 MB 123 es Spanish 153,574,556 63,388,237,965 429.9 GB 124 su Sundanese 18 258 2.0 kB 125 sw Swahili 1,664 164,459 1.0 MB 126 sv Swedish 21,891,348 6,993,719,601 50.0 GB 127 gsw Swiss German 342 34,328 232.7 kB 128 tg Tajik 144,932 76,987,285 1.0 GB 129 ta Tamil 1,638,238 738,824,392 15.8 GB 130 tt Tatar 262,654 59,253,765 833.8 MB 131 te Telugu 644,712 201,575,815 3.9 GB 132 th Thai 14,845,900 2,224,483,018 92.0 GB 133 bo Tibetan 62,352 6,062,558 531.6 MB 134 tr Turkish 26,654,330 8,290,890,087 73.7 GB 135 tk Turkmen 4,576 325,786 3.3 MB 136 uk Ukrainian 10,059,992 3,183,842,018 44.7 GB 137 x-eml Emiliano-Romagnol 4 329 1.8 kB 138 hsb Upper Sorbian 402 15,827 123.2 kB 139 ur Urdu 887,004 434,023,273 3.8 GB 140 ug Uyghur 51,304 14,659,554 219.8 MB 141 uz Uzbek 15,806 1,665,960 15.3 MB 142 vi Vietnamese 
33,933,994 22,424,984,210 140.8 GB 143 vo Volap\u00fck 896 49,968 371.9 kB 144 wa Walloon 390 6,347 34.3 kB 145 war Waray 1,494 19,665 126.8 kB 146 cy Welsh 151,512 52,250,043 333.0 MB 147 fy Western Frisian 45,458 9,885,788 70.4 MB 148 mrj Western Mari 496 60,180 765.8 kB 149 pnb Western Panjabi 12,904 11,844,695 105.8 MB 150 wuu Wu Chinese 136 1,199 26.8 kB 151 yi Yiddish 47,438 14,287,370 171.7 MB 152 yo Yoruba 128 2,396 16.6 kB"}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 0000000..fa20972 --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,58 @@ + + + + None + 2024-06-05 + daily + + + None + 2024-06-05 + daily + + + None + 2024-06-05 + daily + + + None + 2024-06-05 + daily + + + None + 2024-06-05 + daily + + + None + 2024-06-05 + daily + + + None + 2024-06-05 + daily + + + None + 2024-06-05 + daily + + + None + 2024-06-05 + daily + + + None + 2024-06-05 + daily + + + None + 2024-06-05 + daily + + \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz new file mode 100644 index 0000000..61d6cf8 Binary files /dev/null and b/sitemap.xml.gz differ diff --git a/static/media/oscar.png b/static/media/oscar.png new file mode 100644 index 0000000..84b97f2 Binary files /dev/null and b/static/media/oscar.png differ diff --git a/tools/generation-jeanzay/index.html b/tools/generation-jeanzay/index.html new file mode 100644 index 0000000..1566330 --- /dev/null +++ b/tools/generation-jeanzay/index.html @@ -0,0 +1,1006 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Generating a new OSCAR Version on Jean Zay - OSCAR Documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

Generating a new OSCAR Version on Jean Zay

+

Compile the latest version of Ungoliant

+

This is currently preferred to just getting it from cargo install ungoliant.

+
    +
  • git clone https://github.com/oscar-project/ungoliant
  • +
  • Open an interactive session on a compil node: srun --partition=compil -A <GROUP ID>@cpu --pty bash
  • +
  • Run module load llvm boost cargo (boost and llvm are necessary for compiling KenLM and FastText)
  • +
  • Run cd ungoliant
  • +
  • Run cargo b --release --features kenlm
  • +
+

Download the data from CommonCrawl

+

We advise the use of the prepost partition for downloading the data from Common Crawl. However, please bear in mind that jobs are limited to 20 hours in the prepost partition, meaning that you'll likely run out of time before completing the download of a whole Common Crawl dump.

+
    +
  • Download the wet.paths.gz file for the latest release (likely here)
  • +
  • gzip -d wet.paths.gz
  • +
+

Create a dl_corpus.slurm file with the following text inside:

+
#! /bin/bash
+
+#SBATCH --partition=prepost
+#SBATCH --job-name=get_cc # create a short name for your job
+#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)
+#SBATCH --mail-user=<YOUR MAIL>    # Where to send mail
+#SBATCH --nodes="1" #Combien de nœuds
+#SBATCH --ntasks-per-node="1" # Une tâche par GPU
+#SBATCH --cpus-per-task="64" # nombre de coeurs à réserver par tâche
+#SBATCH --time="20:00:00" # temps d'exécution maximum demande (HH:MM:SS)
+#SBATCH -A <GROUP ID>@cpu
+
+export CARGO_HOME=<CARGO HOME PATH (in SCRATCH if you can>
+export PATHS_FILE=<PATH TO wet.PATHS>
+export DST=<DESTINATION>
+
+
+./target/release/ungoliant download $PATHS_FILE $DST
+
+

When the time has run out, you have to ensure that the last downloaded shards weren't corrupted (because of a potential kill while downloading).

+

Then, after potentially removing faulty shards, run the following slurm job. +The only difference with the previous one is the use of the -o n parameter on ungoliant download, which will ignore the first n lines of the wet.paths. +You can/should also use another DESTINATION folder, and then do the merge by hand.

+
#! /bin/bash
+
+#SBATCH --partition=prepost
+#SBATCH --job-name=get_cc # create a short name for your job
+#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)
+#SBATCH --mail-user=<YOUR MAIL>    # Where to send mail
+#SBATCH --nodes="1" #Combien de nœuds
+#SBATCH --ntasks-per-node="1" # Une tâche par GPU
+#SBATCH --cpus-per-task="64" # nombre de coeurs à réserver par tâche
+#SBATCH --time="20:00:00" # temps d'exécution maximum demande (HH:MM:SS)
+#SBATCH -A <GROUP ID>@cpu
+
+export CARGO_HOME=<CARGO HOME PATH (in SCRATCH if you can>
+export PATHS_FILE=<PATH TO wet.PATHS>
+export DST=<DESTINATION>
+
+
+./target/release/ungoliant download -o <NB_DOWNLOADED> $PATHS_FILE $DST
+
+

You can then check that no shards are missing:

+
import os
+
+shards_dir = "./shards"
+paths_file = "wet.paths"
+cc_rooturl = "https://data.commoncrawl.org/"
+
+missing_shards = list()
+for i in range(88000):
+    if not os.path.isfile(f"{shards_dir}/{i}.txt.gz"):
+        missing_shards.append(i)
+print(f"missing {len(missing_shards)} shards")
+
+with open(paths_file) as f:
+    shard_paths = f.readlines()
+    for missing_shard_number in missing_shards:
+        print(
+            f"wget -nc {cc_rooturl}{shard_paths[missing_shard_number].strip()} -O {missing_shard_number}.txt.gz"
+        )
+
+

This will give you the wget commands to get the missing shards, with a -nc param to avoid overwriting already existing files.

+

Generate OSCAR

+

When you have your shards ready, create a new SLURM file with:

+

We use a QoS of t4 because we can only use one node and corpus generation time is likely >20h, so we need the 100-hour time limit.

+

Other strategies could be tested (for example, splitting CC data into 4 buckets and launching 4 ungoliant jobs). +The resulting datasets would then have to be merged back together. +Note that in that case, rebuild files will be less efficient (since we'll have 4 of them).

+
#! /bin/bash
+
+#SBATCH --partition=cpu_p1
+#SBATCH --job-name=gen_oscar # create a short name for your job
+#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)
+#SBATCH --mail-user=<YOUR MAIL>    # Where to send mail
+#SBATCH --nodes="1" #Combien de nœuds
+#SBATCH --ntasks-per-node="1" # Une tâche par GPU
+#SBATCH --cpus-per-task="40" # nombre de coeurs à réserver par tâche
+#SBATCH --time="100:00:00" # temps d'exécution maximum demande (HH:MM:SS)
+#SBATCH --qos=qos_cpu-t4
+#SBATCH -A <GROUP ID>@cpu
+
+export CARGO_HOME=<CARGO HOME PATH>
+export CC_FOLDER=<SHARDS PATH>
+export KENLM_FOLDER=<PATH TO KENLMS MODELS IF APPLICABLE>
+export CORPUS=<DESTINATION FOLDER>
+export BLOCKLIST=<BLOCKLIST FOLDER (must contain subfolders with category names..)>
+export LID_PATH=<PATH TO FASTTEXT LangID>
+export UNGOLIANT_PATH=<PATH TO UNGOLIANT BINARY>
+
+$UNGOLIANT_PATH pipeline $CC_FOLDER $CORPUS --blocklist-path $BLOCKLIST --kenlms-path $KENLM_FOLDER --lid-path $LID_PATH
+
+

As of Jan. 2023, using ungoliant 1.3.0 ([c14acc8](https://github.com/oscar-project/ungoliant/tree/c14acc8c6a87913d138a022cf4819024d66b3e06)), with an 88,000-shard dump of CommonCrawl (November/December 2022, ~9.5TB compressed), this process took around 20 hours and yielded a corpus weighing around 12TB (uncompressed).

+

Move OSCAR

+

Files in $SCRATCH are deleted after 30 days if no R/W is operated on them. You should move out files to $STORE if you plan on keeping them. +Unfortunately, due to the file size, you'll need to launch another job to do the copying of the files.

+
+

Warning

+

rsync -n enables a dry-run, enabling you to see which files would be moved, and where. Remove the -n parameter when you want to perform the actual copy.

+
+
#! /bin/bash
+
+#SBATCH --partition=prepost
+#SBATCH --job-name=copy_oscar # create a short name for your job
+#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)
+#SBATCH --mail-user=julien.abadji@inria.fr    # Where to send mail
+#SBATCH --nodes="1" #Combien de nœuds
+#SBATCH --ntasks-per-node="1" # Une tâche par GPU
+#SBATCH --cpus-per-task="4" # nombre de coeurs à réserver par tâche
+#SBATCH --time="20:00:00" # temps d'exécution maximum demande (HH:MM:SS)
+#SBATCH -A <GROUP ID>@cpu
+
+export SRC=<CORPUS SOURCE>
+export DST=<CORPUS DESTINATION>
+
+rsync -anvP $SRC $DST
+
+

On the same example as before, copying took around 9 hours.

+

Preparing for release

+

Splitting

+

We use oscar-tools to split the corpus.

+
+

Note

+

At the time of writing, oscar-tools is not available via crates.io/cargo install, so you have to compile it from source. Luckily, it's easy.

+
+
+Compiling oscar-tools +
    +
  1. Get the source: git clone https://github.com/oscar-project/oscar-tools
  2. +
  3. Go inside a compil node: srun --partition=compil -A <GROUP ID>@cpu --pty bash
  4. +
  5. cd oscar-tools
  6. +
  7. CARGO_HOME=<Somewhere not in your ~, like $SCRATCH/.cargo> cargo b --features zstd --release.
  8. +
  9. Wait ~some hours~
  10. +
  11. That's it! Your binary sits at target/release/oscar-tools.
  12. +
+
+
#! /bin/bash
+
+#SBATCH --partition=prepost
+#SBATCH --job-name=split_oscar # create a short name for your job
+#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)
+#SBATCH --mail-user=<Your email address>    # Where to send mail
+#SBATCH --nodes="1" #Combien de nœuds
+#SBATCH --ntasks-per-node="1" # Une tâche par GPU
+#SBATCH --cpus-per-task="10" # nombre de coeurs à réserver par tâche
+#SBATCH --time="20:00:00" # temps d'exécution maximum demande (HH:MM:SS)
+#SBATCH -A <group id>@cpu
+
+export OSCAR_TOOLS_BIN=<path to oscar-tools binary>
+export CORPUS=<path to corpus>
+export DST=<where the split corpus will be put>
+
+$OSCAR_TOOLS_BIN v2 split $CORPUS $DST -s 2000
+
+

This step took around 3 hours (assuming both CORPUS and DST are on $SCRATCH).

+

Compressing

+
#! /bin/bash
+
+#SBATCH --partition=prepost
+#SBATCH --job-name=compress_oscar # create a short name for your job
+#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)
+#SBATCH --mail-user=<email address>    # Where to send mail
+#SBATCH --nodes="1" #Combien de nœuds
+#SBATCH --ntasks-per-node="1" # Une tâche par GPU
+#SBATCH --cpus-per-task="48" # nombre de coeurs à réserver par tâche
+#SBATCH --time="20:00:00" # temps d'exécution maximum demande (HH:MM:SS)
+#SBATCH -A <group id>@cpu
+
+export OSCAR_TOOLS_BIN=<link to oscar-tools binary>
+export CORPUS=<path to split corpus>
+export DST=<where the compressed corpus will be saved>
+
+$OSCAR_TOOLS_BIN v2 compress $CORPUS $DST
+
+

This step took around 2 hours, going from 12TB to 3.3TB

+

Checksumming

+

The last step is to create checksum files for each language, so that people can check that their downloads have been successful. +Also, it acts as a split list for download-oscar.

+
#! /bin/bash
+
+#SBATCH --partition=prepost
+#SBATCH --job-name=compress_oscar # create a short name for your job
+#SBATCH --mail-type=BEGIN,END,FAIL          # Mail events (NONE, BEGIN, END, FAIL, ALL)
+#SBATCH --mail-user=<email address>    # Where to send mail
+#SBATCH --nodes="1" #Combien de nœuds
+#SBATCH --ntasks-per-node="1" # Une tâche par GPU
+#SBATCH --cpus-per-task="48" # nombre de coeurs à réserver par tâche
+#SBATCH --time="20:00:00" # temps d'exécution maximum demande (HH:MM:SS)
+#SBATCH -A <group id>@cpu
+
+export OSCAR_TOOLS_BIN=<link to oscar-tools binary>
+export CORPUS=<path to split corpus>
+
+$OSCAR_TOOLS_BIN v2 checksum $CORPUS
+
+

The process took around 2 hours.

+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tools/oscar-tools/index.html b/tools/oscar-tools/index.html new file mode 100644 index 0000000..a212865 --- /dev/null +++ b/tools/oscar-tools/index.html @@ -0,0 +1,909 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + oscar-tools - OSCAR Documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

oscar-tools

+

oscar-tools is a toolkit that was created along with OSCAR-2201 to make operations on the corpus easy and fast.

+

At its core, oscar-tools provides a set of operations targeted at a given OSCAR version. As such, you shouldn't expect to have all operations available on all OSCAR versions. For example, at the time of writing, deduplicate is not available for OSCAR 22.01-like corpora.

+

The CLI of oscar-tools is still a bit messy and can be confusing, because we are actively working on it and on implementing essential features.

+

Installation

+

From releases

+
+

Note

+

Binaries are not available yet.

+
+

From cargo

+
+

Note

+

cargo install oscar-tools is not available yet.

+
+

From repository

+
+

Note

+

This could evolve rapidly.

+
+

Right now the latest version sits on the dev-oscario branch, where we're slowly replacing inline IO blocks by our Corpus IO library, oscar-io.

+
$> git clone https://github.com/oscar-corpus/oscar-tools #Clone the repository
+$> cd oscar-tools
+$> git checkout dev-oscario #Change branch
+$> cargo b --release #Build the project. 
+$> # Building might take some time because of 
+$> # the parquet dependency that will soon be optional.
+$> touch target/release/oscar-tools #Binary is here and self-sufficient.
+
+

Usage

+

oscar-tools --help might help you find the parameters/operations you're looking for.

+
+

Note

+

In the tool, v1 corresponds to 2019-like corpora, whereas v2 corresponds to 22.01-like corpora.

+
+

Each operation has different parameters.

+

v1 / OSCAR 2019

+

At the time of writing, the only operation available is dedup. It uses runiq to deduplicate corpora.

+
oscar-tools-v1-dedup 
+line deduplication
+
+USAGE:
+    oscar-tools v1 dedup [ARGS]
+
+ARGS:
+    <SOURCE>         Corpus source file.
+    <DESTINATION>    Corpus destination file. Should not exist.
+
+OPTIONS:
+    -h, --help    Print help information
+
+

v2 / OSCAR 22.01

+

There are a lot more operations implemented on OSCAR 22.01-like corpora.

+

extract-tags

+

extract-tags extracts documents that meet certain annotation constraints.

+
oscar-tools-v2-extract-tags 
+Extracts a OSCAR v2 corpus restricting tags. Included tags must be present and excluded ones must be
+absent. Use --clean to extract documents with no annotation only
+
+USAGE:
+    oscar-tools v2 extract-tags [OPTIONS] [--] [ARGS]
+
+ARGS:
+    <SOURCE>         Corpus source file/folder. If folder, splits corpus files in provided
+                     folder
+    <DESTINATION>    Corpus source file/folder. If folder, splits corpus files in provided
+                     folder
+
+OPTIONS:
+        --clean                only return documents with no tags. include and exclude will be
+                               ignored
+    -e, --exclude <tags>...    space separated tags to exclude.
+    -h, --help                 Print help information
+    -i, --include <tags>...    space separated tags to include.
+
+

extract-text

+

extract-text "converts" a 2201-like corpus into a 2019-like corpus, by removing all metadata and only storing sentences. Keep in mind that while the format will be similar to 2019-like corpora, the filtering is a bit different and lines from other languages won't be stripped.

+
Extract text from documents. The output will be a OSCAR v1 (2019)-compatible corpus.
+
+USAGE:
+    oscar-tools v2 extract-text [OPTIONS] <SOURCE> <DESTINATION>
+
+ARGS:
+    <SOURCE>         Corpus source file.
+    <DESTINATION>    Corpus destination file (OSCAR v1 (2019)-like)
+
+OPTIONS:
+        --del_src    If set, deletes source files as they are being extracted.
+    -h, --help       Print help information
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/versions/mOSCAR/index.html b/versions/mOSCAR/index.html new file mode 100644 index 0000000..5a55357 --- /dev/null +++ b/versions/mOSCAR/index.html @@ -0,0 +1,2201 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + mOSCAR - OSCAR Documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

mOSCAR

+

mOSCAR is, to the best of our knowledge, the first large-scale multilingual and multimodal document corpus crawled from the web. It covers 163 languages, 315M documents, 214B tokens and 1.2B images. We carefully conduct a set of filtering and evaluation steps to make sure mOSCAR is sufficiently safe, diverse and of good quality.

+

Access

+

Access to mOSCAR is granted via the Hugging Face Hub.

+

All data is available at https://huggingface.co/datasets/oscar-corpus/mOSCAR.

+

Layout

+

To Come ...

+

Language table

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Lang. nameCodeFamilyScript# documents# images# tokens
Acehneseace_LatnLatin7,80332,4612,889,134
Mesopotamian Arabicacm_ArabArabic2,27410,6201,047,748
Tunisian Arabicaeb_ArabArabic7,64041,5702,715,187
Afrikaansafr_LatnLatin54,895247,77439,956,585
South Levantine Arabicajp_ArabArabic12,09887,8375,167,813
Tosk Albanianals_LatnLatin861,6782,569,164452,737,251
Amharicamh_EthiGe'ez39,588152,64635,089,019
North Levantine Arabicapc_ArabArabic19,904128,9669,560,701
Modern Standard Arabicarb_ArabArabic3,936,85115,126,9313,401,919,964
Najdi Arabicars_ArabArabic60,229296,74143,610,873
Moroccan Arabicary_ArabArabic142,386698,051204,723,454
Egyptian Arabicarz_ArabArabic835,5294,054,632653,626,387
Assameseasm_BengBengali3,9489,210640,390
Asturianast_LatnLatin165,745962,72337,547,944
Awadhiawa_DevaDevanagari29,324107,4834,961,635
Central Aymaraayr_LatnLatin27,384151,8895,148,970
South Azerbaijaniazb_ArabArabic8,27438,2335,256,693
North Azerbaijaniazj_LatnLatin516,0211,808,060257,825,849
Bashkirbak_CyrlCyrillic4,53217,1743,038,766
Bambarabam_LatnLatin7,67439,1901,243,332
Balineseban_LatnLatin1,88611,266542,015
Belarusianbel_CyrlCyrillic63,309287,53972,976,520
Bembabem_LatnLatin1,0967,4791,340,471
Bengaliben_BengBengali270,406947,03535,858,814
Bhojpuribho_DevaDevanagari6,36628,131875,463
Banjarbjn_LatnLatin5,42727,8031,898,526
Bosnianbos_LatnLatin1,960,5997,633,0491,255,000,505
Buginesebug_LatnLatin3,31218,648588,678
Bulgarianbul_CyrlCyrillic2,591,99811,670,0281,760,971,620
Catalancat_LatnLatin1,153,8644,736,634606,447,390
Cebuanoceb_LatnLatin16,99091,23410,748,818
Czechces_LatnLatin3,918,83713,291,3092,823,172,996
Central Kurdishckb_ArabArabic36,725136,56622,322,689
Crimean Tatarcrh_LatnLatin6,37624,1241,742,727
Welshcym_LatnLatin40,408165,89727,748,345
Danishdan_LatnLatin2,076,2989,559,6001,238,277,499
Germandeu_LatnLatin20,662,69687,976,2008,544,986,218
Southwestern Dinkadik_LatnLatin1,7126,6351,319,943
Greekell_GrekGreek4,916,08115,209,0582,923,201,041
Englisheng_LatnLatin52,215,013207,904,31533,570,108,782
Esperantoepo_LatnLatin25,157124,99628,586,195
Estonianest_LatnLatin1,040,3685,217,366619,215,048
Basqueeus_LatnLatin849,0433,445,539277,145,498
Faroesefao_LatnLatin15,41160,3406,691,327
Fijianfij_LatnLatin1,5288,776487,388
Finnishfin_LatnLatin2,396,03310,365,3331,781,044,864
Frenchfra_LatnLatin20,305,73978,179,60114,362,579,829
Friulianfur_LatnLatin37,290256,4565,949,600
Nigerian Fulfuldefuv_LatnLatin1,5687,124401,852
West Central Oromogaz_LatnLatin4,05811,7631,786,093
Scottish Gaelicgla_LatnLatin29,710153,24914,605,090
Irishgle_LatnLatin68,858315,13247,438,400
Galicianglg_LatnLatin518,9732,381,475217,063,180
Guaranigrn_LatnLatin490,9452,416,63389,921,114
Gujaratiguj_GujrGujarati23,06291,3203,324,866
Haitian Creolehat_LatnLatin257,7451,570,69962,847,106
Hausahau_LatnLatin25,364104,93413,089,932
Hebrewheb_HebrHebrew1,109,5914,766,483893,327,320
Hindihin_DevaDevanagari579,4301,830,667122,558,353
Chhattisgarhihne_DevaDevanagari1,5817,263273,174
Croatianhrv_LatnLatin1,719,6178,425,5101,010,674,096
Hungarianhun_LatnLatin3,534,50615,390,0832,831,715,050
Armenianhye_ArmnArmenian339,9621,141,885205,635,952
Igboibo_LatnLatin11,52968,0498,701,070
Ilocanoilo_LatnLatin78,872523,1958,116,113
Indonesianind_LatnLatin7,016,29117,324,7773,981,843,468
Icelandicisl_LatnLatin244,6761,027,465137,015,973
Italianita_LatnLatin12,937,15347,476,9718,311,790,842
Javanesejav_LatnLatin24,785135,58316,908,805
Japanesejpn_JpanKanji14,415,29223,893,7688,923,348,944
Kabylekab_LatnLatin18,508106,7304,079,553
Kannadakan_KndaBrahmicKannada12,97842,6211,442,776
Kashmirikas_ArabArabic3,10911,4085,731,910
Georgiankat_GeorCaucasianGeorgian354,4361,304,281275,223,026
Kazakhkaz_CyrlCyrillic252,242732,648140,049,214
Halh Mongoliankhk_CyrlCyrillic124,412508,21784,535,241
Khmerkhm_KhmrAustroasiatic24,495122,2433,043,925
Kinyarwandakin_LatnLatin30,401172,20112,049,616
Kyrgyzkir_CyrlCyrillic53,010199,71334,404,281
Northern Kurdishkmr_LatnLatin39,262164,66623,834,960
Koreankor_HangHanja2,614,08913,563,2832,006,080,705
Laolao_Laoo50,611208,76831,029,380
Ligurianlij_LatnLatin8,75156,2662,958,179
Limburgishlim_LatnLatin189,5471,076,04742,534,327
Lingalalin_LatnLatin24,614152,1324,053,459
Lithuanianlit_LatnLatin1,688,8118,869,4431,161,476,040
Lombardlmo_LatnLatin30,506151,8559,058,614
Latgalianltg_LatnLatin11,94861,6244,148,492
Luxembourgishltz_LatnLatin44,987246,34616,676,872
Gandalug_LatnLatin1,8787,215789,917
Mizolus_LatnLatin7,88026,8174,978,472
Standard Latvianlvs_LatnLatin896,2434,141,648587,653,855
Magahimag_DevaDevanagari1,0973,847205,763
Malayalammal_Mlym14,14052,6791,689,010
Marathimar_DevaDevanagari50,391163,8686,689,250
Minangkabaumin_LatnLatin9,34135,3091,256,931
Macedonianmkd_CyrlCyrillic542,2501,853,070307,232,151
Maltesemlt_LatnLatin120,888709,24236,097,957
Maorimri_LatnLatin24,322130,13724,957,914
Burmesemya_Mymr8,14444,188539,527
Dutchnld_LatnLatin17,096,72765,606,0139,670,041,731
Norwegian Nynorsknno_LatnLatin199,3551,012,31367,799,774
Norwegian Bokmalnob_LatnLatin2,229,7029,698,1281,294,178,095
Nepalinpi_DevaDevanagari31,239127,1933,138,539
Nyanjanya_LatnLatin12,04767,1928,596,769
Occitanoci_LatnLatin164,852671,88159,309,549
Odiaory_Orya4,31915,574378,635
Pangasinanpag_LatnLatin4,21432,287546,071
Eastern Panjabipan_Guru11,49746,1681,887,991
Papiamentopap_LatnLatin55,224363,01510,002,655
Southern Pastopbt_ArabArabic32,604110,80729,170,322
Western Persianpes_ArabArabic7,048,94625,200,5716,210,479,015
Plateau Malgasyplt_LatnLatin32,521120,67329,263,848
Polishpol_LatnLatin14,549,60560,639,24411,104,144,109
Portuguesepor_LatnLatin8,145,66426,530,4234,760,063,083
Dariprs_ArabArabic515,0412,589,859517,053,967
Ayacucho Quechuaquy_LatnLatin1,57811,817362,690
Romanianron_LatnLatin5,180,17117,964,0483,548,291,261
Rundirun_LatnLatin20,00167,0968,686,054
Russianrus_CyrlCyrillic15,913,84569,542,82818,909,213,208
Sangosag_LatnLatin2,12413,556454,455
Sicilianscn_LatnLatin73,199424,36227,110,743
Sinhalasin_Sinh58,767221,18314,270,972
Slovakslk_LatnLatin3,008,59915,067,2341,963,804,563
Slovenianslv_LatnLatin1,472,0257,210,285935,834,754
Samoansmo_LatnLatin12,34671,35914,954,824
Shonasna_LatnLatin12,69868,7826,112,600
Sindhisnd_ArabArabic21,09574,28917,647,825
Somalisom_LatnLatin77,343301,42934,554,975
Southern Sothosot_LatnLatin7,71843,1466,156,450
Spanishspa_LatnLatin22,713,36678,361,08714,616,773,475
Sardiniansrd_LatnLatin675,5394,059,493106,159,957
Serbiansrp_CyrlCyrillic604,5572,286,171401,223,741
Sundanesesun_LatnLatin44,310236,02513,627,832
Swedishswe_LatnLatin3,302,73010,860,5181,779,284,152
Swahiliswh_LatnLatin137,134593,41859,454,896
Silesianszl_LatnLatin23,535132,4595,996,972
Tamiltam_TamlDravidianTamil36,196167,6694,834,946
Tatartat_CyrlCyrillic37,188143,84222,831,350
Telugutel_TeluBrahmicTelugu22,97481,0332,273,772
Tajiktgk_CyrlCyrillic125,236417,59190,503,778
Tagalogtgl_LatnLatin151,437673,81497,708,639
Thaitha_ThaiThai2,983,83711,621,7862,839,211,104
Tigrinyatir_EthiGe'ez2,6578,7071,725,422
Tok Pisintpi_LatnLatin5,06335,169460,853
Turkmentuk_LatnLatin13,02457,3549,766,999
Turkishtur_LatnLatin4,478,70012,401,0912,394,669,068
Twitwi_LatnLatin3,30513,634495,220
Uyghuruig_ArabArabic10,71341,7096,785,318
Ukrainianukr_CyrlCyrillic2,721,42410,929,7961,928,351,595
Urduurd_ArabArabic407,0981,239,125242,007,283
Northern Uzbekuzn_LatnLatin156,632798,15589,022,562
Venetianvec_LatnLatin330,6111,830,77771,077,531
Vietnamesevie_LatnLatin12,621,52147,411,48811,616,191,199
Wolofwol_LatnLatin4,65820,3801,596,432
Xhosaxho_LatnLatin25,950142,38715,809,823
Eastern Yiddishydd_Hebr12,48657,51017,369,727
Yorubayor_LatnLatin56,700286,93332,614,558
Yue Chineseyue_Hant33,671203,51324,172,441
Chinese (Simplified)zho_HansHanzi9,861,26236,152,7548,078,842,701
Chinese (Traditional)zho_HantHant3,967,96616,307,2582,962,854,441
Standard Malayzsm_LatnLatin1,179,7445,488,632432,667,199
Zuluzul_LatnLatin30,717156,63911,345,288
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/versions/oscar-2019/index.html b/versions/oscar-2019/index.html new file mode 100644 index 0000000..5875a26 --- /dev/null +++ b/versions/oscar-2019/index.html @@ -0,0 +1,2647 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + OSCAR 2019 - OSCAR Documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

OSCAR 2019

+

OSCAR 2019 is the original 2019 release of the OSCAR corpus. +It has been generated from the Common Crawl corpus using the goclassy architecture.

+

Features

+

OSCAR 2019 is shuffled at line level and no metadata is provided. Thus it is mainly intended to be used in the training of unsupervised language models for NLP.

+

Data is distributed by language in both original and deduplicated form.

+

If you need the unshuffled version of OSCAR, please contact us using the contact form. Please include your name, affiliation, contact details, which languages you need and a brief description of how you intend to use OSCAR. You can also download it using HuggingFace’s datasets library.

+

Even though OSCAR is not Postcardware, we do appreciate when our users send us a postcard. If you want to send us one, you can find the address in the contact section down below.

+

Citing OSCAR

+

If you use OSCAR to train a language model, text generation model or any other ML model in general please consider citing our latest paper:

+
@inproceedings{ortiz-suarez-etal-2020-monolingual,
+    title = "A Monolingual Approach to Contextualized Word Embeddings for Mid-Resource Languages",
+    author = "Ortiz Su{\'a}rez, Pedro Javier  and
+      Romary, Laurent  and
+      Sagot, Beno{\^\i}t",
+    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
+    month = jul,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/2020.acl-main.156",
+    pages = "1703--1714",
+    abstract = "We use the multilingual OSCAR corpus, extracted from Common Crawl via language classification, filtering and cleaning, to train monolingual contextualized word embeddings (ELMo) for five mid-resource languages. We then compare the performance of OSCAR-based and Wikipedia-based ELMo embeddings for these languages on the part-of-speech tagging and parsing tasks. We show that, despite the noise in the Common-Crawl-based OSCAR data, embeddings trained on OSCAR perform much better than monolingual embeddings trained on Wikipedia. They actually equal or improve the current state of the art in tagging and parsing for all five languages. In particular, they also improve over multilingual Wikipedia-based contextual embeddings (multilingual BERT), which almost always constitutes the previous state of the art, thereby showing that the benefit of a larger, more diverse corpus surpasses the cross-lingual benefit of multilingual embedding architectures.",
+}
+
+

The Unshuffled OSCAR

+

If you need a copy of any of the unshuffled sub-corpora, please contact us using the contact form down below. Please include your name, affiliation, contact details, which languages you need and a brief description of how you intend to use OSCAR. We will evaluate your request and answer accordingly.

+

Note: The unshuffled OSCAR is now available in HuggingFace’s datasets library. +They have obtained our permission to redistribute the unshuffled OSCAR and they allow users to download a corpus all at once as opposed to file by file. You can get more information about how to download OSCAR using their library by visiting OSCAR's dataset card.

+

Downloading OSCAR

+

All the data is distributed by language; both the original and the deduplicated versions of the data are available. To download a file, just click the desired link in the table below. Languages are split into shards of around 700MB; these shards are standalone. A plain text file with checksums is also provided.

+

The OSCAR corpus is yet to be filtered, so please be careful when using it, especially for text generation tasks! To see which sub-corpora have been audited, please refer to the list of publications above for more information.

+

You'll be asked to create a HumanID account in order to download a corpus. This is intended, and we do it in order to limit traffic and reduce abuse of the infrastructure. The OSCAR corpus is hosted by Huma-Num; you can read more about them on their website.

+

All sizes are for the uncompressed files.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LanguageWords originalSize originalFile originalWords deduplicatedSize deduplicatedFile deduplicated
Afrikaans43,482,801241Maf29,533,437163Maf
Albanian374,196,1102.3Gsq186,856,6991.2Gsq
Alemannic841,7505.0Mals459,0012.8Mals
Amharic28,301,601360Mam16,086,628206Mam
Arabic8,117,162,82882Gar3,171,221,35432Gar
Aragonese52,8961.3Man45,669801Kan
Armenian273,919,3883.7Ghy110,196,0431.5Ghy
Assamese6,956,663113Mas4,366,57071Mas
Asturian381,0052.4Mast325,2372.0Mast
Avaric24,720409Kav19,478324Kav
Azerbaijani322,641,7102.8Gaz167,742,2961.5Gaz
Bashkir9,796,764128Mba6,922,58990Mba
Basque120,456,652848Meu45,359,710342Meu
Bavarian399503bar399503bar
Belarusian144,579,6301.8Gbe83,499,0371.1Gbe
Bengali623,575,73311Gbn363,766,1435.8Gbn
Bihari8,848110Kbh2,87534Kbh
Bishnupriya198,2864.1Mbpy96,9401.7Mbpy
Bosnian106,448447Kbs20,485116Kbs
Breton5,013,24129Mbr2,890,38416Mbr
Bulgarian2,947,648,10632Gbg1,268,114,97714Gbg
Burmese56,111,1841.9Gmy30,102,1731.1Gmy
Catalan1,360,212,4508.0Gca729,333,4404.3Gca
Cebuano6,603,56739Mceb3,675,02424Mceb
Central Bikol312885bcl312885bcl
Central Khmer20,690,6101.1Gkm10,082,245581Mkm
Central Kurdish48,478,334487Mckb18,726,721226Mckb
Chavacano130520cbk130520cbk
Chechen711,0518.3Mce568,1466.7Mce
Chinese14,986,424,850508Gzh6,350,215,113249Gzh
Chuvash3,041,61439Mcv2,054,81026Mcv
Cornish8,32944Kkw2,70414Kkw
Croatian34,232,765226Mhr16,727,640110Mhr
Czech7,715,977,44153Gcs3,540,997,50924Gcs
Danish2,637,463,88916Gda1,620,091,3179.5Gda
Dhivehi7,559,472126Mdv4,726,66079Mdv
Dimli19146diq19146diq
Dutch13,020,136,37378Gnl6,598,786,13739Gnl
Eastern Mari565,9927.2Mmhr469,2976.0Mmhr
Egyptian Arabic7,305,15166Marz3,659,41933Marz
Emilian-Romagnol6,37625Keml6,12124Keml
English418,187,793,4082.3Ten215,841,256,9711.2Ten
Erzya901.4Kmyv781.2Kmyv
Esperanto48,486,161299Meo37,324,446228Meo
Estonian643,163,7304.8Get309,931,4632.3Get
Finnish3,196,666,41927Gfi1,597,855,46813Gfi
French46,896,036,417282Gfr23,206,776,649138Gfr
Galician102,011,291620Mgl63,600,602384Mgl
Georgian171,950,6213.6Gka91,569,7391.9Gka
German44,878,908,446308Gde21,529,164,172145Gde
Goan Konkani124,2772.2Mgom102,3061.8Mgom
Guarani7,38236Kgn4,68024Kgn
Gujarati72,045,7011.1Ggu50,023,432722Mgu
Haitian1,0143.9Kht8323.3Kht
Hebrew2,067,753,52820Ghe1,032,018,0569.8Ghe
Hindi1,372,234,78217Ghi745,774,9348.9Ghi
Hungarian5,163,936,34540Ghu2,339,127,55518Ghu
Icelandic219,900,0941.5Gis129,818,331846Mis
Ido25,702147Kio22,773130Kio
Iloko142,942874Kilo105,564636Kilo
Indonesian4,574,692,26530Gid2,394,957,62916Gid
Interlingua180,231662Kia100,019360Kia
Interlingue5,35224Kie6021.6Kie
Irish14,483,59388Mga10,017,30360Mga
Italian22,248,707,341137Git11,250,012,89669Git
Japanese4,962,979,182216Gja1,123,067,063106Gja
Javanese104,896659Kjv86,654583Kjv
Kalmyk10,277113Kxal10,155112Kxal
Kannada81,186,8631.7Gkn49,343,4621.1Gkn
Karachay-Balkar185,4362.6Mkrc166,4962.3Mkrc
Kazakh191,126,4692.7Gkk108,388,7431.5Gkk
Kirghiz44,194,823600Mky28,982,620388Mky
Komi201,4042.3Mkv95,2431.2Mkv
Korean2,368,765,14224Gko1,120,375,14912Gko
Kurdish15,561,00394Mku9,946,44060Mku
Lao4,133,311174Mlo2,583,342114Mlo
Latin4,122,20126Mla1,328,0388.3Mla
Latvian520,761,9774.0Glv236,428,9051.8Glv
Lezghian247,6463.3Mlez224,8713.0Mlez
Limburgan4,73029Kli4,28327Kli
Lithuanian1,159,661,7428.8Glt516,183,5253.9Glt
Lojban154,330736Kjbo141,973678Kjbo
Lombard75,229443Klmo73,665433Klmo
Low German2,906,34718Mnds2,146,41713Mnds
Lower Sorbian1,78713Kdsb9667.1Kdsb
Luxembourgish4,403,57729Mlb3,087,65021Mlb
Macedonian189,289,8732.1Gmk102,849,5951.2Gmk
Maithili69,161317Kmai87411Kmai
Malagasy3,068,36021Mmg1,872,04413Mmg
Malay16,696,882111Mms6,045,75342Mms
Malayalam189,534,4724.9Gml95,892,5512.5Gml
Maltese2,995,65424Mmt2,163,35817Mmt
Marathi162,609,4042.7Gmr82,130,8031.4Gmr
Mazanderani73,870691Kmzn64,481602Kmzn
Minangkabau5,682608Kmin4,825310Kmin
Mingrelian299,0985.8Mxmf228,6294.4Mxmf
Mirandese1711.2Kmwl1521.1Kmwl
Modern Greek5,479,180,13762Gel2,412,419,43527Gel
Mongolian181,307,1672.2Gmn68,362,013838Mmn
Nahuatl languages1,23412Knah1,19311Knah
Neapolitan5,28217Knap4,14713Knap
Nepali107,448,2081.8Gne71,628,3171.2Gne
Newari564,6975.5Mnew288,9954.1Mnew
Northern Frisian1,5164.4Kfrr1,5164.4Kfrr
Northern Luri8,02276Klrc6,74063Klrc
Norwegian1,344,326,3888.0Gno804,894,3774.7Gno
Norwegian Nynorsk14,764,98085Mnn9,435,13954Mnn
Occitan750,3015.8Moc512,6783.7Moc
Oriya14,938,567248Mor11,321,740188Mor
Ossetian1,031,26813Mos878,76511Mos
Pampanga130760pam52304pam
Panjabi61,847,806763Mpa37,555,835460Mpa
Persian9,096,554,12179Gfa4,363,505,31938Gfa
Piemontese362,0132.1Mpms337,2461.9Mpms
Polish15,277,255,137109Gpl6,708,709,67447Gpl
Portuguese20,641,903,898124Gpt10,751,156,91864Gpt
Pushto46,559,441361Mps31,347,348242Mps
Quechua10,18678Kqu8,69167Kqu
Romanian3,984,317,05825Gro1,741,794,06911Gro
Romansh1,0937.4Krm9606.5Krm
Russia Buriat96313Kbxr80911Kbxr
Russian92,522,407,8371.2Tru46,692,691,520568Gru
Sanskrit4,331,56993Msa1,713,93037Msa
Scottish Gaelic310,6891.9Mgd207,1101.3Mgd
Serbian364,395,4113.9Gsr207,561,1682.2Gsr
Serbo-Croatian5,292,18425Msh1,040,5735.8Msh
Sicilian5543.3Kscn4682.8Kscn
Sindhi43,530,158347Msd33,028,015263Msd
Sinhala93,053,4651.4Gsi50,864,857802Msi
Slovak1,322,247,7639.1Gsk656,346,1794.5Gsk
Slovenian387,399,7002.5Gsl193,926,6841.3Gsl
Somali1,20261Kso47216Kso
South Azerbaijani2,175,05427Mazb1,528,70919Mazb
Spanish47,545,122,279278Ges25,928,290,729149Ges
Sundanese30,321211Ksu20,278141Ksu
Swahili2,211,92713Msw1,376,9638.1Msw
Swedish7,155,994,31244Gsv4,106,120,60825Gsv
Tagalog98,949,299573Mtl70,121,601407Mtl
Tajik31,758,142379Mtg21,029,893249Mtg
Tamil420,537,1329.3Gta226,013,3305.1Gta
Tatar51,034,893670Mtt23,825,695305Mtt
Telugu123,711,5172.5Gte79,094,1671.6Gte
Thai951,743,08736Gth368,965,20216Gth
Tibetan1,483,589187Mbo936,556138Mbo
Turkish7,577,388,70060Gtr3,365,734,28927Gtr
Turkmen1,113,86911Mtk752,3266.8Mtk
Tuvinian75912Ktyv5407.9Ktyv
Uighur8,657,141122Mug5,852,22583Mug
Ukrainian4,204,381,27653Guk2,252,380,35128Guk
Upper Sorbian545,3514.2Mhsb236,8671.8Mhsb
Urdu331,817,9822.7Gur218,030,2281.7Gur
Uzbek2,450,25621Muz1,381,64412Muz
Venetian3,49218Kvec3,19917Kvec
Vietnamese12,036,845,35968Gvi5,577,159,84332Gvi
Volapük321,1212.0Mvo318,5682.0Mvo
Walloon50,720273Kwa37,543203Kwa
Waray397,3152.5Mwar336,3112.2Mwar
Welsh37,422,441213Mcy23,574,673133Mcy
Western Frisian5,691,07735Mfy4,223,81626Mfy
Western Mari93,3381.2Mmrj87,7801.1Mmrj
Western Panjabi1,426,98612Mpnb1,111,1129.0Mpnb
Wu Chinese11,189109Kwuu4,33332Kwuu
Yakut2,547,62342Msah1,789,17426Msah
Yiddish13,834,320141Myi8,212,97084Myi
Yoruba8,90655Kyo3,51827Kyo
Yue Chinese1863.7Kyue1282.2Kyue
+

License

+

These data are released under this licensing scheme:

+
    +
  • We do not own any of the text from which these data have been extracted.
  • +
  • We license the actual packaging of these data under the Creative Commons CC0 license ("no rights reserved").
  • +
  • To the extent possible under law, Inria has waived all copyright and related or neighboring rights to OSCAR.
  • +
  • This work is published from: France.
  • +
+

+CC0 +

+

Notice and take down policy

+

Notice: Should you consider that our data contains material that is owned by you and should therefore not be reproduced here, please:

+
    +
  • Clearly identify yourself, with detailed contact data such as an address, telephone number or email address at which you can be contacted.
  • +
  • Clearly identify the copyrighted work claimed to be infringed.
  • +
  • Clearly identify the material that is claimed to be infringing and information reasonably sufficient to allow us to locate the material.
  • +
  • And use the contact form below.
  • +
+

Take down: We will comply with legitimate requests by removing the affected sources from the next release of the corpus.

+

Models

+

Here is a list of some language models that have been trained using the OSCAR corpus or that are part of the OSCAR project:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ModelLanguageCorpusAuthorsPaperFilesLicense
ELMoBulgarianOSCARPedro J. Ortiz, Benoît Sagot and Laurent RomaryACL 2020bg.zipMIT
ELMoBulgarianWikipediaPedro J. Ortiz, Benoît Sagot and Laurent RomaryACL 2020bg.zipMIT
ELMoCatalanOSCARPedro J. Ortiz, Benoît Sagot and Laurent RomaryACL 2020ca.zipMIT
ELMoCatalanWikipediaPedro J. Ortiz, Benoît Sagot and Laurent RomaryACL 2020ca.zipMIT
ELMoDanishOSCARPedro J. Ortiz, Benoît Sagot and Laurent RomaryACL 2020da.zipMIT
ELMoDanishWikipediaPedro J. Ortiz, Benoît Sagot and Laurent RomaryACL 2020da.zipMIT
ELMoFrenchOSCARPedro J. Ortiz, Yoann Dupont, Benjamin Muller, Laurent Romary and Benoît SagotLREC 2020fr.zipMIT
ELMoFinnishOSCARPedro J. Ortiz, Benoît Sagot and Laurent RomaryACL 2020fi.zipMIT
ELMoFinnishWikipediaPedro J. Ortiz, Benoît Sagot and Laurent RomaryACL 2020fi.zipMIT
ELMoIndonesianOSCARPedro J. Ortiz, Benoît Sagot and Laurent RomaryACL 2020id.zipMIT
ELMoIndonesianWikipediaPedro J. Ortiz, Benoît Sagot and Laurent RomaryACL 2020id.zipMIT
+ +

Here is a list of Language models trained by the community:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ModelLanguageCasedCorpusAuthorsPaperWebsiteFilesLicense
AraBERTArabicCasedOSCAR, Wikipedia, 1.5B words Arabic Corpus, OSIAN, AssafirWissam Antoun, Fady Baly and Hazem HajjACL AnthologyGitHubHugging FaceN/A
Arabic-BERTArabicCasedOSCAR and WikipediaAli SafayaArXivGitHubHugging FaceMIT
AraELECTRAArabicCasedOSCAR, Wikipedia, 1.5B words Arabic Corpus, OSIAN, AssafirWissam Antoun, Fady Baly and Hazem HajjArXiVGitHubHugging FaceN/A
AraGPT2ArabicCasedOSCAR, Wikipedia, 1.5B words Arabic Corpus, OSIAN, AssafirWissam Antoun, Fady Baly and Hazem HajjArXivGitHubHugging FaceN/A
CamemBERTFrenchCasedOSCARLouis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît SagotACL 2020camembert-model.frcamembert-base.tar.gzMIT
CamemBERTFrenchCasedSubsample of OSCAR (4 GB of text)Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît SagotACL 2020camembert-model.frcamembert-base-oscar-4gb.tar.gzMIT
LePetitFrenchCasedSubsample of OSCAR (2 GB of text)Vincent Micheli, Martin d'Hoffschmidt, Quentin HeinrichMedium blogilluin.techHugging FaceMIT
GigaBERTArabicCased and UncasedOSCAR, Wikipedia, GigawordWuwei Lan, Yang Chen, Wei Xu, Alan RitterEMNLP 2020GitHubHugging FaceMIT
ELECTRANorwegianCasedOSCAR and OPUSViktor AlmN/AHugging FaceHugging FaceN/A
BERTRomanianCasedOSCAR, Wikipedia and OPUSDumitrescu Stefan and Andrei AvramSOONGitHubHugging FaceMIT
BERTRomanianUncasedOSCAR, Wikipedia and OPUSDumitrescu Stefan and Andrei AvramSOONGitHubHugging FaceMIT
RoBERTaSinhalaN/AOSCARKeshan SodimanaN/AHugging FaceHugging FaceN/A
BERTTurkishCased and UncasedOSCAR, Wikipedia and OPUSStefan SchweterZenodoGitHubHugging FaceMIT
ELECTRATurkishCasedOSCAR, Wikipedia and OPUSStefan SchweterZenodoGitHubHugging FaceMIT
XLMIndicHindi, Bengali, Gujarati, Panjabi, Marathi, Oriya, Assamese, Sinhala, Nepali, Bihari, Bishnupriya, Maithili, Goan Konkani, SanskritCasedOSCARIbraheem Muhammad Moosa, Mahmud Shimul and Ashfia Binte HabibArxivGitHubHugging FaceMIT
+

If you have trained a model using the OSCAR corpus and would like to have it featured here, please open a pull request in our GitHub repo. Help us grow the community!

+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/versions/oscar-2109/index.html b/versions/oscar-2109/index.html new file mode 100644 index 0000000..c129b68 --- /dev/null +++ b/versions/oscar-2109/index.html @@ -0,0 +1,2486 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + OSCAR 21.09 - OSCAR Documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

OSCAR 21.09

+

Features

+

These are the versions of tooling, schemes and data

+
    +
  • CommonCrawl version: February/March 2021 (2021.10)
  • +
  • OSCAR Schema version: v1.1 : Incorporates metadata in a backward compatible manner.
  • +
  • Ungoliant version: v1 : New generation tool, faster and better documented/tested than the previous one: goclassy.
  • +
+

Changes

+
    +
  • As per OSCAR Schema v1.1, each document/record has associated metadata.
  • +
  • New languages: Manx, Rusyn, Scots and West Flemish. Their size and quality still have to be assessed.
  • +
  • Removed languages: Central Bikol and Cantonese. Cantonese was of very low quality. The Central Bikol corpus is still available in OSCAR 2019.
  • +
+

Table

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LanguageOSCAR 2019OSCAR 2019 deduplicatedOSCAR 21.09OSCAR 21.09 deduplicatedIssues
afAfrikaans251MB170MB258MB157MB
sqAlbanian2GB1GB3GB1GB
amAmharic377MB215MB405MB241MB
arArabic87GB33GB69GB35GB
anAragonese1MB822KB1MB608KB
hyArmenian3GB1GB4GB1GB
asAssamese117MB73MB135MB95MB
astAsturian2MB2MB7MB4MB
avAvaric418KB331KB421KB325KB
azAzerbaijani2GB1GB3GB1GB
bnBangla10GB6GB14GB7GB
baBashkir133MB93MB110MB77MB
euBasque889MB358MB900MB503MB
barBavarian507B507B2KB1KB
beBelarusian1GB1GB2GB1GB
bhBihari languages112KB34KB579KB120KB
bpyBishnupriya4MB1MB11MB4MB
bsBosnian459KB120KB310KB175KB
brBreton29MB16MB49MB23MB
bgBulgarian33GB14GB34GB15GB
myBurmese2GB1GB2GB1GB
yueCantonese3KB2KB--
caCatalan8GB4GB13GB6GB
cebCebuano40MB24MB81MB58MB
bclCentral Bikol886B886B--
ckbCentral Kurdish509MB236MB784MB367MB
cbkChavacano521B521B168B168B{{< issue cbk >}}
ceChechen8MB6MB29MB20MB
zhChinese544GB267GB500GB266GB
cvChuvash40MB27MB60MB41MB
kwCornish44KB14KB119KB72KB
hrCroatian237MB115MB361MB169MB
csCzech56GB25GB72GB33GB
daDanish16GB10GB18GB10GB
diqDimli (individual language)147B147B294B147B
dvDivehi131MB81MB143MB111MB
nlDutch82GB41GB97GB47GB
mhrEastern Mari7MB6MB15MB10MB
arzEgyptian Arabic68MB34MB48MB21MB
enEnglish2520GB1294GB2936GB1342GB
myvErzya1KB1KB29KB2KB
eoEsperanto312MB238MB560MB390MB
etEstonian5GB2GB7GB3GB
tlFilipino601MB426MB699MB383MB
fiFinnish28GB13GB35GB20GB
frFrench302GB147GB340GB161GB
glGalician650MB402MB989MB549MB
kaGeorgian3GB1GB6GB2GB
deGerman330GB155GB433GB184GB
gomGoan Konkani2MB1MB3MB2MB
elGreek66GB28GB72GB30GB
gnGuarani36KB23KB32KB25KB
guGujarati1GB756MB1GB950MB
htHaitian Creole3KB3KB2KB1KB
heHebrew21GB10GB29GB11GB
hiHindi17GB9GB26GB13GB
huHungarian42GB18GB60GB29GB
isIcelandic1GB887MB2GB1GB
ioIdo151KB133KB276KB221KB
iloIloko896KB653KB1MB857KB
idIndonesian32GB16GB40GB22GB
iaInterlingua678KB368KB291KB172KB
ieInterlingue24KB1KB7KB2KB
gaIrish91MB62MB131MB69MB
itItalian146GB73GB192GB94GB
jaJapanese231GB112GB208GB96GB
jvJavanese675KB598KB858KB728KB
xalKalmyk115KB114KB62KB62KB
knKannada1GB1GB2GB1GB
krcKarachay-Balkar2MB2MB2MB2MB
kkKazakh2GB1GB3GB1GB
kmKhmer1GB608MB1GB860MB
kvKomi2MB1MB1MB588KB
koKorean25GB11GB35GB15GB
kuKurdish98MB62MB152MB108MB
kyKyrgyz629MB406MB485MB334MB
loLao181MB118MB287MB163MB
laLatin26MB8MB103MB9MB
lvLatvian4GB1GB6GB2GB
lezLezghian3MB3MB2MB2MB
liLimburgish29KB27KB76KB54KB
ltLithuanian9GB4GB12GB5GB
jboLojban753KB694KB929KB731KB
lmoLombard454KB444KB1MB1MB
ndsLow German18MB13MB25MB17MB
dsbLower Sorbian13KB7KB31KB14KB
lbLuxembourgish30MB21MB54MB37MB
mkMacedonian2GB1GB3GB1GB
maiMaithili324KB10KB685KB24KB
mgMalagasy21MB13MB59MB38MB
msMalay116MB43MB146MB60MB
mlMalayalam5GB2GB4GB2GB
mtMaltese24MB17MB51MB26MB
gvManx--1KB907B
mrMarathi2GB1GB3GB1GB
mznMazanderani708KB617KB1MB1MB
minMinangkabau622KB317KB8MB1MB
xmfMingrelian6MB4MB16MB10MB
mwlMirandese1KB1KB3KB2KB
mnMongolian2GB879MB1GB912MB
nahNahuatl languages11KB10KB34KB21KB
napNeapolitan17KB13KB1KB1KB{{< issue nap >}}
neNepali1GB1GB3GB2GB
newNewari5MB4MB6MB4MB
frrNorthern Frisian4KB4KB7KB5KB{{< issue frr >}}
lrcNorthern Luri77KB64KB183B183B
noNorwegian Bokmål8GB5GB9GB4GB
nnNorwegian Nynorsk88MB56MB123MB66MB
ocOccitan6MB3MB12MB5MB
orOdia259MB196MB538MB357MB
osOssetic12MB10MB11MB6MB
pamPampanga763B307B3KB3KB
psPashto378MB253MB404MB286MB
faPersian84GB39GB79GB35GB
pmsPiedmontese2MB1MB4MB3MB
plPolish116GB50GB122GB48GB
ptPortuguese132GB67GB159GB71GB
paPunjabi799MB481MB769MB430MB
quQuechua80KB68KB322KB230KB
roRomanian26GB11GB37GB15GB
rmRomansh7KB6KB3KB3KB
bxrRussia Buriat12KB10KB22KB18KB
ruRussian1239GB609GB1201GB542GB
rueRusyn--247B247B
sahSakha43MB27MB57MB39MB
saSanskrit96MB38MB72MB43MB
scoScots--1KB1KB{{< issue sco >}}
gdScottish Gaelic1MB1MB2MB1MB
srSerbian4GB2GB6GB3GB
shSerbian (Latin)25MB6MB13MB9MB
scnSicilian3KB2KB4KB3KB
sdSindhi363MB274MB75MB50MB
siSinhala1GB840MB1GB791MB
skSlovak9GB4GB14GB6GB
slSlovenian2GB1GB4GB1GB
soSomali62KB15KB15KB13KB{{< issue so >}}
azbSouth Azerbaijani28MB19MB47MB29MB
esSpanish297GB159GB342GB160GB
suSundanese216KB145KB397KB274KB
swSwahili13MB8MB11MB7MB
svSwedish46GB26GB43GB19GB
tgTajik396MB260MB985MB321MB{{< issue tg >}}
taTamil9GB5GB10GB5GB
ttTatar701MB319MB947MB424MB
teTelugu2GB1GB3GB1GB
thThai38GB17GB62GB26GB
boTibetan195MB144MB439MB358MB
gsw[^1]Alemannic German5MB2MB7MB5MB
trTurkish63GB28GB73GB33GB{{< issue tr >}}
tkTurkmen10MB7MB25MB20MB
tyvTuvinian11KB8KB9KB7KB
ukUkrainian56GB29GB53GB28GB
emlEmiliano-Romagnolo[^2]25KB23KB22KB20KB
hsbUpper Sorbian4MB1MB2MB1MB
urUrdu2GB1GB2GB1GB
ugUyghur127MB86MB187MB123MB
uzUzbek21MB11MB56MB28MB
vecVenetian18KB16KB37KB28KB
viVietnamese72GB33GB87GB42GB
voVolapük2MB2MB2MB2MB
waWalloon280KB207KB511KB329KB
warWaray2MB2MB4MB4MB
cyWelsh223MB139MB307MB180MB
vlsWest Flemish--134B134B{{< issue vls >}}
fyWestern Frisian35MB26MB82MB57MB
mrjWestern Mari1MB1MB645KB521KB
pnbWestern Panjabi11MB9MB68MB45MB
wuuWu Chinese111KB32KB145KB69KB{{< issue wuu >}}
yiYiddish146MB87MB199MB93MB
yoYoruba56KB26KB229KB120KB
+

OSCAR Schema v1.1.0

+

The new OSCAR schema incorporates backward-compatible changes.

+

Changes

+

The old OSCAR Schema v1.0 featured the following file hierarchy, in an uncompressed form:

+
/
+├── af
+   ├── af_sha256.txt
+   └── af.txt.gz
+├── de
+   ├── de_sha256.txt    # Checksum file 
+   └── de.txt.gz        # Textual content
+├── en
+   ├── en_part_1.txt.gz        # Multipart example
+   ├── en_part_2.txt.gz
+   └── en_sha256.txt
+├── yi
+   ├── yi_sha256.txt
+   └── yi.txt.gz
+└── zh
+    ├── zh_sha256.txt
+    └── zh.txt.gz
+
+

The new OSCAR Schema v1.1 features the following file hierarchy (some languages omitted):

+
/
+├── af
+   ├── af_meta.jsonl.gz
+   ├── af_sha256.txt
+   └── af.txt.gz
+├── de
+   ├── de_meta.jsonl.gz # Metadata, in JSONLines format
+   ├── de_sha256.txt    # Checksum file 
+   └── de.txt.gz        # Textual content
+├── en
+   ├── en_meta_part_1.jsonl.gz # Multipart example
+   ├── en_meta_part_2.jsonl.gz # Each part is independent,
+   ├── en_part_1.txt.gz        # Ex: en_part_2.txt.gz and en_meta_part_2.jsonl.gz
+   ├── en_part_2.txt.gz
+   └── en_sha256.txt
+├── yi
+   ├── yi_meta.jsonl.gz
+   ├── yi_sha256.txt
+   └── yi.txt.gz
+└── zh
+    ├── zh_meta.jsonl.gz
+    ├── zh_sha256.txt
+    └── zh.txt.gz
+
+

File formats

+

.txt files

+

Lines are newline-separated, and documents are double-newline separated. +In other words, there is a blank line between consecutive documents.

+

.jsonl files

+

These are the metadata, in JSONLines format.

+

Each line conforms to the following JSON Schema:

+
{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Metadata",
+  "description": "Holds record headers.\n\nEach metadata is linked to a specific paragraph/text zone",
+  "type": "object",
+  "required": [
+    "headers",
+    "nb_sentences",
+    "offset"
+  ],
+  "properties": {
+    "headers": {
+      "type": "object",
+      "additionalProperties": {
+        "type": "string"
+      }
+    },
+    "nb_sentences": {
+      "type": "integer",
+      "format": "uint",
+      "minimum": 0.0
+    },
+    "offset": {
+      "type": "integer",
+      "format": "uint",
+      "minimum": 0.0
+    }
+  }
+}
+
+

Example: +

{
+   "headers":{                  // these headers keys are *almost* always present.
+      "content-length":"11062", // the content length is not changed and reflects the 
+                                // length before filtering and possible deduplication.
+      "warc-target-uri":"...",
+      "warc-type":"conversion",
+      "content-type":"text/plain",
+      "warc-date":"2021-02-24T17:55:29Z", // Following WARC specification, it is the crawl date.
+      "warc-identified-content-language":"eng,zho",
+      "warc-refers-to":"<urn:uuid:c649de0e-42a3-4e69-b675-98e28e084698>",
+      "warc-block-digest":"sha1:V4PYYGYA6ZYA2WACDKSNL6NXGDN6XK6X",
+      "warc-record-id":"<urn:uuid:121a822f-5362-4559-8891-d085415cdd90>"
+   },
+   "offset":0, // Related text is in the text file, from lines offset+1 to lines offset+nb_sentences.
+   "nb_sentences":9
+}
+

+

<lang>_sha256.txt files

+

These are used to check for possible corruption during download. +They can be used by running sha256sum -c <lang>_sha256.txt.

+

[^1]: gsw is the ISO 639-2 code for Alemannic German. It was identified as als in previous OSCAR versions, due to a bug in fasttext. +[^2]: The eml identification tag is deprecated and corresponds to the rgn and egl tags in ISO 639-3.

+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/versions/oscar-2201/index.html b/versions/oscar-2201/index.html new file mode 100644 index 0000000..d9f20a7 --- /dev/null +++ b/versions/oscar-2201/index.html @@ -0,0 +1,1853 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + OSCAR 22.01 - OSCAR Documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+ +
+ + + +
+
+ + + + + + + + +

OSCAR 22.01

+

OSCAR 2201 is the OSCAR version from January 2022, based on the November/December 2021 dump of Common Crawl. +It features a different file layout that makes it not backward compatible with code designed to run with previous OSCAR versions.

+

Request access +🤗 Datasets +Read the paper

+

Summary

+

OSCAR 22.01 is document-oriented, which means that rather than extracting lines and sorting them in language subcorpora, we identify documents as a whole. The main differences are that sentences in a document are contiguous and should make sense one after another, but sentences are not guaranteed to be of the subcorpus' language.

+
+

Note

+

As an example, the English Wikipedia page about La Marseillaise contains sentences in French (the anthem's lyrics). In line-oriented corpora, these sentences would have been put in the French subcorpus. In OSCAR 22.01, they stay with the rest of the article, in a document classified as English.

+
+

Layout

+

As in previous corpora, there is one subcorpus per language, plus one new subcorpus for multilingual documents. +Subcorpora are distributed in JSONLines, split into 1GB chunks, then gzipped.

+
+

Note

+

Splits are completely independent and self-contained: It is possible to only download en_meta_134.jsonl.gz and to do processing on it.

+
+

Example document

+
{
+  "content":"newline\nseparaaaaaaaaaaated\ncontent", // (1)
+  "warc_headers":{ // (2) 
+    "warc-refers-to":"<urn:uuid:83f2e1d4-5ed3-41db-86ff-f7826c4c20f9>", 
+    "warc-date":"2021-09-16T11:07:14Z",
+    "warc-block-digest":"sha1:X3OWP47FG2O5LBNMFSNB44FJF2SSRC26",
+    "warc-type":"conversion",
+    "warc-identified-content-language":"eng",
+    "content-length":"1694",
+    "warc-target-uri":"https://foo.bar",
+    "warc-record-id":"<urn:uuid:3304bc27-17d0-4ffd-a692-340381478a5f>",
+    "content-type":"text/plain"
+  },
+  "metadata":{
+    // (3)
+    "identification":{
+      "label":"en",
+      "prob":0.6268374
+    },
+
+    // (4)
+    "annotation":[
+      "short_sentences",
+      "footer"
+    ],
+
+    // (5)
+    "sentence_identifications":[
+      {
+        "label":"en",
+        "prob":0.93925816
+      },
+      null,
+      {
+        "label":"en",
+        "prob":0.9606543
+      }
+    ]
+  }
+}
+
+
    +
  1. Content. Lines are separated by \n.
  2. +
  3. Headers from the crawler. Note that nothing is changed, so the content length may be incorrect.
  4. +
  5. Document-wide identification. prob is the weighted average of the confidence of identified lines.
  6. +
  7. Annotations of the document. null if no annotation.
  8. +
  9. Line-by-line identifications. null for each line that has no identification.
  10. +
+

Annotations

+
    +
  • tiny: The document has a low (<5) number of lines.
  • +
  • short_sentences: The document has a high number (>50%) of short lines (<400 bytes)
  • +
  • header: The document has a high number of short lines at its head, suggesting the presence of low quality content.
  • +
  • footer: The document has a high number of short lines at its tail, suggesting the presence of low quality content.
  • +
  • noisy: The document has a high percentage of punctuation (>50%)
  • +
  • adult: The document contains adult content. This annotation uses a blocklist and labels a tiny part of the corpus: It does not catch most of the adult content.
  • +
+

More information about the thresholds and annotators is available in our paper.

+

Filtering

+
+

Tip

+

Filtering can be done using oscar-tools, a high performance toolkit that provides rapid and efficient ways of transforming corpora into what you need. More info here.

+
+

Filtering can be done using classic Python tools, such as ujson. +While we don't supply a Python library enabling easy filtering/transformation for OSCAR 22.01, we provide some filtering examples that you can change to better suit your needs.

+

Getting documents that come from Wikipedia only

+

Using filters on warc_headers.warc-target-uri makes filtering on URLs easy.

+
TODO
+
+

Extracting lines from non-annotated documents

+

Non-annotated documents are suspected to be cleaner than annotated ones, so extracting their content should be interesting to do. We extract lines from documents where metadata.annotations == null.

+
TODO
+
+

Getting Alemannic lines from the German corpus

+

As detailed in our paper, we found that the German corpus contains a significant amount of Alemannic relative to the size of the Alemannic corpus. We use a filter on metadata.sentence_identifications to extract those sentences.

+
TODO
+
+

Languages

+

OSCAR 22.01 has subcorpora for 142 languages (counting the Multilingual corpus). +The following table exhibits the size, number of documents and number of words for each of them.

+

Note that the size accounts for the raw uncompressed file size, counting metadata.

+
+Language table + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LanguageSize# Documents# Words
Multilingual12.1 GB1,210,685936,187,711
Afrikaans47.0 MB12,3936,227,310
Albanian3.0 GB437,287326,325,149
Alemannic / Swiss German363.6 kB13937,381
Amharic461.0 MB37,51330,481,153
Arabic84.2 GB8,718,9296,103,711,887
Aragonese10.6 kB1251
Armenian4.7 GB379,267268,031,270
Assamese221.2 MB17,08411,109,557
Asturian73.6 kB773,919
Avaric18.6 kB14582
Azerbaijani3.5 GB491,847291,927,692
Bangla15.1 GB1,171,501751,877,226
Bashkir95.5 MB11,1985,418,474
Basque1.1 GB233,65897,092,942
Belarusian1.8 GB180,046107,227,860
Bihari languages24.2 kB27569
Bishnupriya2.0 MB27198,419
Bosnian10.3 kB10422
Breton33.7 MB16,1193,111,619
Bulgarian35.1 GB2,887,1152,405,981,285
Burmese1.9 GB158,73344,835,970
Catalan13.9 GB2,627,3071,508,919,864
Cebuano44.6 MB5,7425,253,785
Central Kurdish716.4 MB84,95043,913,025
Chechen14.0 MB4,086798,766
Chinese900.9 GB56,524,51823,149,203,886
Chuvash41.8 MB4,7502,465,782
Cornish1.4 kB255
Croatian11.2 MB11,462505,369
Czech58.6 GB10,381,9165,452,724,456
Danish12.6 GB2,265,4791,454,439,292
Dimli (individual language)706 Bytes119
Divehi217.2 MB24,06710,112,205
Dutch114.0 GB20,206,53212,329,127,151
Eastern Mari11.3 MB1,612641,525
Egyptian Arabic2.8 MB1,256176,096
English3.2 TB431,992,659377,376,402,775
Esperanto558.3 MB111,93258,416,628
Estonian9.2 GB1,362,524820,975,443
Filipino646.5 MB70,39481,881,278
Finnish37.8 GB4,948,9612,900,615,928
French382.2 GB52,037,09841,713,990,658
Galician255.2 MB88,80327,051,212
Georgian7.1 GB488,588281,430,479
German496.7 GB70,075,42446,826,676,844
Goan Konkani787.2 kB4638,831
Greek78.3 GB6,738,5465,031,242,803
Guarani9.0 kB10374
Gujarati4.8 GB136,467301,170,777
Hebrew30.3 GB3,132,3962,249,377,984
Hindi23.3 GB1,529,9071,534,799,198
Hungarian53.9 GB6,866,0624,598,787,907
Icelandic2.0 GB396,183210,365,124
Ido77.3 kB1052,690
Iloko97.9 kB758,592
Indonesian17.4 GB2,244,6221,984,195,207
Interlingua40.2 kB610,125
Irish45.6 MB12,2334,877,850
Italian229.3 GB28,502,09224,294,684,830
Japanese258.7 GB36,328,9315,592,948,356
Javanese152.7 kB7010,441
Kalmyk9.3 kB9250
Kannada2.6 GB150,850108,450,571
Karachay-Balkar119.6 kB914,089
Kazakh2.9 GB261,085157,267,307
Khmer1.9 GB121,91030,564,131
Komi119.9 kB1273,335
Korean51.8 GB5,881,4813,854,968,649
Kurdish150.3 MB29,90617,390,759
Kyrgyz518.6 MB62,24428,028,986
Lao337.1 MB28,9146,682,982
Latin4.1 MB4,397187,446
Latvian8.2 GB1,032,987707,361,898
Lezghian375.5 kB12419,250
Limburgish1.4 kB241
Lithuanian20.0 GB2,303,0701,712,802,056
Lojban1.9 MB570260,542
Lombard2.6 kB2225
Low German9.0 MB1,9381,012,561
Lower Sorbian707 Bytes117
Luxembourgish15.8 MB5,1081,545,946
Macedonian3.6 GB341,775244,058,579
Maithili21.6 kB23483
Malagasy57.3 MB3,0287,279,056
Malay5.3 MB5,228217,818
Malayalam4.1 GB250,972137,831,247
Maltese2.5 MB2,208118,190
Marathi3.3 GB250,376160,179,233
Mazanderani128.2 kB767,337
Minangkabau6.0 MB585614,613
Mingrelian7.6 MB2,550253,333
Mongolian2.8 GB237,719176,405,432
Nahuatl languages8.7 kB12179
Nepali3.7 GB391,947177,885,116
Newari5.7 MB1,134273,837
Norwegian2.8 GB973,188279,182,902
Norwegian Nynorsk6.8 MB5,835459,183
Occitan2.1 MB37331,061
Odia487.9 MB52,94223,755,902
Ossetic13.9 MB3,560800,430
Pashto490.3 MB50,31246,293,249
Persian77.4 GB7,665,8716,430,164,396
Piedmontese1.7 MB698188,270
Polish139.0 GB19,301,13712,584,498,906
Portuguese170.3 GB23,735,70718,441,864,893
Punjabi1.1 GB68,09470,068,604
Quechua744 Bytes114
Romanian49.2 GB4,624,7645,261,803,995
Russia Buriat32.9 kB39785
Russian1.1 TB76,060,84462,811,122,663
Sakha65.6 MB6,2843,473,813
Sanskrit136.0 MB4,4725,671,369
Scottish Gaelic137.7 kB1367,769
Serbian6.9 GB577,472482,932,670
Serbian (Latin)931.8 kB73892,875
Sicilian1.5 kB250
Sindhi117.1 MB15,51610,685,611
Sinhala2.0 GB108,593113,179,741
Slovak16.5 GB2,409,5551,619,121,944
Slovenian1.2 GB351,894118,400,246
Somali2.1 kB3109
South Azerbaijani14.1 MB5,381693,746
Spanish381.9 GB51,386,24742,829,835,316
Sundanese5.0 MB263547,145
Swahili1.3 MB462123,050
Swedish48.0 GB7,541,2785,078,331,128
Tajik870.9 MB46,36656,627,727
Tamil11.4 GB556,772452,343,748
Tatar915.3 MB76,39851,875,265
Telugu3.4 GB249,756137,752,065
Thai66.1 GB5,030,2541,626,779,846
Tibetan234.5 MB18,6832,286,269
Turkish75.1 GB10,826,0316,421,221,358
Turkmen4.4 MB2,485276,632
Ukrainian48.8 GB4,558,2142,879,585,992
Emiliano-Romagnolo[eml]901 Bytes153
Upper Sorbian132.8 kB1108,825
Urdu3.4 GB336,994332,816,354
Uyghur201.9 MB18,55611,240,889
Uzbek19.9 MB9,5261,370,842
Vietnamese98.9 GB9,587,23312,283,185,482
Volapük825.9 kB66157,039
Walloon105.7 kB1384,386
Waray7.6 MB933830,872
Welsh409.3 MB90,37849,488,495
Western Frisian75.3 MB21,9466,357,929
Western Mari743.5 kB15543,916
Western Panjabi46.7 MB6,7904,060,419
Wu Chinese137.2 kB883,056
Yiddish232.5 MB23,41815,809,780
Yoruba24.7 kB261,042
Multilingual12.1 GB1,210,685936,187,711
+
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/versions/oscar-2301/index.html b/versions/oscar-2301/index.html new file mode 100644 index 0000000..bade386 --- /dev/null +++ b/versions/oscar-2301/index.html @@ -0,0 +1,2154 @@ + + + + + + + + + + + + + + + + + + + + + + + + OSCAR 23.01 - OSCAR Documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

OSCAR 23.01

+

OSCAR 23.01 is the January 2023 version of the OSCAR Corpus based on the November/December 2022 dump of Common Crawl. While being quite similar to OSCAR 22.01, it contains several new features, including KenLM-based adult content detection, precomputed Locality-Sensitive Hashes for near deduplication, and blocklist-based categories. OSCAR 23.01 has also moved from gzip to Zstandard compression. You might already have zstd installed on your system, but if not, please check the Zstandard website for installation instructions.

+
+

Tip

+

OSCAR 23.01 is similar to OSCAR 22.01. As such, please also check out the documentation for OSCAR 22.01 if you need detailed information about metadata.

+
+

Access

+
+

Note

+

If you already have access to the corpus, there's nothing to do! +Go up in the file hierarchy on the link you've been given, and you should find the new corpus.

+
+

Access to the OSCAR Corpus changes depending on your status. More info on our dedicated page.

+

Getting access

+

New Features

+

Categories

+

OSCAR 22.01 leveraged the UT1 Blocklists project to attempt to classify some adult content present in OSCAR. +The OSCAR 23.01 pipeline iterated on this to include all of the blocklists provided by UT1.

+
+

Warning

+

The UT1 Blocklists page lists all the categories along with a short description. +We strongly encourage you to read the descriptions if you plan on using them. Please also note that these descriptions are in French. We're working on an English translation of them.

+
+
+

Note

+

A document can belong to multiple categories.

+
+

These categories are in a field that is at this path: metadata.categories.

+
+

Example

+
{
+    "content":"foo",
+    "metadata": {
+        // ...
+        "categories": ["blog", "news"],
+        // ...
+    }
+    // ...
+}
+
+
+

KenLM-based Adult Content Filtering

+

For a select number of subcorpora, a measure of perplexity has been added. This perplexity comes from a KenLM model trained on harmful content, previously gathered by using the adult annotation in OSCAR 22.01. +In other words, the lower it is, the more likely a given document contains harmful/adult content.

+
+

Danger

+

This feature can be considered as unstable/unsafe, since we also want to evaluate its impact on particular issues.

+

As such, we do not provide a boolean value indicating if a given document can be harmful/adult content, but rather the raw perplexity. +We have found a threshold that works well in English, but encourage you to experiment with it and to report back your findings.

+
+

Locality Sensitive Hashing

+

We use TLSH to compute a hash for each document.

+

Locality sensitive hashing is a hashing method that computes similar hashes for similar documents.

+

This can be used to do both exact and near-deduplication. +Identical documents have identical hashes (the converse might not be true). So you only need to check for identity amongst documents with identical hashes. +TLSH hashes can be compared to yield a distance metric. According to the original paper, a cutoff of < 40 yields a false positive rate of 0.07% and a detect rate of 49.6%, while a cutoff of < 100 yields a FP rate of 6.43% and detect rate of 94.5%. You should choose a value that meets your purposes.

+

The above is true for the default version of TLSH which is used in packages such as py-tlsh. OSCAR 23.01 uses a TLSH configured with 256 buckets (full hash) and 3-byte checksums (collision rate: 1 in 5800) instead of 1-byte checksums (collision rate: 1 in 24).

+

If you would like to use py-tlsh, follow these instructions (You need CMake installed to perform the necessary modifications and build): +

# download py-tlsh source package
+pip download python-tlsh
+# unpack the source tar.gz and enter the directory
+tar -xvf python-tlsh-4.5.0.tar.gz && cd python-tlsh-4.5.0
+# run the following command to implement the changes
+# alternatively, you can use vi or a text editor
+# change TLSH_BUCKETS_128 into TLSH_BUCKETS_256 and change TLSH_CHECKSUM_1B into TLSH_CHECKSUM_3B
+sed -i 's/set(TLSH_BUCKETS_128 1)/set(TLSH_BUCKETS_256 1)/g; s/set(TLSH_CHECKSUM_1B 1)/set(TLSH_CHECKSUM_3B 1)/g' CMakeLists.txt
+
+# build and activate pip venv if not already done
+# python3 -m venv ~/.venv
+source ~/.venv/bin/activate
+# build and install the new py-tlsh
+python3 setup.py install
+

+

Hashes are at metadata.tlsh.

+

Minor changes

+
    +
  • metadata.annotations has been renamed metadata.quality_warnings, and only contains length based quality warnings (see the OSCAR 2201 documentation for details).
  • +
  • Some language tags have changed to better respect BCP 47:
      +
    • als has become gsw. Previously, als was erroneously used as the tag for Alemannic/Swiss German, whereas it is the tag for Tosk Albanian.
    • +
    • eml has become x-eml. The eml tag is deprecated and as such has been replaced by a private tag (x-eml).
    • +
    +
  • +
+

Layout

+
{
+   "content":"English sentence\nphrase en français\n????????????", // (1)
+   "warc_headers":{ // (2)
+      "warc-identified-content-language":"fra,eng",
+      "warc-target-uri":"https://fr.wikipedia.org/wiki/...",
+      "warc-record-id":"<urn:uuid:29eaa920-d299-4b1d-b687-c72bd8d68116>",
+      "warc-type":"conversion",
+      "content-length":"35298", // (3)
+      "warc-refers-to":"<urn:uuid:39e42055-0d94-4e45-9c6c-9e7056635d64>",
+      "warc-block-digest":"sha1:WFH2A5WHCS2H365GIAFYQPI7UOAMFGHB", // (3)
+      "warc-date":"2022-11-26T09:45:47Z",
+      "content-type":"text/plain"
+   },
+   "metadata":{
+      "identification":{ // (4)
+         "label":"fr",
+         "prob":0.8938327
+      },
+      "harmful_pp":4063.1814, // (5)
+      "tlsh":"tlsh:T125315FF2B6088901EEA097015DB39B4600B...", // (6)
+      "quality_warnings":[ // (7)
+         "short_sentences",
+         "header",
+         "footer"
+      ],
+      "categories":[ // (8)
+         "examen_pix",
+         "liste_bu"
+      ],
+      "sentence_identifications":[ // (9)
+         {
+            "label":"fr",
+            "prob":0.99837273
+         },
+         {
+            "label":"en",
+            "prob":0.9992377
+         },
+         null
+      ]
+   }
+}
+
+

Some important notes:

+
    +
  1. Newline-separated content.
  2. +
  3. Headers from the crawled dumps are left untouched. See the WARC specification for more info.
  4. +
  5. Since warc_headers are copied and content can be altered by Ungoliant at generation stage, content-length and warc-block-digest can be different from actual values.
  6. +
  7. Document-level identification. Computation details can be found in the OSCAR 22.01 paper.
  8. +
  9. Perplexity of the document, computed using a KenLM model trained on harmful content. See this pre-print for more info. The lower this number is, the higher the probability that it will contain harmful or adult content. This annotation will be changed from harmful_pp to harmful_ppl in future releases.
  10. +
  11. Locality Sensitive Hash of the documents' content, using TLSH. Useful for both exact and near deduplication.
  12. +
  13. (Corresponds to annotations pre-23.01) Potential quality warnings. Based on content/sentence length. See the OSCAR 22.01 paper for more info.
  14. +
  15. Blocklist-based categories. Uses the UT1 Blocklist, plus custom additions. Please refer to the UT1 website for category descriptions. Note that the categories are in French.
  16. +
  17. Sentence-level identifications. A null value means no identification with a good enough threshold (>0.8 on 23.01).
  18. +
+

Language table

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CodeLanguage# docs# wordsContent Length :
0afAfrikaans23,9946,217,02437.2 MB
1sqAlbanian1,342,790462,694,5993.2 GB
2amAmharic119,43440,262,809512.9 MB
3arArabic25,012,11610,081,452,882110.7 GB
4anAragonese3426411.0 kB
5hyArmenian1,056,974336,045,0414.9 GB
6asAssamese89,54224,395,215412.1 MB
7astAsturian44010,91774.1 kB
8avAvaric441,07318.6 kB
9azAzerbaijani1,159,994316,850,3303.0 GB
10bnBangla3,474,0861,092,983,76519.1 GB
11baBashkir128,24826,036,637363.7 MB
12euBasque678,474136,672,6151.2 GB
13beBelarusian445,612164,729,6072.3 GB
14bhBihari languages485076.8 kB
15bpyBishnupriya2,346346,9475.4 MB
16bsBosnian203953.0 kB
17brBreton36,3384,759,40731.4 MB
18bgBulgarian8,933,9983,635,273,73844.1 GB
19myBurmese430,27682,433,8363.0 GB
20caCatalan6,953,8982,240,460,83615.3 GB
21cebCebuano16,1746,263,40441.1 MB
22ckbCentral Kurdish182,50861,334,746772.9 MB
23ceChechen11,6861,051,75213.9 MB
24zhChinese138,478,27044,378,380,1611.4 TB
25cvChuvash16,6523,039,92542.3 MB
26kwCornish880432 Bytes
27hrCroatian31,8083,542,96126.5 MB
28csCzech34,859,6329,717,378,55977.0 GB
29daDanish7,214,3382,217,634,34014.8 GB
30dvDivehi77,06010,655,359200.1 MB
31nlDutch72,552,68819,564,553,306135.0 GB
32mhrEastern Mari9,5021,615,21522.9 MB
33arzEgyptian Arabic3,958385,5113.7 MB
34enEnglish1,235,510,986523,869,288,6903.4 TB
35eoEsperanto226,92467,774,923474.8 MB
36etEstonian3,601,904938,296,8928.0 GB
37tlFilipino250,558110,560,444719.2 MB
38fiFinnish14,471,7104,198,143,88341.1 GB
39frFrench158,334,99862,127,088,294430.5 GB
40glGalician248,76238,345,625255.7 MB
41kaGeorgian1,343,036373,935,1588.4 GB
42deGerman206,598,43073,848,586,648594.7 GB
43gomGoan Konkani398121,0352.3 MB
44elGreek20,282,8647,691,622,69295.7 GB
45gnGuarani142602.2 kB
46guGujarati425,552417,001,7055.6 GB
47htHaitian Creole220,67193.1 kB
48heHebrew3,997,8881,697,158,89118.0 GB
49hiHindi5,514,4542,475,605,44432.6 GB
50huHungarian21,349,37216,013,364,289150.1 GB
51isIcelandic1,210,232294,471,5392.2 GB
52ioIdo2242,59816.1 kB
53iloIloko1444,41128.0 kB
54idIndonesian7,109,7783,228,020,22123.4 GB
55iaInterlingua349,38433.5 kB
56ieInterlingue20881 Bytes
57gaIrish29,8949,054,92363.2 MB
58itItalian89,021,60636,327,274,203259.4 GB
59jaJapanese94,236,4044,401,059,165181.2 GB
60jvJavanese1723,28625.7 kB
61xalKalmyk227315 Bytes
62knKannada448,500124,924,3502.6 GB
63krcKarachay-Balkar4968,385122.4 kB
64kkKazakh677,622214,679,8573.3 GB
65kmKhmer450,66059,880,2313.2 GB
66kvKomi4605,90970.3 kB
67koKorean15,147,6983,435,866,93538.1 GB
68kuKurdish80,33825,921,607174.1 MB
69kyKyrgyz144,28832,062,783489.3 MB
70loLao118,37410,659,203472.1 MB
71laLatin14,384307,8652.0 MB
72lvLatvian2,435,882845,459,8997.4 GB
73lezLezghian67660,634856.6 kB
74liLimburgish61691.4 kB
75ltLithuanian5,182,0281,674,362,57414.5 GB
76jboLojban572312,3151.5 MB
77lmoLombard1123,26921.0 kB
78ndsLow German5,2481,612,17510.7 MB
79dsbLower Sorbian884664 Bytes
80lbLuxembourgish18,0902,514,83818.4 MB
81mkMacedonian1,063,298389,344,4254.7 GB
82maiMaithili464676.8 kB
83mgMalagasy10,8301,416,43011.2 MB
84msMalay11,500238,4772.6 MB
85mlMalayalam800,936236,597,8385.8 GB
86mtMaltese5,180149,8861.3 MB
87mrMarathi729,578252,706,3314.5 GB
88mznMazanderani38416,115169.2 kB
89minMinangkabau2,436305,5893.8 MB
90xmfMingrelian7,318283,3166.1 MB
91mwlMirandese454423 Bytes
92mnMongolian1,061,710454,350,4155.8 GB
93multiMultilingual2,948,2021,251,676,40611.9 GB
94nahNahuatl languages382792.4 kB
95neNepali1,152,156278,901,0364.9 GB
96newNewari1,996229,7034.0 MB
97noNorwegian2,797,378373,160,0332.6 GB
98nnNorwegian Nynorsk19,470575,5183.7 MB
99ocOccitan92034,701405.0 kB
100orOdia158,42631,963,340543.1 MB
101osOssetic8,6283,935,96450.7 MB
102psPashto87,40830,196,179261.6 MB
103faPersian23,813,8829,609,206,69893.2 GB
104pmsPiedmontese2,524510,0873.1 MB
105plPolish57,184,82618,073,705,588147.1 GB
106ptPortuguese36,062,80015,172,557,311105.0 GB
107paPunjabi222,058104,235,4181.4 GB
108quQuechua213143 Bytes
109roRomanian11,985,6686,302,600,83345.6 GB
110bxrRussia Buriat726988.2 kB
111ruRussian194,143,42278,032,029,3441.1 TB
112sahSakha17,5664,288,05168.8 MB
113saSanskrit16,8022,479,34556.3 MB
114gdScottish Gaelic77618,458146.1 kB
115srSerbian1,677,896632,781,8227.7 GB
116shSerbian (Latin)3,214166,517816.4 kB
117sdSindhi48,56614,667,207131.6 MB
118siSinhala301,066172,755,3852.6 GB
119skSlovak8,931,7842,704,716,28021.5 GB
120slSlovenian1,112,560192,816,7431.4 GB
121soSomali651503 Bytes
122azbSouth Azerbaijani26,3642,029,72928.4 MB
123esSpanish153,574,55663,388,237,965429.9 GB
124suSundanese182582.0 kB
125swSwahili1,664164,4591.0 MB
126svSwedish21,891,3486,993,719,60150.0 GB
127gswSwiss German34234,328232.7 kB
128tgTajik144,93276,987,2851.0 GB
129taTamil1,638,238738,824,39215.8 GB
130ttTatar262,65459,253,765833.8 MB
131teTelugu644,712201,575,8153.9 GB
132thThai14,845,9002,224,483,01892.0 GB
133boTibetan62,3526,062,558531.6 MB
134trTurkish26,654,3308,290,890,08773.7 GB
135tkTurkmen4,576325,7863.3 MB
136ukUkrainian10,059,9923,183,842,01844.7 GB
137x-emlEmiliano-Romagnol43291.8 kB
138hsbUpper Sorbian40215,827123.2 kB
139urUrdu887,004434,023,2733.8 GB
140ugUyghur51,30414,659,554219.8 MB
141uzUzbek15,8061,665,96015.3 MB
142viVietnamese33,933,99422,424,984,210140.8 GB
143voVolapük89649,968371.9 kB
144waWalloon3906,34734.3 kB
145warWaray1,49419,665126.8 kB
146cyWelsh151,51252,250,043333.0 MB
147fyWestern Frisian45,4589,885,78870.4 MB
148mrjWestern Mari49660,180765.8 kB
149pnbWestern Panjabi12,90411,844,695105.8 MB
150wuuWu Chinese1361,19926.8 kB
151yiYiddish47,43814,287,370171.7 MB
152yoYoruba1282,39616.6 kB
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + \ No newline at end of file