Librarian pruning
Package diffs
SELECT
purpose, COUNT(pdid), SUM(filesize)
FROM (
SELECT archive.purpose, packagediff.id AS pdid, libraryfilecontent.filesize, COUNT(sprf.id)
FROM
packagediff
JOIN sourcepackagerelease ON packagediff.to_source = sourcepackagerelease.id
JOIN archive ON sourcepackagerelease.upload_archive = archive.id
JOIN libraryfilealias ON packagediff.diff_content = libraryfilealias.id
JOIN libraryfilecontent ON libraryfilealias.content = libraryfilecontent.id
LEFT JOIN sourcepackagereleasefile AS sprf ON sprf.sourcepackagerelease = sourcepackagerelease.id
LEFT JOIN libraryfilealias AS lfa2 ON lfa2.id = sprf.libraryfile
WHERE lfa2.expires IS NOT NULL
GROUP BY archive.purpose, packagediff.id, libraryfilecontent.filesize
HAVING COUNT(sprf.id) > 0) AS whatevs
GROUP BY
purpose;
All SPRs
purpose | count | sum
---------+--------+--------------
1 | 122904 | 52406336082
2 | 299916 | 166301964589
4 | 142 | 1038624543
Expired SPRs
purpose | count | sum
---------+--------+--------------
2 | 228250 | 124930677164
4 | 88 | 696706807
We can recover ~125GB.
Debian sources
select sum(filesize)
from (
select distinct on (libraryfilecontent.id) libraryfilecontent.id, libraryfilecontent.filesize
from
libraryfilealias, libraryfilecontent, sourcepackagereleasefile, sourcepackagepublishinghistory
where
libraryfilecontent.id = libraryfilealias.content
AND sourcepackagepublishinghistory.archive = 3
AND sourcepackagereleasefile.sourcepackagerelease = sourcepackagepublishinghistory.sourcepackagerelease
AND sourcepackagereleasefile.libraryfile = libraryfilealias.id
AND libraryfilealias.expires is NULL
) as DONOTCARE;
sum
--------------
163160261108
We can recover well under 163GB. Most of it is probably in LFCs shared with the Ubuntu primary archive, too.
Custom uploads
SELECT distroseries.name, customformat, SUM(filesize)
FROM
packageupload
JOIN packageuploadcustom ON packageuploadcustom.packageupload = packageupload.id
JOIN libraryfilealias ON libraryfilealias.id = packageuploadcustom.libraryfilealias
JOIN libraryfilecontent ON libraryfilecontent.id = libraryfilealias.content
JOIN distroseries ON distroseries.id = packageupload.distroseries
GROUP BY distroseries.name, customformat;
name | customformat | sum
----------+--------------+-------------
dapper | 0 | 7953438662
| 2 | 1855126
edgy | 0 | 3989003467
| 2 | 7539548
| 3 | 19527463
feisty | 0 | 6936774302
| 2 | 15943395
| 3 | 12111689
gutsy | 0 | 6026825659
| 2 | 13519031
| 3 | 27018915
hardy | 0 | 11659962007
| 1 | 1061623619
| 2 | 29660708
| 3 | 42243159
intrepid | 0 | 7790748428
| 2 | 36114785
| 3 | 143574861
jaunty | 0 | 20580426775
| 1 | 416457777
| 2 | 25339400
| 3 | 201651494
karmic | 0 | 18922297797
| 1 | 1274878005
| 2 | 35827640
| 3 | 259162691
| 4 | 1607553540
lucid | 0 | 15693443523
| 1 | 5143661280
| 2 | 52683490
| 3 | 284948890
| 4 | 2683724680
| 5 | 13
maverick | 0 | 11620892644
| 1 | 28204933972
| 2 | 34029988
| 3 | 124235959
| 4 | 2868157415
| 5 | 140796
natty | 1 | 167744190
~112GB of d-i tarballs. Most of those are probably cullable. All the other types combined are <50GB.
Binaries
By archive purpose
SELECT purpose, SUM(filesize)
FROM (
SELECT DISTINCT on (libraryfilecontent.id) archive.purpose, libraryfilecontent.id, libraryfilecontent.filesize
FROM
libraryfilealias
JOIN libraryfilecontent ON libraryfilecontent.id = libraryfilealias.content
JOIN binarypackagefile ON binarypackagefile.libraryfile = libraryfilealias.id
JOIN binarypackagerelease ON binarypackagerelease.id = binarypackagefile.binarypackagerelease
JOIN binarypackagebuild ON binarypackagebuild.id = binarypackagerelease.build
JOIN packagebuild ON packagebuild.id = binarypackagebuild.package_build
JOIN archive ON archive.id = packagebuild.archive
WHERE
libraryfilealias.expires is NULL
) as DONOTCARE
GROUP BY purpose;
purpose | sum
---------+---------------
4 | 6531672894
1 | 2258952516680
6 | 18479879302
2 | 1179581775510
Only 1TB of live binaries were built in PPAs. And some of that will be in the primary archive now. So we probably want to kill bits of the primary archive.
By series, in the primary archive
SELECT distroseries.name, SUM(filesize)
FROM (
SELECT DISTINCT on (libraryfilecontent.id) distroarchseries.distroseries AS dsid, libraryfilecontent.id, libraryfilecontent.filesize
FROM
libraryfilealias
JOIN libraryfilecontent ON libraryfilecontent.id = libraryfilealias.content
JOIN binarypackagefile ON binarypackagefile.libraryfile = libraryfilealias.id
JOIN binarypackagerelease ON binarypackagerelease.id = binarypackagefile.binarypackagerelease
JOIN binarypackagepublishinghistory ON binarypackagepublishinghistory.binarypackagerelease = binarypackagerelease.id
JOIN distroarchseries ON distroarchseries.id = binarypackagepublishinghistory.distroarchseries
WHERE
libraryfilealias.expires is NULL
AND binarypackagepublishinghistory.archive = 1
) as DONOTCARE
JOIN distroseries ON distroseries.id = dsid
GROUP BY distroseries.name, distroseries.id
ORDER BY distroseries.id;
name | sum
----------+--------------
hoary | 27733894316
warty | 14336672506
breezy | 29942826488
dapper | 195109840998
edgy | 24260472130
feisty | 36122734102
gutsy | 46239699592
hardy | 387068256000
intrepid | 84222652064
jaunty | 409992452454
karmic | 439190753940
lucid | 385088521148
maverick | 312278397004
natty | 31982407516
Not entirely reliable -- the DISTINCT is on LFC, so who knows which distroseries things will show up in. Fixing that will make it duplicatastic between series, but:
SELECT distroseries.name, SUM(filesize)
FROM (
SELECT DISTINCT distroarchseries.distroseries AS dsid, libraryfilecontent.id, libraryfilecontent.filesize
FROM
libraryfilealias
JOIN libraryfilecontent ON libraryfilecontent.id = libraryfilealias.content
JOIN binarypackagefile ON binarypackagefile.libraryfile = libraryfilealias.id
JOIN binarypackagerelease ON binarypackagerelease.id = binarypackagefile.binarypackagerelease
JOIN binarypackagepublishinghistory ON binarypackagepublishinghistory.binarypackagerelease = binarypackagerelease.id
JOIN distroarchseries ON distroarchseries.id = binarypackagepublishinghistory.distroarchseries
WHERE
libraryfilealias.expires is NULL
AND binarypackagepublishinghistory.archive = 1
) as DONOTCARE
JOIN distroseries ON distroseries.id = dsid
GROUP BY distroseries.name, distroseries.id
ORDER BY distroseries.id;
name | sum
----------+--------------
hoary | 31867969066
warty | 16728308976
breezy | 37643317154
dapper | 206771360446
edgy | 53007114660
feisty | 63610979098
gutsy | 91420134964
hardy | 458761150888
intrepid | 163692052384
jaunty | 490078125746
karmic | 525245050336
lucid | 472053820314
maverick | 390624462548
natty | 76826906594
Comparing to the published binaries:
SELECT distroseries.name, SUM(filesize)
FROM (
SELECT DISTINCT distroarchseries.distroseries AS dsid, libraryfilecontent.id, libraryfilecontent.filesize
FROM
libraryfilealias
JOIN libraryfilecontent ON libraryfilecontent.id = libraryfilealias.content
JOIN binarypackagefile ON binarypackagefile.libraryfile = libraryfilealias.id
JOIN binarypackagerelease ON binarypackagerelease.id = binarypackagefile.binarypackagerelease
JOIN binarypackagepublishinghistory ON binarypackagepublishinghistory.binarypackagerelease = binarypackagerelease.id
JOIN distroarchseries ON distroarchseries.id = binarypackagepublishinghistory.distroarchseries
WHERE
libraryfilealias.expires is NULL
AND binarypackagepublishinghistory.archive = 1
AND binarypackagepublishinghistory.status IN (1, 2, 5)
) as DONOTCARE
JOIN distroseries ON distroseries.id = dsid
GROUP BY distroseries.name, distroseries.id
ORDER BY distroseries.id;
name | sum
----------+--------------
hoary | 31867969066
warty | 16728308976
breezy | 37643317154
dapper | 29759999894
edgy | 53007114660
feisty | 31042965794
gutsy | 85342535414
hardy | 110169230692
intrepid | 104598712170
jaunty | 121281205238
karmic | 120171582232
lucid | 113294111936
maverick | 76158710440
natty | 74792133316